Set stdout to binary mode under Windows (fixes issue #218)
[youtube-dl] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # License: Public domain code
7 import htmlentitydefs
8 import httplib
9 import locale
10 import math
11 import netrc
12 import os
13 import os.path
14 import re
15 import socket
16 import string
17 import subprocess
18 import sys
19 import time
20 import urllib
21 import urllib2
22
23 # parse_qs was moved from the cgi module to the urlparse module recently.
24 try:
25         from urlparse import parse_qs
26 except ImportError:
27         from cgi import parse_qs
28
# HTTP headers sent with every request; imitates a mainstream browser so
# sites serve the same pages they would serve to Firefox.
std_headers = {
	'User-Agent': 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.8) Gecko/20100723 Firefox/3.6.8',
	'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
	'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
	'Accept-Language': 'en-us,en;q=0.5',
}

# Unicode letters and digits; presumably the character set allowed in
# "simple" titles (used by InfoExtractors elsewhere in the file) — the
# consumers are outside this view, so confirm against them.
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
37
def preferredencoding():
	"""Get preferred encoding.

	Returns the best encoding scheme for the system, based on
	locale.getpreferredencoding() and some further tweaks.
	"""
	try:
		pref = locale.getpreferredencoding()
		# Verify the reported name is actually a usable codec; a broken
		# locale configuration can return garbage here.
		u'TEST'.encode(pref)
	except Exception:
		# Narrowed from a bare "except:" so KeyboardInterrupt/SystemExit
		# are no longer swallowed. Fall back to a safe default.
		pref = 'UTF-8'
	# The original wrapped this in a one-shot generator and called .next();
	# a plain return is equivalent and simpler.
	return pref
53
54 def htmlentity_transform(matchobj):
55         """Transforms an HTML entity to a Unicode character.
56         
57         This function receives a match object and is intended to be used with
58         the re.sub() function.
59         """
60         entity = matchobj.group(1)
61
62         # Known non-numeric HTML entity
63         if entity in htmlentitydefs.name2codepoint:
64                 return unichr(htmlentitydefs.name2codepoint[entity])
65
66         # Unicode character
67         mobj = re.match(ur'(?u)#(x?\d+)', entity)
68         if mobj is not None:
69                 numstr = mobj.group(1)
70                 if numstr.startswith(u'x'):
71                         base = 16
72                         numstr = u'0%s' % numstr
73                 else:
74                         base = 10
75                 return unichr(long(numstr, base))
76
77         # Unknown entity in name, return its literal representation
78         return (u'&%s;' % entity)
79
80 def sanitize_title(utitle):
81         """Sanitizes a video title so it could be used as part of a filename."""
82         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
83         return utitle.replace(unicode(os.sep), u'%')
84
def sanitize_open(filename, open_mode):
	"""Try to open the given filename, and slightly tweak it if this fails.

	Attempts to open the given filename. If this fails, it tries to change
	the filename slightly, step by step, until it's either able to open it
	or it fails and raises a final exception, like the standard open()
	function.

	It returns the tuple (stream, definitive_file_name).
	"""
	try:
		if filename == u'-':
			# '-' means "write to stdout". On Windows, switch stdout to
			# binary mode so newline translation does not corrupt the
			# video data (fixes issue #218).
			if sys.platform == 'win32':
				import msvcrt
				msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
			return (sys.stdout, filename)
		stream = open(filename, open_mode)
		return (stream, filename)
	except (IOError, OSError), err:
		# In case of error, try to remove win32 forbidden chars
		filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

		# An exception here should be caught in the caller
		stream = open(filename, open_mode)
		return (stream, filename)
110
111
class DownloadError(Exception):
	"""Download Error exception.

	Thrown by FileDownloader objects that are not configured to continue
	on errors; carries the appropriate error message.
	"""
120
class SameFileError(Exception):
	"""Same File exception.

	Thrown by FileDownloader objects when they detect that multiple files
	would have to be downloaded to the same file on disk.
	"""
128
class PostProcessingError(Exception):
	"""Post Processing exception.

	Raised by a PostProcessor's .run() method to indicate an error in the
	postprocessing task.
	"""
136
class UnavailableVideoError(Exception):
	"""Unavailable Format exception.

	Thrown when a video is requested in a format that is not available for
	that video.
	"""
144
class ContentTooShortError(Exception):
	"""Content Too Short exception.

	This exception may be raised by FileDownloader objects when a file they
	download is too small for what the server announced first, indicating
	the connection was probably interrupted.
	"""
	# Both in bytes
	downloaded = None
	expected = None

	def __init__(self, downloaded, expected):
		# Initialize the base class so args/str()/repr() and pickling work;
		# the original skipped this, leaving the exception with empty args.
		Exception.__init__(self, downloaded, expected)
		self.downloaded = downloaded
		self.expected = expected
class FileDownloader(object):
	"""File Downloader class.

	File downloader objects are the ones responsible of downloading the
	actual video file and writing it to disk if the user has requested
	it, among some other tasks. In most cases there should be one per
	program. As, given a video URL, the downloader doesn't know how to
	extract all the needed information, task that InfoExtractors do, it
	has to pass the URL to one of them.

	For this, file downloader objects have a method that allows
	InfoExtractors to be registered in a given order. When it is passed
	a URL, the file downloader handles it to the first InfoExtractor it
	finds that reports being able to handle it. The InfoExtractor extracts
	all the information about the video or videos the URL refers to, and
	asks the FileDownloader to process the video information, possibly
	downloading the video.

	File downloaders accept a lot of parameters. In order not to saturate
	the object constructor with arguments, it receives a dictionary of
	options instead. These options are available through the params
	attribute for the InfoExtractors to use. The FileDownloader also
	registers itself as the downloader in charge for the InfoExtractors
	that are added to it, so this is a "mutual registration".

	Available options:

	username:       Username for authentication purposes.
	password:       Password for authentication purposes.
	usenetrc:       Use netrc for authentication instead.
	quiet:          Do not print messages to stdout.
	forceurl:       Force printing final URL.
	forcetitle:     Force printing title.
	simulate:       Do not download the video files.
	format:         Video format code.
	format_limit:   Highest quality format to try.
	outtmpl:        Template for output names.
	ignoreerrors:   Do not stop on download errors.
	ratelimit:      Download speed limit, in bytes/sec.
	nooverwrites:   Prevent overwriting files.
	retries:        Number of times to retry for HTTP error 5xx
	continuedl:     Try to continue downloads if possible.
	noprogress:     Do not print the progress bar.
	"""

	# Class-level placeholders; the real per-instance values are assigned
	# in __init__.
	params = None  # options dictionary described in the class docstring
	_ies = []  # registered InfoExtractors, tried in order
	_pps = []  # registered PostProcessors, run as a chain
	_download_retcode = None  # process return code (1 after ignored errors)
	_num_downloads = None  # ordinal counter feeding the %(ord)s template field
210
211         def __init__(self, params):
212                 """Create a FileDownloader object with the given options."""
213                 self._ies = []
214                 self._pps = []
215                 self._download_retcode = 0
216                 self._num_downloads = 0
217                 self.params = params
218         
219         @staticmethod
220         def pmkdir(filename):
221                 """Create directory components in filename. Similar to Unix "mkdir -p"."""
222                 components = filename.split(os.sep)
223                 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
224                 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
225                 for dir in aggregate:
226                         if not os.path.exists(dir):
227                                 os.mkdir(dir)
228         
229         @staticmethod
230         def format_bytes(bytes):
231                 if bytes is None:
232                         return 'N/A'
233                 if type(bytes) is str:
234                         bytes = float(bytes)
235                 if bytes == 0.0:
236                         exponent = 0
237                 else:
238                         exponent = long(math.log(bytes, 1024.0))
239                 suffix = 'bkMGTPEZY'[exponent]
240                 converted = float(bytes) / float(1024**exponent)
241                 return '%.2f%s' % (converted, suffix)
242
243         @staticmethod
244         def calc_percent(byte_counter, data_len):
245                 if data_len is None:
246                         return '---.-%'
247                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
248
249         @staticmethod
250         def calc_eta(start, now, total, current):
251                 if total is None:
252                         return '--:--'
253                 dif = now - start
254                 if current == 0 or dif < 0.001: # One millisecond
255                         return '--:--'
256                 rate = float(current) / dif
257                 eta = long((float(total) - float(current)) / rate)
258                 (eta_mins, eta_secs) = divmod(eta, 60)
259                 if eta_mins > 99:
260                         return '--:--'
261                 return '%02d:%02d' % (eta_mins, eta_secs)
262
263         @staticmethod
264         def calc_speed(start, now, bytes):
265                 dif = now - start
266                 if bytes == 0 or dif < 0.001: # One millisecond
267                         return '%10s' % '---b/s'
268                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
269
270         @staticmethod
271         def best_block_size(elapsed_time, bytes):
272                 new_min = max(bytes / 2.0, 1.0)
273                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
274                 if elapsed_time < 0.001:
275                         return long(new_max)
276                 rate = bytes / elapsed_time
277                 if rate > new_max:
278                         return long(new_max)
279                 if rate < new_min:
280                         return long(new_min)
281                 return long(rate)
282
283         @staticmethod
284         def parse_bytes(bytestr):
285                 """Parse a string indicating a byte quantity into a long integer."""
286                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
287                 if matchobj is None:
288                         return None
289                 number = float(matchobj.group(1))
290                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
291                 return long(round(number * multiplier))
292
293         def add_info_extractor(self, ie):
294                 """Add an InfoExtractor object to the end of the list."""
295                 self._ies.append(ie)
296                 ie.set_downloader(self)
297         
298         def add_post_processor(self, pp):
299                 """Add a PostProcessor object to the end of the chain."""
300                 self._pps.append(pp)
301                 pp.set_downloader(self)
302         
	def to_stdout(self, message, skip_eol=False, ignore_encoding_errors=False):
		"""Print message to stdout if not in quiet mode.

		The message is encoded with the system's preferred encoding;
		a UnicodeEncodeError is re-raised unless ignore_encoding_errors
		is set.
		"""
		try:
			if not self.params.get('quiet', False):
				# [u'\n', u''][skip_eol] appends a newline unless skip_eol;
				# the trailing comma suppresses print's own newline.
				print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(preferredencoding()),
			sys.stdout.flush()
		except (UnicodeEncodeError), err:
			if not ignore_encoding_errors:
				raise
312         
	def to_stderr(self, message):
		"""Print message to stderr, encoded with the preferred encoding."""
		print >>sys.stderr, message.encode(preferredencoding())
316         
317         def fixed_template(self):
318                 """Checks if the output template is fixed."""
319                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
320
	def trouble(self, message=None):
		"""Determine action to take when a download problem appears.

		Depending on if the downloader has been configured to ignore
		download errors or not, this method may throw an exception or
		not when errors are found, after printing the message.
		"""
		if message is not None:
			self.to_stderr(message)
		if not self.params.get('ignoreerrors', False):
			raise DownloadError(message)
		# Only reached when errors are ignored: remember the failure so the
		# process can still exit non-zero.
		self._download_retcode = 1
333
334         def slow_down(self, start_time, byte_counter):
335                 """Sleep if the download speed is over the rate limit."""
336                 rate_limit = self.params.get('ratelimit', None)
337                 if rate_limit is None or byte_counter == 0:
338                         return
339                 now = time.time()
340                 elapsed = now - start_time
341                 if elapsed <= 0.0:
342                         return
343                 speed = float(byte_counter) / elapsed
344                 if speed > rate_limit:
345                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
346
	def report_destination(self, filename):
		"""Report destination filename."""
		# Purely informational, so encoding errors are ignored.
		self.to_stdout(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
350         
351         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
352                 """Report download progress."""
353                 if self.params.get('noprogress', False):
354                         return
355                 self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
356                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
357
	def report_resuming_byte(self, resume_len):
		"""Report attempt to resume at given byte."""
		self.to_stdout(u'[download] Resuming download at byte %s' % resume_len)
361         
	def report_retry(self, count, retries):
		"""Report retry in case of HTTP error 5xx."""
		self.to_stdout(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
365         
	def report_file_already_downloaded(self, file_name):
		"""Report file has already been fully downloaded."""
		try:
			self.to_stdout(u'[download] %s has already been downloaded' % file_name)
		except (UnicodeEncodeError), err:
			# Fall back to a message without the (unencodable) filename.
			self.to_stdout(u'[download] The file has already been downloaded')
372         
	def report_unable_to_resume(self):
		"""Report it was impossible to resume download."""
		self.to_stdout(u'[download] Unable to resume')
376         
377         def report_finish(self):
378                 """Report download finished."""
379                 if self.params.get('noprogress', False):
380                         self.to_stdout(u'[download] Download completed')
381                 else:
382                         self.to_stdout(u'')
383         
	def increment_downloads(self):
		"""Increment the ordinal that assigns a number to each file."""
		# Feeds the %(ord)s template field built in process_info().
		self._num_downloads += 1
387
	def process_info(self, info_dict):
		"""Process a single dictionary returned by an InfoExtractor.

		In simulate mode only the forced printings run. Otherwise the
		output filename is built from the template, directories are
		created, the download is performed and postprocessors are run.
		"""
		# Do nothing else if in simulate mode
		if self.params.get('simulate', False):
			# Forced printings
			if self.params.get('forcetitle', False):
				print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forceurl', False):
				print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
				print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forcedescription', False) and 'description' in info_dict:
				print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')

			return

		try:
			template_dict = dict(info_dict)
			# Extra template fields added here: current epoch and the
			# per-run download ordinal.
			template_dict['epoch'] = unicode(long(time.time()))
			template_dict['ord'] = unicode('%05d' % self._num_downloads)
			filename = self.params['outtmpl'] % template_dict
		except (ValueError, KeyError), err:
			self.trouble(u'ERROR: invalid system charset or erroneous output template')
			return
		if self.params.get('nooverwrites', False) and os.path.exists(filename):
			self.to_stderr(u'WARNING: file exists and will be skipped')
			return

		try:
			self.pmkdir(filename)
		except (OSError, IOError), err:
			self.trouble(u'ERROR: unable to create directories: %s' % str(err))
			return

		try:
			success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
		except (OSError, IOError), err:
			# Local I/O failure while downloading is treated as the video
			# being unavailable in the requested format.
			raise UnavailableVideoError
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self.trouble(u'ERROR: unable to download video data: %s' % str(err))
			return
		except (ContentTooShortError, ), err:
			self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
			return

		if success:
			try:
				self.post_process(filename, info_dict)
			except (PostProcessingError), err:
				self.trouble(u'ERROR: postprocessing: %s' % str(err))
				return
439
440         def download(self, url_list):
441                 """Download a given list of URLs."""
442                 if len(url_list) > 1 and self.fixed_template():
443                         raise SameFileError(self.params['outtmpl'])
444
445                 for url in url_list:
446                         suitable_found = False
447                         for ie in self._ies:
448                                 # Go to next InfoExtractor if not suitable
449                                 if not ie.suitable(url):
450                                         continue
451
452                                 # Suitable InfoExtractor found
453                                 suitable_found = True
454
455                                 # Extract information from URL and process it
456                                 ie.extract(url)
457
458                                 # Suitable InfoExtractor had been found; go to next URL
459                                 break
460
461                         if not suitable_found:
462                                 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
463
464                 return self._download_retcode
465
466         def post_process(self, filename, ie_info):
467                 """Run the postprocessing chain on the given file."""
468                 info = dict(ie_info)
469                 info['filepath'] = filename
470                 for pp in self._pps:
471                         info = pp.run(info)
472                         if info is None:
473                                 break
474         
	def _download_with_rtmpdump(self, filename, url, player_url):
		"""Download an rtmp:// URL by shelling out to the rtmpdump tool."""
		self.report_destination(filename)

		# Check for rtmpdump first
		try:
			subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
		except (OSError, IOError):
			self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
			return False

		# Download using rtmpdump. rtmpdump returns exit code 2 when
		# the connection was interrupted and resuming appears to be
		# possible. This is part of rtmpdump's normal usage, AFAIK.
		# Note: [[], [...]][cond] selects the second list when cond is
		# true (pre-2.5 substitute for a conditional expression).
		basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', filename]
		retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
		while retval == 2 or retval == 1:
			prevsize = os.path.getsize(filename)
			self.to_stdout(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
			time.sleep(5.0) # This seems to be needed
			retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
			cursize = os.path.getsize(filename)
			if prevsize == cursize and retval == 1:
				# No progress between two attempts: stop retrying.
				break
		if retval == 0:
			self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename))
			return True
		else:
			self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
			return False
504
	def _do_download(self, filename, url, player_url):
		"""Download url to filename, handling resume, retries and rate limit.

		Returns True on success (or when the file was already complete),
		False after giving up. rtmp URLs are delegated to rtmpdump.
		"""
		# Attempt to download using rtmpdump
		if url.startswith('rtmp'):
			return self._download_with_rtmpdump(filename, url, player_url)

		stream = None
		open_mode = 'wb'
		# basic_request stays Range-free so it can probe the full length later.
		basic_request = urllib2.Request(url, None, std_headers)
		request = urllib2.Request(url, None, std_headers)

		# Establish possible resume length
		if os.path.isfile(filename):
			resume_len = os.path.getsize(filename)
		else:
			resume_len = 0

		# Request parameters in case of being able to resume
		if self.params.get('continuedl', False) and resume_len != 0:
			self.report_resuming_byte(resume_len)
			request.add_header('Range','bytes=%d-' % resume_len)
			open_mode = 'ab'

		count = 0
		retries = self.params.get('retries', 0)
		while count <= retries:
			# Establish connection
			try:
				data = urllib2.urlopen(request)
				break
			except (urllib2.HTTPError, ), err:
				# Only 5xx errors and 416 are handled; everything else
				# propagates to the caller.
				if (err.code < 500 or err.code >= 600) and err.code != 416:
					# Unexpected HTTP error
					raise
				elif err.code == 416:
					# Unable to resume (requested range not satisfiable)
					try:
						# Open the connection again without the range header
						data = urllib2.urlopen(basic_request)
						content_length = data.info()['Content-Length']
					except (urllib2.HTTPError, ), err:
						if err.code < 500 or err.code >= 600:
							raise
					else:
						# Examine the reported length
						if (content_length is not None and
						    (resume_len - 100 < long(content_length) < resume_len + 100)):
							# The file had already been fully downloaded.
							# Explanation to the above condition: in issue #175 it was revealed that
							# YouTube sometimes adds or removes a few bytes from the end of the file,
							# changing the file size slightly and causing problems for some users. So
							# I decided to implement a suggested change and consider the file
							# completely downloaded if the file size differs less than 100 bytes from
							# the one in the hard drive.
							self.report_file_already_downloaded(filename)
							return True
						else:
							# The length does not match, we start the download over
							self.report_unable_to_resume()
							open_mode = 'wb'
							break
			# Retry
			count += 1
			if count <= retries:
				self.report_retry(count, retries)

		if count > retries:
			self.trouble(u'ERROR: giving up after %s retries' % retries)
			return False

		data_len = data.info().get('Content-length', None)
		data_len_str = self.format_bytes(data_len)
		byte_counter = 0
		block_size = 1024
		start = time.time()
		while True:
			# Download and write
			before = time.time()
			data_block = data.read(block_size)
			after = time.time()
			data_block_len = len(data_block)
			if data_block_len == 0:
				break
			byte_counter += data_block_len

			# Open file just in time
			if stream is None:
				try:
					(stream, filename) = sanitize_open(filename, open_mode)
					self.report_destination(filename)
				except (OSError, IOError), err:
					self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
					return False
			try:
				stream.write(data_block)
			except (IOError, OSError), err:
				self.trouble(u'\nERROR: unable to write data: %s' % str(err))
				return False
			# Adapt the read size to the measured throughput.
			block_size = self.best_block_size(after - before, data_block_len)

			# Progress message
			percent_str = self.calc_percent(byte_counter, data_len)
			eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
			speed_str = self.calc_speed(start, time.time(), byte_counter)
			self.report_progress(percent_str, data_len_str, speed_str, eta_str)

			# Apply rate limit
			self.slow_down(start, byte_counter)

		self.report_finish()
		# data_len is still the header string here, hence the str() compare.
		if data_len is not None and str(byte_counter) != data_len:
			raise ContentTooShortError(byte_counter, long(data_len))
		return True
617
class InfoExtractor(object):
	"""Information Extractor class.

	Information extractors are the classes that, given a URL, extract
	information from the video (or videos) the URL refers to. This
	information includes the real video URL, the video title and simplified
	title, author and others. The information is stored in a dictionary
	which is then passed to the FileDownloader. The FileDownloader
	processes this information possibly downloading the video to the file
	system, among other possible outcomes. The dictionaries must include
	the following fields:

	id:             Video identifier.
	url:            Final video URL.
	uploader:       Nickname of the video uploader.
	title:          Literal title.
	stitle:         Simplified title.
	ext:            Video filename extension.
	format:         Video format.
	player_url:     SWF Player URL (may be None).

	The following fields are optional. Their primary purpose is to allow
	youtube-dl to serve as the backend for a video search function, such
	as the one in youtube2mp3.  They are only used when their respective
	forced printing functions are called:

	thumbnail:      Full URL to a video thumbnail image.
	description:    One-line video description.

	Subclasses of this one should re-define the _real_initialize() and
	_real_extract() methods, as well as the suitable() static method.
	Probably, they should also be instantiated and added to the main
	downloader.
	"""

	# Per-instance state; real values are set in __init__/set_downloader.
	_ready = False  # becomes True once initialize() has run _real_initialize()
	_downloader = None  # FileDownloader set via set_downloader()
655
656         def __init__(self, downloader=None):
657                 """Constructor. Receives an optional downloader."""
658                 self._ready = False
659                 self.set_downloader(downloader)
660
661         @staticmethod
662         def suitable(url):
663                 """Receives a URL and returns True if suitable for this IE."""
664                 return False
665
666         def initialize(self):
667                 """Initializes an instance (authentication, etc)."""
668                 if not self._ready:
669                         self._real_initialize()
670                         self._ready = True
671
672         def extract(self, url):
673                 """Extracts URL information and returns it in list of dicts."""
674                 self.initialize()
675                 return self._real_extract(url)
676
677         def set_downloader(self, downloader):
678                 """Sets the downloader for this IE."""
679                 self._downloader = downloader
680         
681         def _real_initialize(self):
682                 """Real initialization process. Redefine in subclasses."""
683                 pass
684
685         def _real_extract(self, url):
686                 """Real extraction process. Redefine in subclasses."""
687                 pass
688
class YoutubeIE(InfoExtractor):
	"""Information extractor for youtube.com."""

	# Group 1 captures the (optional) scheme/host prefix; group 2 is the
	# video ID. The conditional (?(1).+)? only allows trailing characters
	# when a prefix was present, so a bare video ID must match exactly.
	_VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/(?:(?:v/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))))?([0-9A-Za-z_-]+)(?(1).+)?$'
	_LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
	_LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
	_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
	_NETRC_MACHINE = 'youtube'
	# Listed in order of quality
	_available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
	# Maps format code -> filename extension; codes not listed here fall
	# back to 'flv' in _real_extract().
	_video_extensions = {
		'13': '3gp',
		'17': 'mp4',
		'18': 'mp4',
		'22': 'mp4',
		'37': 'mp4',
		'38': 'video', # You actually don't know if this will be MOV, AVI or whatever
		'43': 'webm',
		'45': 'webm',
	}

	@staticmethod
	def suitable(url):
		"""Return True if the URL matches _VALID_URL."""
		return (re.match(YoutubeIE._VALID_URL, url) is not None)

	def report_lang(self):
		"""Report attempt to set language."""
		self._downloader.to_stdout(u'[youtube] Setting language')

	def report_login(self):
		"""Report attempt to log in."""
		self._downloader.to_stdout(u'[youtube] Logging in')
	
	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_stdout(u'[youtube] Confirming age')
	
	def report_video_webpage_download(self, video_id):
		"""Report attempt to download video webpage."""
		self._downloader.to_stdout(u'[youtube] %s: Downloading video webpage' % video_id)
	
	def report_video_info_webpage_download(self, video_id):
		"""Report attempt to download video info webpage."""
		self._downloader.to_stdout(u'[youtube] %s: Downloading video info webpage' % video_id)
	
	def report_information_extraction(self, video_id):
		"""Report attempt to extract video information."""
		self._downloader.to_stdout(u'[youtube] %s: Extracting video information' % video_id)
	
	def report_unavailable_format(self, video_id, format):
		"""Report that the requested format is not available."""
		self._downloader.to_stdout(u'[youtube] %s: Format %s not available' % (video_id, format))
	
	def report_rtmp_download(self):
		"""Indicate the download will use the RTMP protocol."""
		self._downloader.to_stdout(u'[youtube] RTMP download detected')
	
	def _real_initialize(self):
		"""Set the interface language, then optionally log in and confirm age.

		All steps are best-effort: failures are reported as warnings (or,
		for age confirmation, via trouble()) and initialization stops early.
		"""
		if self._downloader is None:
			return

		username = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					username = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				# .netrc problems are non-fatal: warn and give up on login
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		# Set language (forces English pages so the regexes below match)
		request = urllib2.Request(self._LANG_URL, None, std_headers)
		try:
			self.report_lang()
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
			return

		# No authentication to be performed
		if username is None:
			return

		# Log in
		login_form = {
				'current_form': 'loginForm',
				'next':		'/',
				'action_login':	'Log In',
				'username':	username,
				'password':	password,
				}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# If the login form is still present the credentials were rejected
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return
	
		# Confirm age
		age_form = {
				'next_url':		'/',
				'action_confirm':	'Confirm',
				}
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
		try:
			self.report_age_confirmation()
			age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return

	def _real_extract(self, url):
		"""Extract video information and hand each selected format to the downloader."""
		# Extract video id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = mobj.group(2)

		# Get video webpage
		self.report_video_webpage_download(video_id)
		request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en' % video_id, None, std_headers)
		try:
			video_webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
			return

		# Attempt to extract SWF player URL (may legitimately be absent)
		mobj = re.search(r'swfConfig.*"(http://.*?watch.*?-.*?\.swf)"', video_webpage)
		if mobj is not None:
			player_url = mobj.group(1)
		else:
			player_url = None

		# Get video info: retry with different 'el' parameter values until
		# the response contains a 'token' (the empty string is the default)
		self.report_video_info_webpage_download(video_id)
		for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
			video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
					   % (video_id, el_type))
			request = urllib2.Request(video_info_url, None, std_headers)
			try:
				video_info_webpage = urllib2.urlopen(request).read()
				video_info = parse_qs(video_info_webpage)
				if 'token' in video_info:
					break
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
				return
		if 'token' not in video_info:
			if 'reason' in video_info:
				self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
			else:
				self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
			return

		# Start extracting information
		self.report_information_extraction(video_id)

		# uploader
		if 'author' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = urllib.unquote_plus(video_info['author'][0])

		# title
		if 'title' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract video title')
			return
		video_title = urllib.unquote_plus(video_info['title'][0])
		video_title = video_title.decode('utf-8')
		video_title = sanitize_title(video_title)

		# simplified title: collapse every run of non-alphanumerics to '_'
		simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
		simple_title = simple_title.strip(ur'_')

		# thumbnail image
		if 'thumbnail_url' not in video_info:
			self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
			video_thumbnail = ''
		else:	# don't panic if we can't find it
			video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

		# description (only fetched when it will actually be printed)
		video_description = 'No description available.'
		if self._downloader.params.get('forcedescription', False):
			mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
			if mobj is not None:
				video_description = mobj.group(1)

		# token
		video_token = urllib.unquote_plus(video_info['token'][0])

		# Decide which formats to download; %%s leaves a %s placeholder
		# for the format code to be filled in per format below
		requested_format = self._downloader.params.get('format', None)
		get_video_template = 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=&ps=&asv=&fmt=%%s' % (video_id, video_token)

		if 'fmt_url_map' in video_info:
			# fmt_url_map is a comma-separated list of 'format|url' pairs
			url_map = dict(tuple(pair.split('|')) for pair in video_info['fmt_url_map'][0].split(','))
			format_limit = self._downloader.params.get('format_limit', None)
			# Honor the quality cap: _available_formats is ordered best
			# first, so slice from the limit onwards
			if format_limit is not None and format_limit in self._available_formats:
				format_list = self._available_formats[self._available_formats.index(format_limit):]
			else:
				format_list = self._available_formats
			existing_formats = [x for x in format_list if x in url_map]
			if len(existing_formats) == 0:
				self._downloader.trouble(u'ERROR: no known formats available for video')
				return
			if requested_format is None:
				video_url_list = [(existing_formats[0], get_video_template % existing_formats[0])] # Best quality
			elif requested_format == '-1':
				video_url_list = [(f, get_video_template % f) for f in existing_formats] # All formats
			else:
				video_url_list = [(requested_format, get_video_template % requested_format)] # Specific format

		elif 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
			self.report_rtmp_download()
			video_url_list = [(None, video_info['conn'][0])]

		else:
			self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')
			return

		for format_param, video_real_url in video_url_list:
			# At this point we have a new video
			self._downloader.increment_downloads()

			# Extension (unknown format codes default to flv)
			video_extension = self._video_extensions.get(format_param, 'flv')

			# The URL was already resolved above from fmt_url_map or conn
			try:
				# Process video information
				self._downloader.process_info({
					'id':		video_id.decode('utf-8'),
					'url':		video_real_url.decode('utf-8'),
					'uploader':	video_uploader.decode('utf-8'),
					'title':	video_title,
					'stitle':	simple_title,
					'ext':		video_extension.decode('utf-8'),
					'format':	(format_param is None and u'NA' or format_param.decode('utf-8')),
					'thumbnail':	video_thumbnail.decode('utf-8'),
					'description':	video_description.decode('utf-8'),
					'player_url':	player_url,
				})
			except UnavailableVideoError, err:
				self._downloader.trouble(u'ERROR: unable to download video (format may not be available)')
952
953
954 class MetacafeIE(InfoExtractor):
955         """Information Extractor for metacafe.com."""
956
957         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
958         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
959         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
960         _youtube_ie = None
961
962         def __init__(self, youtube_ie, downloader=None):
963                 InfoExtractor.__init__(self, downloader)
964                 self._youtube_ie = youtube_ie
965
966         @staticmethod
967         def suitable(url):
968                 return (re.match(MetacafeIE._VALID_URL, url) is not None)
969
970         def report_disclaimer(self):
971                 """Report disclaimer retrieval."""
972                 self._downloader.to_stdout(u'[metacafe] Retrieving disclaimer')
973
974         def report_age_confirmation(self):
975                 """Report attempt to confirm age."""
976                 self._downloader.to_stdout(u'[metacafe] Confirming age')
977         
978         def report_download_webpage(self, video_id):
979                 """Report webpage download."""
980                 self._downloader.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)
981         
982         def report_extraction(self, video_id):
983                 """Report information extraction."""
984                 self._downloader.to_stdout(u'[metacafe] %s: Extracting information' % video_id)
985
986         def _real_initialize(self):
987                 # Retrieve disclaimer
988                 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
989                 try:
990                         self.report_disclaimer()
991                         disclaimer = urllib2.urlopen(request).read()
992                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
993                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
994                         return
995
996                 # Confirm age
997                 disclaimer_form = {
998                         'filters': '0',
999                         'submit': "Continue - I'm over 18",
1000                         }
1001                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
1002                 try:
1003                         self.report_age_confirmation()
1004                         disclaimer = urllib2.urlopen(request).read()
1005                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1006                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1007                         return
1008         
1009         def _real_extract(self, url):
1010                 # Extract id and simplified title from URL
1011                 mobj = re.match(self._VALID_URL, url)
1012                 if mobj is None:
1013                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1014                         return
1015
1016                 video_id = mobj.group(1)
1017
1018                 # Check if video comes from YouTube
1019                 mobj2 = re.match(r'^yt-(.*)$', video_id)
1020                 if mobj2 is not None:
1021                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1022                         return
1023
1024                 # At this point we have a new video
1025                 self._downloader.increment_downloads()
1026
1027                 simple_title = mobj.group(2).decode('utf-8')
1028
1029                 # Retrieve video webpage to extract further information
1030                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1031                 try:
1032                         self.report_download_webpage(video_id)
1033                         webpage = urllib2.urlopen(request).read()
1034                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1035                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1036                         return
1037
1038                 # Extract URL, uploader and title from webpage
1039                 self.report_extraction(video_id)
1040                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1041                 if mobj is not None:
1042                         mediaURL = urllib.unquote(mobj.group(1))
1043                         video_extension = mediaURL[-3:]
1044                         
1045                         # Extract gdaKey if available
1046                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1047                         if mobj is None:
1048                                 video_url = mediaURL
1049                         else:
1050                                 gdaKey = mobj.group(1)
1051                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1052                 else:
1053                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1054                         if mobj is None:
1055                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1056                                 return
1057                         vardict = parse_qs(mobj.group(1))
1058                         if 'mediaData' not in vardict:
1059                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1060                                 return
1061                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1062                         if mobj is None:
1063                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1064                                 return
1065                         mediaURL = mobj.group(1).replace('\\/', '/')
1066                         video_extension = mediaURL[-3:]
1067                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1068
1069                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1070                 if mobj is None:
1071                         self._downloader.trouble(u'ERROR: unable to extract title')
1072                         return
1073                 video_title = mobj.group(1).decode('utf-8')
1074                 video_title = sanitize_title(video_title)
1075
1076                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1077                 if mobj is None:
1078                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1079                         return
1080                 video_uploader = mobj.group(1)
1081
1082                 try:
1083                         # Process video information
1084                         self._downloader.process_info({
1085                                 'id':           video_id.decode('utf-8'),
1086                                 'url':          video_url.decode('utf-8'),
1087                                 'uploader':     video_uploader.decode('utf-8'),
1088                                 'title':        video_title,
1089                                 'stitle':       simple_title,
1090                                 'ext':          video_extension.decode('utf-8'),
1091                                 'format':       u'NA',
1092                                 'player_url':   None,
1093                         })
1094                 except UnavailableVideoError:
1095                         self._downloader.trouble(u'ERROR: unable to download video')
1096
1097
1098 class DailymotionIE(InfoExtractor):
1099         """Information Extractor for Dailymotion"""
1100
1101         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1102
1103         def __init__(self, downloader=None):
1104                 InfoExtractor.__init__(self, downloader)
1105
1106         @staticmethod
1107         def suitable(url):
1108                 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1109
1110         def report_download_webpage(self, video_id):
1111                 """Report webpage download."""
1112                 self._downloader.to_stdout(u'[dailymotion] %s: Downloading webpage' % video_id)
1113         
1114         def report_extraction(self, video_id):
1115                 """Report information extraction."""
1116                 self._downloader.to_stdout(u'[dailymotion] %s: Extracting information' % video_id)
1117
1118         def _real_initialize(self):
1119                 return
1120
1121         def _real_extract(self, url):
1122                 # Extract id and simplified title from URL
1123                 mobj = re.match(self._VALID_URL, url)
1124                 if mobj is None:
1125                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1126                         return
1127
1128                 # At this point we have a new video
1129                 self._downloader.increment_downloads()
1130                 video_id = mobj.group(1)
1131
1132                 simple_title = mobj.group(2).decode('utf-8')
1133                 video_extension = 'flv'
1134
1135                 # Retrieve video webpage to extract further information
1136                 request = urllib2.Request(url)
1137                 try:
1138                         self.report_download_webpage(video_id)
1139                         webpage = urllib2.urlopen(request).read()
1140                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1141                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1142                         return
1143
1144                 # Extract URL, uploader and title from webpage
1145                 self.report_extraction(video_id)
1146                 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1147                 if mobj is None:
1148                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1149                         return
1150                 mediaURL = urllib.unquote(mobj.group(1))
1151
1152                 # if needed add http://www.dailymotion.com/ if relative URL
1153
1154                 video_url = mediaURL
1155
1156                 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1157                 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1158                 if mobj is None:
1159                         self._downloader.trouble(u'ERROR: unable to extract title')
1160                         return
1161                 video_title = mobj.group(1).decode('utf-8')
1162                 video_title = sanitize_title(video_title)
1163
1164                 mobj = re.search(r'(?im)<div class="dmco_html owner">.*?<a class="name" href="/.+?">(.+?)</a>', webpage)
1165                 if mobj is None:
1166                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1167                         return
1168                 video_uploader = mobj.group(1)
1169
1170                 try:
1171                         # Process video information
1172                         self._downloader.process_info({
1173                                 'id':           video_id.decode('utf-8'),
1174                                 'url':          video_url.decode('utf-8'),
1175                                 'uploader':     video_uploader.decode('utf-8'),
1176                                 'title':        video_title,
1177                                 'stitle':       simple_title,
1178                                 'ext':          video_extension.decode('utf-8'),
1179                                 'format':       u'NA',
1180                                 'player_url':   None,
1181                         })
1182                 except UnavailableVideoError:
1183                         self._downloader.trouble(u'ERROR: unable to download video')
1184
1185 class GoogleIE(InfoExtractor):
1186         """Information extractor for video.google.com."""
1187
1188         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1189
1190         def __init__(self, downloader=None):
1191                 InfoExtractor.__init__(self, downloader)
1192
1193         @staticmethod
1194         def suitable(url):
1195                 return (re.match(GoogleIE._VALID_URL, url) is not None)
1196
1197         def report_download_webpage(self, video_id):
1198                 """Report webpage download."""
1199                 self._downloader.to_stdout(u'[video.google] %s: Downloading webpage' % video_id)
1200
1201         def report_extraction(self, video_id):
1202                 """Report information extraction."""
1203                 self._downloader.to_stdout(u'[video.google] %s: Extracting information' % video_id)
1204
1205         def _real_initialize(self):
1206                 return
1207
	def _real_extract(self, url):
		"""Download a Google Video page, extract the media URL, title and
		(optionally) thumbnail, and hand the result to the downloader."""
		# Extract id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
			return

		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(1)

		video_extension = 'mp4'

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader, and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r"download_url:'([^']+)'", webpage)
		if mobj is None:
			# No direct MP4 download link; fall back to the escaped FLV
			# URL embedded in the page's JavaScript.
			video_extension = 'flv'
			mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		mediaURL = urllib.unquote(mobj.group(1))
		# Unescape the literal '\x3d' ('=') and '\x26' ('&') sequences that
		# appear in the JavaScript-escaped URL.
		mediaURL = mediaURL.replace('\\x3d', '\x3d')
		mediaURL = mediaURL.replace('\\x26', '\x26')

		video_url = mediaURL

		mobj = re.search(r'<title>(.*)</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)
		# Collapse any run of non-alphanumeric characters into '_' for the
		# filesystem-safe "simple title".
		simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

		# Extract video description
		mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract video description')
			return
		video_description = mobj.group(1).decode('utf-8')
		if not video_description:
			video_description = 'No description available.'

		# Extract video thumbnail (requires a second page fetch, so only do
		# it when the user asked for the thumbnail explicitly).
		if self._downloader.params.get('forcethumbnail', False):
			# NOTE(review): the docid is numeric and the search query uses
			# its absolute value — confirm why the sign is dropped here.
			request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
			try:
				webpage = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
				return
			mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
				return
			video_thumbnail = mobj.group(1)
		else:	# we need something to pass to process_info
			video_thumbnail = ''


		try:
			# Process video information
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	u'NA',
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	u'NA',
				'player_url':	None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'ERROR: unable to download video')
1293
1294
class PhotobucketIE(InfoExtractor):
	"""Information extractor for photobucket.com."""

	# Group 1 captures the .flv filename from the 'current' query parameter.
	_VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'

	def __init__(self, downloader=None):
		"""Initialize the extractor with an optional FileDownloader."""
		InfoExtractor.__init__(self, downloader)

	@staticmethod
	def suitable(url):
		"""Return True if this extractor can handle the given URL."""
		return (re.match(PhotobucketIE._VALID_URL, url) is not None)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_stdout(u'[photobucket] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_stdout(u'[photobucket] %s: Extracting information' % video_id)

	def _real_initialize(self):
		"""No initialization (e.g. login) is needed for Photobucket."""
		return

	def _real_extract(self, url):
		"""Download the Photobucket page, extract the media URL, title and
		uploader, and hand the result to the downloader."""
		# Extract id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
			return

		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(1)

		video_extension = 'flv'

		# Retrieve video webpage to extract further information
		request = urllib2.Request(url)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader, and title from webpage
		self.report_extraction(video_id)
		# The media URL is the 'file' argument of the video_src link.
		mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		mediaURL = urllib.unquote(mobj.group(1))

		video_url = mediaURL

		# The page title has the form "<title> video by <uploader> - Photobucket".
		mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)
		# Collapse non-alphanumeric runs into '_' for the filesystem-safe title.
		simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

		video_uploader = mobj.group(2).decode('utf-8')

		try:
			# Process video information
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	video_uploader,
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	u'NA',
				'player_url':	None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'ERROR: unable to download video')
1374
1375
1376 class YahooIE(InfoExtractor):
1377         """Information extractor for video.yahoo.com."""
1378
1379         # _VALID_URL matches all Yahoo! Video URLs
1380         # _VPAGE_URL matches only the extractable '/watch/' URLs
1381         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1382         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1383
1384         def __init__(self, downloader=None):
1385                 InfoExtractor.__init__(self, downloader)
1386
1387         @staticmethod
1388         def suitable(url):
1389                 return (re.match(YahooIE._VALID_URL, url) is not None)
1390
1391         def report_download_webpage(self, video_id):
1392                 """Report webpage download."""
1393                 self._downloader.to_stdout(u'[video.yahoo] %s: Downloading webpage' % video_id)
1394
1395         def report_extraction(self, video_id):
1396                 """Report information extraction."""
1397                 self._downloader.to_stdout(u'[video.yahoo] %s: Extracting information' % video_id)
1398
1399         def _real_initialize(self):
1400                 return
1401
1402         def _real_extract(self, url, new_video=True):
1403                 # Extract ID from URL
1404                 mobj = re.match(self._VALID_URL, url)
1405                 if mobj is None:
1406                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1407                         return
1408
1409                 # At this point we have a new video
1410                 self._downloader.increment_downloads()
1411                 video_id = mobj.group(2)
1412                 video_extension = 'flv'
1413
1414                 # Rewrite valid but non-extractable URLs as
1415                 # extractable English language /watch/ URLs
1416                 if re.match(self._VPAGE_URL, url) is None:
1417                         request = urllib2.Request(url)
1418                         try:
1419                                 webpage = urllib2.urlopen(request).read()
1420                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1421                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1422                                 return
1423
1424                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1425                         if mobj is None:
1426                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1427                                 return
1428                         yahoo_id = mobj.group(1)
1429
1430                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1431                         if mobj is None:
1432                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1433                                 return
1434                         yahoo_vid = mobj.group(1)
1435
1436                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1437                         return self._real_extract(url, new_video=False)
1438
1439                 # Retrieve video webpage to extract further information
1440                 request = urllib2.Request(url)
1441                 try:
1442                         self.report_download_webpage(video_id)
1443                         webpage = urllib2.urlopen(request).read()
1444                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1445                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1446                         return
1447
1448                 # Extract uploader and title from webpage
1449                 self.report_extraction(video_id)
1450                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1451                 if mobj is None:
1452                         self._downloader.trouble(u'ERROR: unable to extract video title')
1453                         return
1454                 video_title = mobj.group(1).decode('utf-8')
1455                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1456
1457                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1458                 if mobj is None:
1459                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1460                         return
1461                 video_uploader = mobj.group(1).decode('utf-8')
1462
1463                 # Extract video thumbnail
1464                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1465                 if mobj is None:
1466                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1467                         return
1468                 video_thumbnail = mobj.group(1).decode('utf-8')
1469
1470                 # Extract video description
1471                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1472                 if mobj is None:
1473                         self._downloader.trouble(u'ERROR: unable to extract video description')
1474                         return
1475                 video_description = mobj.group(1).decode('utf-8')
1476                 if not video_description: video_description = 'No description available.'
1477
1478                 # Extract video height and width
1479                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1480                 if mobj is None:
1481                         self._downloader.trouble(u'ERROR: unable to extract video height')
1482                         return
1483                 yv_video_height = mobj.group(1)
1484
1485                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1486                 if mobj is None:
1487                         self._downloader.trouble(u'ERROR: unable to extract video width')
1488                         return
1489                 yv_video_width = mobj.group(1)
1490
1491                 # Retrieve video playlist to extract media URL
1492                 # I'm not completely sure what all these options are, but we
1493                 # seem to need most of them, otherwise the server sends a 401.
1494                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1495                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1496                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1497                                           '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1498                                           '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1499                 try:
1500                         self.report_download_webpage(video_id)
1501                         webpage = urllib2.urlopen(request).read()
1502                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1503                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1504                         return
1505
1506                 # Extract media URL from playlist XML
1507                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1508                 if mobj is None:
1509                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1510                         return
1511                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1512                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1513
1514                 try:
1515                         # Process video information
1516                         self._downloader.process_info({
1517                                 'id':           video_id.decode('utf-8'),
1518                                 'url':          video_url,
1519                                 'uploader':     video_uploader,
1520                                 'title':        video_title,
1521                                 'stitle':       simple_title,
1522                                 'ext':          video_extension.decode('utf-8'),
1523                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1524                                 'description':  video_description,
1525                                 'thumbnail':    video_thumbnail,
1526                                 'description':  video_description,
1527                                 'player_url':   None,
1528                         })
1529                 except UnavailableVideoError:
1530                         self._downloader.trouble(u'ERROR: unable to download video')
1531
1532
1533 class GenericIE(InfoExtractor):
1534         """Generic last-resort information extractor."""
1535
1536         def __init__(self, downloader=None):
1537                 InfoExtractor.__init__(self, downloader)
1538
1539         @staticmethod
1540         def suitable(url):
1541                 return True
1542
1543         def report_download_webpage(self, video_id):
1544                 """Report webpage download."""
1545                 self._downloader.to_stdout(u'WARNING: Falling back on generic information extractor.')
1546                 self._downloader.to_stdout(u'[generic] %s: Downloading webpage' % video_id)
1547
1548         def report_extraction(self, video_id):
1549                 """Report information extraction."""
1550                 self._downloader.to_stdout(u'[generic] %s: Extracting information' % video_id)
1551
1552         def _real_initialize(self):
1553                 return
1554
1555         def _real_extract(self, url):
1556                 # At this point we have a new video
1557                 self._downloader.increment_downloads()
1558
1559                 video_id = url.split('/')[-1]
1560                 request = urllib2.Request(url)
1561                 try:
1562                         self.report_download_webpage(video_id)
1563                         webpage = urllib2.urlopen(request).read()
1564                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1565                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1566                         return
1567                 except ValueError, err:
1568                         # since this is the last-resort InfoExtractor, if
1569                         # this error is thrown, it'll be thrown here
1570                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1571                         return
1572
1573                 # Start with something easy: JW Player in SWFObject
1574                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1575                 if mobj is None:
1576                         # Broaden the search a little bit
1577                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1578                 if mobj is None:
1579                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1580                         return
1581
1582                 # It's possible that one of the regexes
1583                 # matched, but returned an empty group:
1584                 if mobj.group(1) is None:
1585                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1586                         return
1587
1588                 video_url = urllib.unquote(mobj.group(1))
1589                 video_id  = os.path.basename(video_url)
1590
1591                 # here's a fun little line of code for you:
1592                 video_extension = os.path.splitext(video_id)[1][1:]
1593                 video_id        = os.path.splitext(video_id)[0]
1594
1595                 # it's tempting to parse this further, but you would
1596                 # have to take into account all the variations like
1597                 #   Video Title - Site Name
1598                 #   Site Name | Video Title
1599                 #   Video Title - Tagline | Site Name
1600                 # and so on and so forth; it's just not practical
1601                 mobj = re.search(r'<title>(.*)</title>', webpage)
1602                 if mobj is None:
1603                         self._downloader.trouble(u'ERROR: unable to extract title')
1604                         return
1605                 video_title = mobj.group(1).decode('utf-8')
1606                 video_title = sanitize_title(video_title)
1607                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1608
1609                 # video uploader is domain name
1610                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1611                 if mobj is None:
1612                         self._downloader.trouble(u'ERROR: unable to extract title')
1613                         return
1614                 video_uploader = mobj.group(1).decode('utf-8')
1615
1616                 try:
1617                         # Process video information
1618                         self._downloader.process_info({
1619                                 'id':           video_id.decode('utf-8'),
1620                                 'url':          video_url.decode('utf-8'),
1621                                 'uploader':     video_uploader,
1622                                 'title':        video_title,
1623                                 'stitle':       simple_title,
1624                                 'ext':          video_extension.decode('utf-8'),
1625                                 'format':       u'NA',
1626                                 'player_url':   None,
1627                         })
1628                 except UnavailableVideoError, err:
1629                         self._downloader.trouble(u'ERROR: unable to download video')
1630
1631
class YoutubeSearchIE(InfoExtractor):
	"""Information Extractor for YouTube search queries."""
	# Queries look like "ytsearch:foo", "ytsearchN:foo" or "ytsearchall:foo".
	_VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
	_TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
	_VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
	_MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
	_youtube_ie = None
	_max_youtube_results = 1000

	def __init__(self, youtube_ie, downloader=None):
		"""Initialize with the YoutubeIE used to extract each result."""
		InfoExtractor.__init__(self, downloader)
		self._youtube_ie = youtube_ie

	@staticmethod
	def suitable(url):
		"""Return True if this extractor can handle the given query string."""
		return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)

	def report_download_page(self, query, pagenum):
		"""Report attempt to download playlist page with given number."""
		query = query.decode(preferredencoding())
		self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

	def _real_initialize(self):
		"""Delegate initialization to the wrapped YoutubeIE."""
		self._youtube_ie.initialize()

	def _real_extract(self, query):
		"""Parse the ytsearch prefix and download the requested number of results."""
		mobj = re.match(self._VALID_QUERY, query)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
			return

		# Split "ytsearchN" prefix from the query text; prefix[8:] is
		# '' (first result), 'all', or the requested result count.
		prefix, query = query.split(':')
		prefix = prefix[8:]
		query  = query.encode('utf-8')
		if prefix == '':
			self._download_n_results(query, 1)
			return
		elif prefix == 'all':
			self._download_n_results(query, self._max_youtube_results)
			return
		else:
			try:
				n = long(prefix)
				if n <= 0:
					self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
					return
				elif n > self._max_youtube_results:
					self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
					n = self._max_youtube_results
				self._download_n_results(query, n)
				return
			except ValueError: # parsing prefix as integer fails
				self._download_n_results(query, 1)
				return

	def _download_n_results(self, query, n):
		"""Downloads a specified number of results for a query"""

		video_ids = []
		already_seen = set()
		pagenum = 1

		while True:
			self.report_download_page(query, pagenum)
			result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
			request = urllib2.Request(result_url, None, std_headers)
			try:
				page = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
				return

			# Extract video identifiers
			for mobj in re.finditer(self._VIDEO_INDICATOR, page):
				# The match is the whole href attribute; split on '=' and
				# drop the trailing quote to isolate the video id.
				video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
				if video_id not in already_seen:
					video_ids.append(video_id)
					already_seen.add(video_id)
					if len(video_ids) == n:
						# Specified n videos reached
						for id in video_ids:
							self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
						return

			# No "Next" link means this was the last results page;
			# extract whatever was collected so far.
			if re.search(self._MORE_PAGES_INDICATOR, page) is None:
				for id in video_ids:
					self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
				return

			pagenum = pagenum + 1
1722
class GoogleSearchIE(InfoExtractor):
	"""Information Extractor for Google Video search queries."""
	# Queries look like "gvsearch:foo", "gvsearchN:foo" or "gvsearchall:foo".
	_VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
	_TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
	_VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
	_MORE_PAGES_INDICATOR = r'<span>Next</span>'
	_google_ie = None
	_max_google_results = 1000

	def __init__(self, google_ie, downloader=None):
		"""Initialize with the GoogleIE used to extract each result."""
		InfoExtractor.__init__(self, downloader)
		self._google_ie = google_ie

	@staticmethod
	def suitable(url):
		"""Return True if this extractor can handle the given query string."""
		return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)

	def report_download_page(self, query, pagenum):
		"""Report attempt to download playlist page with given number."""
		query = query.decode(preferredencoding())
		self._downloader.to_stdout(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

	def _real_initialize(self):
		"""Delegate initialization to the wrapped GoogleIE."""
		self._google_ie.initialize()

	def _real_extract(self, query):
		"""Parse the gvsearch prefix and download the requested number of results."""
		mobj = re.match(self._VALID_QUERY, query)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
			return

		# Split "gvsearchN" prefix from the query text; prefix[8:] is
		# '' (first result), 'all', or the requested result count.
		prefix, query = query.split(':')
		prefix = prefix[8:]
		query  = query.encode('utf-8')
		if prefix == '':
			self._download_n_results(query, 1)
			return
		elif prefix == 'all':
			self._download_n_results(query, self._max_google_results)
			return
		else:
			try:
				n = long(prefix)
				if n <= 0:
					self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
					return
				elif n > self._max_google_results:
					self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)'  % (self._max_google_results, n))
					n = self._max_google_results
				self._download_n_results(query, n)
				return
			except ValueError: # parsing prefix as integer fails
				self._download_n_results(query, 1)
				return

	def _download_n_results(self, query, n):
		"""Downloads a specified number of results for a query"""

		video_ids = []
		already_seen = set()
		pagenum = 1

		while True:
			self.report_download_page(query, pagenum)
			# NOTE(review): pagenum is substituted into the 'start='
			# parameter; verify whether Google expects a result offset
			# (multiples of the page size) rather than a page number.
			result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
			request = urllib2.Request(result_url, None, std_headers)
			try:
				page = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
				return

			# Extract video identifiers
			for mobj in re.finditer(self._VIDEO_INDICATOR, page):
				video_id = mobj.group(1)
				if video_id not in already_seen:
					video_ids.append(video_id)
					already_seen.add(video_id)
					if len(video_ids) == n:
						# Specified n videos reached
						for id in video_ids:
							self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
						return

			# No "Next" link means this was the last results page;
			# extract whatever was collected so far.
			if re.search(self._MORE_PAGES_INDICATOR, page) is None:
				for id in video_ids:
					self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
				return

			pagenum = pagenum + 1
1813
1814 class YahooSearchIE(InfoExtractor):
1815         """Information Extractor for Yahoo! Video search queries."""
1816         _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
1817         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1818         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1819         _MORE_PAGES_INDICATOR = r'\s*Next'
1820         _yahoo_ie = None
1821         _max_yahoo_results = 1000
1822
1823         def __init__(self, yahoo_ie, downloader=None):
1824                 InfoExtractor.__init__(self, downloader)
1825                 self._yahoo_ie = yahoo_ie
1826         
1827         @staticmethod
1828         def suitable(url):
1829                 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
1830
1831         def report_download_page(self, query, pagenum):
1832                 """Report attempt to download playlist page with given number."""
1833                 query = query.decode(preferredencoding())
1834                 self._downloader.to_stdout(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1835
1836         def _real_initialize(self):
1837                 self._yahoo_ie.initialize()
1838         
1839         def _real_extract(self, query):
1840                 mobj = re.match(self._VALID_QUERY, query)
1841                 if mobj is None:
1842                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1843                         return
1844
1845                 prefix, query = query.split(':')
1846                 prefix = prefix[8:]
1847                 query  = query.encode('utf-8')
1848                 if prefix == '':
1849                         self._download_n_results(query, 1)
1850                         return
1851                 elif prefix == 'all':
1852                         self._download_n_results(query, self._max_yahoo_results)
1853                         return
1854                 else:
1855                         try:
1856                                 n = long(prefix)
1857                                 if n <= 0:
1858                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1859                                         return
1860                                 elif n > self._max_yahoo_results:
1861                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)'  % (self._max_yahoo_results, n))
1862                                         n = self._max_yahoo_results
1863                                 self._download_n_results(query, n)
1864                                 return
1865                         except ValueError: # parsing prefix as integer fails
1866                                 self._download_n_results(query, 1)
1867                                 return
1868
1869         def _download_n_results(self, query, n):
1870                 """Downloads a specified number of results for a query"""
1871
1872                 video_ids = []
1873                 already_seen = set()
1874                 pagenum = 1
1875
1876                 while True:
1877                         self.report_download_page(query, pagenum)
1878                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1879                         request = urllib2.Request(result_url, None, std_headers)
1880                         try:
1881                                 page = urllib2.urlopen(request).read()
1882                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1883                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1884                                 return
1885
1886                         # Extract video identifiers
1887                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1888                                 video_id = mobj.group(1)
1889                                 if video_id not in already_seen:
1890                                         video_ids.append(video_id)
1891                                         already_seen.add(video_id)
1892                                         if len(video_ids) == n:
1893                                                 # Specified n videos reached
1894                                                 for id in video_ids:
1895                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1896                                                 return
1897
1898                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1899                                 for id in video_ids:
1900                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1901                                 return
1902
1903                         pagenum = pagenum + 1
1904
1905 class YoutubePlaylistIE(InfoExtractor):
1906         """Information Extractor for YouTube playlists."""
1907
1908         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/)([^&]+).*'
1909         _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
1910         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
1911         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1912         _youtube_ie = None
1913
1914         def __init__(self, youtube_ie, downloader=None):
1915                 InfoExtractor.__init__(self, downloader)
1916                 self._youtube_ie = youtube_ie
1917         
1918         @staticmethod
1919         def suitable(url):
1920                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
1921
1922         def report_download_page(self, playlist_id, pagenum):
1923                 """Report attempt to download playlist page with given number."""
1924                 self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1925
1926         def _real_initialize(self):
1927                 self._youtube_ie.initialize()
1928         
1929         def _real_extract(self, url):
1930                 # Extract playlist id
1931                 mobj = re.match(self._VALID_URL, url)
1932                 if mobj is None:
1933                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1934                         return
1935
1936                 # Download playlist pages
1937                 playlist_id = mobj.group(1)
1938                 video_ids = []
1939                 pagenum = 1
1940
1941                 while True:
1942                         self.report_download_page(playlist_id, pagenum)
1943                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
1944                         try:
1945                                 page = urllib2.urlopen(request).read()
1946                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1947                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1948                                 return
1949
1950                         # Extract video identifiers
1951                         ids_in_page = []
1952                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1953                                 if mobj.group(1) not in ids_in_page:
1954                                         ids_in_page.append(mobj.group(1))
1955                         video_ids.extend(ids_in_page)
1956
1957                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1958                                 break
1959                         pagenum = pagenum + 1
1960
1961                 playliststart = self._downloader.params.get('playliststart', 1)
1962                 playliststart -= 1 #our arrays are zero-based but the playlist is 1-based
1963                 if playliststart > 0:
1964                         video_ids = video_ids[playliststart:]
1965                         
1966                 for id in video_ids:
1967                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1968                 return
1969
1970 class YoutubeUserIE(InfoExtractor):
1971         """Information Extractor for YouTube users."""
1972
1973         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
1974         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1975         _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
1976         _youtube_ie = None
1977
1978         def __init__(self, youtube_ie, downloader=None):
1979                 InfoExtractor.__init__(self, downloader)
1980                 self._youtube_ie = youtube_ie
1981         
1982         @staticmethod
1983         def suitable(url):
1984                 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
1985
1986         def report_download_page(self, username):
1987                 """Report attempt to download user page."""
1988                 self._downloader.to_stdout(u'[youtube] user %s: Downloading page ' % (username))
1989
1990         def _real_initialize(self):
1991                 self._youtube_ie.initialize()
1992         
1993         def _real_extract(self, url):
1994                 # Extract username
1995                 mobj = re.match(self._VALID_URL, url)
1996                 if mobj is None:
1997                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1998                         return
1999
2000                 # Download user page
2001                 username = mobj.group(1)
2002                 video_ids = []
2003                 pagenum = 1
2004
2005                 self.report_download_page(username)
2006                 request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
2007                 try:
2008                         page = urllib2.urlopen(request).read()
2009                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2010                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2011                         return
2012
2013                 # Extract video identifiers
2014                 ids_in_page = []
2015
2016                 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2017                         if mobj.group(1) not in ids_in_page:
2018                                 ids_in_page.append(mobj.group(1))
2019                 video_ids.extend(ids_in_page)
2020
2021                 playliststart = self._downloader.params.get('playliststart', 1)
2022                 playliststart = playliststart-1 #our arrays are zero-based but the playlist is 1-based
2023                 if playliststart > 0:
2024                         video_ids = video_ids[playliststart:]   
2025
2026                 for id in video_ids:
2027                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2028                 return
2029
class PostProcessor(object):
	"""Base class for post-processing steps.

	Instances are registered on a downloader through its
	add_post_processor() method. Once a download completes successfully,
	the downloader invokes run() on each registered processor in turn,
	feeding every call the dictionary returned by the previous one, and
	stops as soon as a processor returns None or the chain is exhausted.

	Like InfoExtractor, this class takes part in a "mutual registration"
	handshake with its downloader.
	"""

	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Attach a downloader to this post processor."""
		self._downloader = downloader

	def run(self, information):
		"""Apply this processor to one downloaded file.

		The "information" argument is an InfoExtractor-style dictionary
		extended with a "filepath" key naming the downloaded file.

		Returning None halts the post-processing chain; returning a
		(possibly modified) information dictionary forwards it to the
		next processor in the chain. Implementations may also raise
		PostProcessingError, which the calling downloader handles.

		The base implementation is a no-op that returns its argument.
		"""
		return information
2075         
### MAIN PROGRAM ###
# Command-line entry point: parses options, wires every InfoExtractor
# into a FileDownloader, and runs the requested downloads.
if __name__ == '__main__':
	try:
		# Modules needed only when running the main program
		import getpass
		import optparse

		# Function to update the program file with the latest version from bitbucket.org
		def update_self(downloader, filename):
			# Note: downloader only used for options
			if not os.access (filename, os.W_OK):
				sys.exit('ERROR: no write permissions on %s' % filename)

			# Fetch the latest release tag, then overwrite this script in
			# place with the matching published version.
			downloader.to_stdout('Updating to latest stable version...')
			latest_url = 'http://bitbucket.org/rg3/youtube-dl/raw/tip/LATEST_VERSION'
			latest_version = urllib.urlopen(latest_url).read().strip()
			prog_url = 'http://bitbucket.org/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
			newcontent = urllib.urlopen(prog_url).read()
			stream = open(filename, 'w')
			stream.write(newcontent)
			stream.close()
			downloader.to_stdout('Updated to version %s' % latest_version)

		# General configuration
		# NOTE(review): the second install_opener() call replaces the first;
		# proxy support presumably survives because build_opener() includes a
		# default ProxyHandler -- confirm before merging into one call.
		urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
		urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
		socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

		# Parse command line
		parser = optparse.OptionParser(
			usage='Usage: %prog [options] url...',
			version='2010.10.03',
			conflict_handler='resolve',
		)

		parser.add_option('-h', '--help',
				action='help', help='print this help text and exit')
		parser.add_option('-v', '--version',
				action='version', help='print program version and exit')
		parser.add_option('-U', '--update',
				action='store_true', dest='update_self', help='update this program to latest stable version')
		parser.add_option('-i', '--ignore-errors',
				action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
		parser.add_option('-r', '--rate-limit',
				dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
		parser.add_option('-R', '--retries',
				dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
		parser.add_option('--playlist-start',
				dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)

		authentication = optparse.OptionGroup(parser, 'Authentication Options')
		authentication.add_option('-u', '--username',
				dest='username', metavar='USERNAME', help='account username')
		authentication.add_option('-p', '--password',
				dest='password', metavar='PASSWORD', help='account password')
		authentication.add_option('-n', '--netrc',
				action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
		parser.add_option_group(authentication)

		video_format = optparse.OptionGroup(parser, 'Video Format Options')
		video_format.add_option('-f', '--format',
				action='store', dest='format', metavar='FORMAT', help='video format code')
		video_format.add_option('-m', '--mobile-version',
				action='store_const', dest='format', help='alias for -f 17', const='17')
		video_format.add_option('--all-formats',
				action='store_const', dest='format', help='download all available video formats', const='-1')
		video_format.add_option('--max-quality',
				action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
		video_format.add_option('-b', '--best-quality',
				action='store_true', dest='bestquality', help='download the best video quality (DEPRECATED)')
		parser.add_option_group(video_format)

		verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
		verbosity.add_option('-q', '--quiet',
				action='store_true', dest='quiet', help='activates quiet mode', default=False)
		verbosity.add_option('-s', '--simulate',
				action='store_true', dest='simulate', help='do not download video', default=False)
		verbosity.add_option('-g', '--get-url',
				action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
		verbosity.add_option('-e', '--get-title',
				action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
		verbosity.add_option('--get-thumbnail',
				action='store_true', dest='getthumbnail', help='simulate, quiet but print thumbnail URL', default=False)
		verbosity.add_option('--get-description',
				action='store_true', dest='getdescription', help='simulate, quiet but print video description', default=False)
		verbosity.add_option('--no-progress',
				action='store_true', dest='noprogress', help='do not print progress bar', default=False)
		parser.add_option_group(verbosity)

		filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
		filesystem.add_option('-t', '--title',
				action='store_true', dest='usetitle', help='use title in file name', default=False)
		filesystem.add_option('-l', '--literal',
				action='store_true', dest='useliteral', help='use literal title in file name', default=False)
		filesystem.add_option('-o', '--output',
				dest='outtmpl', metavar='TEMPLATE', help='output filename template')
		filesystem.add_option('-a', '--batch-file',
				dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
		filesystem.add_option('-w', '--no-overwrites',
				action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
		filesystem.add_option('-c', '--continue',
				action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
		parser.add_option_group(filesystem)

		(opts, args) = parser.parse_args()

		# Batch file verification
		batchurls = []
		if opts.batchfile is not None:
			try:
				if opts.batchfile == '-':
					batchfd = sys.stdin
				else:
					batchfd = open(opts.batchfile, 'r')
				batchurls = batchfd.readlines()
				batchurls = [x.strip() for x in batchurls]
				# Drop blank lines so they are not treated as URLs.
				batchurls = [x for x in batchurls if len(x) > 0]
			except IOError:
				sys.exit(u'ERROR: batch file could not be read')
		all_urls = batchurls + args

		# Conflicting, missing and erroneous options
		if opts.bestquality:
			print >>sys.stderr, u'\nWARNING: -b/--best-quality IS DEPRECATED AS IT IS THE DEFAULT BEHAVIOR NOW\n'
		if opts.usenetrc and (opts.username is not None or opts.password is not None):
			parser.error(u'using .netrc conflicts with giving username/password')
		if opts.password is not None and opts.username is None:
			parser.error(u'account username missing')
		if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
			parser.error(u'using output template conflicts with using title or literal title')
		if opts.usetitle and opts.useliteral:
			parser.error(u'using title conflicts with using literal title')
		if opts.username is not None and opts.password is None:
			opts.password = getpass.getpass(u'Type account password and press return:')
		if opts.ratelimit is not None:
			numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
			if numeric_limit is None:
				parser.error(u'invalid rate limit specified')
			opts.ratelimit = numeric_limit
		# Numeric options arrive as strings; convert and validate them.
		if opts.retries is not None:
			try:
				opts.retries = long(opts.retries)
			except (TypeError, ValueError), err:
				parser.error(u'invalid retry count specified')
		if opts.playliststart is not None:
			try:
				opts.playliststart = long(opts.playliststart)
			except (TypeError, ValueError), err:
				parser.error(u'invalid playlist page specified')

		# Information extractors
		youtube_ie = YoutubeIE()
		metacafe_ie = MetacafeIE(youtube_ie)
		dailymotion_ie = DailymotionIE()
		youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
		youtube_user_ie = YoutubeUserIE(youtube_ie)
		youtube_search_ie = YoutubeSearchIE(youtube_ie)
		google_ie = GoogleIE()
		google_search_ie = GoogleSearchIE(google_ie)
		photobucket_ie = PhotobucketIE()
		yahoo_ie = YahooIE()
		yahoo_search_ie = YahooSearchIE(yahoo_ie)
		generic_ie = GenericIE()

		# File downloader
		fd = FileDownloader({
			'usenetrc': opts.usenetrc,
			'username': opts.username,
			'password': opts.password,
			'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
			'forceurl': opts.geturl,
			'forcetitle': opts.gettitle,
			'forcethumbnail': opts.getthumbnail,
			'forcedescription': opts.getdescription,
			'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
			'format': opts.format,
			'format_limit': opts.format_limit,
			'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
				or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
				or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
				or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
				or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
				or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
				or u'%(id)s.%(ext)s'),
			'ignoreerrors': opts.ignoreerrors,
			'ratelimit': opts.ratelimit,
			'nooverwrites': opts.nooverwrites,
			'retries': opts.retries,
			'continuedl': opts.continue_dl,
			'noprogress': opts.noprogress,
			'playliststart': opts.playliststart,
			})
		# Registration order matters: more specific extractors first.
		fd.add_info_extractor(youtube_search_ie)
		fd.add_info_extractor(youtube_pl_ie)
		fd.add_info_extractor(youtube_user_ie)
		fd.add_info_extractor(metacafe_ie)
		fd.add_info_extractor(dailymotion_ie)
		fd.add_info_extractor(youtube_ie)
		fd.add_info_extractor(google_ie)
		fd.add_info_extractor(google_search_ie)
		fd.add_info_extractor(photobucket_ie)
		fd.add_info_extractor(yahoo_ie)
		fd.add_info_extractor(yahoo_search_ie)

		# This must come last since it's the
		# fallback if none of the others work
		fd.add_info_extractor(generic_ie)

		# Update version
		if opts.update_self:
			update_self(fd, sys.argv[0])

		# Maybe do nothing
		if len(all_urls) < 1:
			if not opts.update_self:
				parser.error(u'you must provide at least one URL')
			else:
				sys.exit()
		retcode = fd.download(all_urls)
		sys.exit(retcode)

	except DownloadError:
		sys.exit(1)
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')