Add support for Dailymotion
[youtube-dl] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # License: Public domain code
7 import htmlentitydefs
8 import httplib
9 import locale
10 import math
11 import netrc
12 import os
13 import os.path
14 import re
15 import socket
16 import string
17 import subprocess
18 import sys
19 import time
20 import urllib
21 import urllib2
22
# parse_qs was moved from the cgi module to the urlparse module recently.
try:
	from urlparse import parse_qs
except ImportError:
	from cgi import parse_qs

# Default HTTP headers sent with every request. The User-Agent mimics a
# real browser (Firefox 3.6) so video sites serve their normal pages.
std_headers = {
	'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2) Gecko/20100115 Firefox/3.6',
	'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
	'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
	'Accept-Language': 'en-us,en;q=0.5',
}

# Characters allowed to remain when building "simplified" titles: ASCII
# letters and digits, decoded to unicode so they compare against unicode titles.
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
37
def preferredencoding():
	"""Get preferred encoding.

	Returns the best encoding scheme for the system, based on
	locale.getpreferredencoding() and some further tweaks.
	"""
	try:
		pref = locale.getpreferredencoding()
		# Verify the reported codec actually exists and can encode text;
		# some platforms report bogus or unsupported values.
		u'TEST'.encode(pref)
	except Exception:
		# Narrowed from a bare "except:" so KeyboardInterrupt and
		# SystemExit are no longer swallowed. Fall back to UTF-8 when
		# the locale lookup fails or names an unknown codec.
		pref = 'UTF-8'
	# The original wrapped this in a one-shot generator and called
	# .next() on it; a plain return is equivalent and simpler.
	return pref
53
54 def htmlentity_transform(matchobj):
55         """Transforms an HTML entity to a Unicode character.
56         
57         This function receives a match object and is intended to be used with
58         the re.sub() function.
59         """
60         entity = matchobj.group(1)
61
62         # Known non-numeric HTML entity
63         if entity in htmlentitydefs.name2codepoint:
64                 return unichr(htmlentitydefs.name2codepoint[entity])
65
66         # Unicode character
67         mobj = re.match(ur'(?u)#(x?\d+)', entity)
68         if mobj is not None:
69                 numstr = mobj.group(1)
70                 if numstr.startswith(u'x'):
71                         base = 16
72                         numstr = u'0%s' % numstr
73                 else:
74                         base = 10
75                 return unichr(long(numstr, base))
76
77         # Unknown entity in name, return its literal representation
78         return (u'&%s;' % entity)
79
80 def sanitize_title(utitle):
81         """Sanitizes a video title so it could be used as part of a filename."""
82         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
83         return utitle.replace(unicode(os.sep), u'%')
84
85 def sanitize_open(filename, open_mode):
86         """Try to open the given filename, and slightly tweak it if this fails.
87
88         Attempts to open the given filename. If this fails, it tries to change
89         the filename slightly, step by step, until it's either able to open it
90         or it fails and raises a final exception, like the standard open()
91         function.
92
93         It returns the tuple (stream, definitive_file_name).
94         """
95         try:
96                 if filename == u'-':
97                         return (sys.stdout, filename)
98                 stream = open(filename, open_mode)
99                 return (stream, filename)
100         except (IOError, OSError), err:
101                 # In case of error, try to remove win32 forbidden chars
102                 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
103
104                 # An exception here should be caught in the caller
105                 stream = open(filename, open_mode)
106                 return (stream, filename)
107
108
class DownloadError(Exception):
	"""Download Error exception.

	Raised by FileDownloader objects when they hit a download problem
	and are not configured to continue on errors; it carries the
	appropriate error message.
	"""
117
class SameFileError(Exception):
	"""Same File exception.

	Raised by FileDownloader objects when they detect that multiple
	files would have to be written to the same path on disk.
	"""
125
class PostProcessingError(Exception):
	"""Post Processing exception.

	Raised by a PostProcessor's .run() method to signal that the
	postprocessing task failed.
	"""
133
class UnavailableFormatError(Exception):
	"""Unavailable Format exception.

	Raised when a video is requested in a format that the video does
	not offer.
	"""
141
class ContentTooShortError(Exception):
	"""Content Too Short exception.

	Raised by FileDownloader objects when a downloaded file turns out
	smaller than the size the server announced, which indicates the
	connection was probably interrupted.
	"""
	# Both sizes are in bytes; the class-level defaults are replaced by
	# instance attributes in __init__.
	downloaded = None
	expected = None

	def __init__(self, downloaded, expected):
		self.downloaded = downloaded
		self.expected = expected
156
class FileDownloader(object):
	"""File Downloader class.

	File downloader objects are the ones responsible of downloading the
	actual video file and writing it to disk if the user has requested
	it, among some other tasks. In most cases there should be one per
	program. As, given a video URL, the downloader doesn't know how to
	extract all the needed information, task that InfoExtractors do, it
	has to pass the URL to one of them.

	For this, file downloader objects have a method that allows
	InfoExtractors to be registered in a given order. When it is passed
	a URL, the file downloader handles it to the first InfoExtractor it
	finds that reports being able to handle it. The InfoExtractor extracts
	all the information about the video or videos the URL refers to, and
	asks the FileDownloader to process the video information, possibly
	downloading the video.

	File downloaders accept a lot of parameters. In order not to saturate
	the object constructor with arguments, it receives a dictionary of
	options instead. These options are available through the params
	attribute for the InfoExtractors to use. The FileDownloader also
	registers itself as the downloader in charge for the InfoExtractors
	that are added to it, so this is a "mutual registration".

	Available options:

	username:       Username for authentication purposes.
	password:       Password for authentication purposes.
	usenetrc:       Use netrc for authentication instead.
	quiet:          Do not print messages to stdout.
	forceurl:       Force printing final URL.
	forcetitle:     Force printing title.
	simulate:       Do not download the video files.
	format:         Video format code.
	outtmpl:        Template for output names.
	ignoreerrors:   Do not stop on download errors.
	ratelimit:      Download speed limit, in bytes/sec.
	nooverwrites:   Prevent overwriting files.
	retries:        Number of times to retry for HTTP error 503
	continuedl:     Try to continue downloads if possible.
	noprogress:     Do not print the progress bar.
	"""

	# Class-level defaults; each one is shadowed by an instance
	# attribute assigned in __init__.
	params = None
	_ies = []
	_pps = []
	_download_retcode = None
	_num_downloads = None

	def __init__(self, params):
		"""Create a FileDownloader object with the given options."""
		self._ies = []
		self._pps = []
		self._download_retcode = 0
		self._num_downloads = 0
		self.params = params

	@staticmethod
	def pmkdir(filename):
		"""Create directory components in filename. Similar to Unix "mkdir -p"."""
		components = filename.split(os.sep)
		# Build the list of ancestor paths, e.g. "a/b/c" -> ["a", "a/b"]
		# (the last component, the file itself, is excluded).
		aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
		aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
		for dir in aggregate:
			if not os.path.exists(dir):
				os.mkdir(dir)

	@staticmethod
	def format_bytes(bytes):
		"""Return a human-readable byte count, e.g. "1.21M"; "N/A" for None."""
		if bytes is None:
			return 'N/A'
		if type(bytes) is str:
			bytes = float(bytes)
		if bytes == 0.0:
			exponent = 0
		else:
			exponent = long(math.log(bytes, 1024.0))
		# NOTE(review): a value of 1024**9 or more would index past the end
		# of this suffix string and raise IndexError.
		suffix = 'bkMGTPEZY'[exponent]
		converted = float(bytes) / float(1024**exponent)
		return '%.2f%s' % (converted, suffix)

	@staticmethod
	def calc_percent(byte_counter, data_len):
		"""Format download progress as a percentage; "---.-%" when the total is unknown."""
		if data_len is None:
			return '---.-%'
		return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))

	@staticmethod
	def calc_eta(start, now, total, current):
		"""Estimate remaining time as "MM:SS"; "--:--" when it cannot be computed."""
		if total is None:
			return '--:--'
		dif = now - start
		if current == 0 or dif < 0.001: # One millisecond
			return '--:--'
		rate = float(current) / dif
		eta = long((float(total) - float(current)) / rate)
		(eta_mins, eta_secs) = divmod(eta, 60)
		if eta_mins > 99:
			return '--:--'
		return '%02d:%02d' % (eta_mins, eta_secs)

	@staticmethod
	def calc_speed(start, now, bytes):
		"""Format the average download speed since start, right-aligned in 10 columns."""
		dif = now - start
		if bytes == 0 or dif < 0.001: # One millisecond
			return '%10s' % '---b/s'
		return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

	@staticmethod
	def best_block_size(elapsed_time, bytes):
		"""Choose the next read size from the last block's throughput.

		The new size is clamped between half and double the previous
		block size, and never exceeds 4 MB.
		"""
		new_min = max(bytes / 2.0, 1.0)
		new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
		if elapsed_time < 0.001:
			return long(new_max)
		rate = bytes / elapsed_time
		if rate > new_max:
			return long(new_max)
		if rate < new_min:
			return long(new_min)
		return long(rate)

	@staticmethod
	def parse_bytes(bytestr):
		"""Parse a string indicating a byte quantity into a long integer."""
		matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
		if matchobj is None:
			return None
		number = float(matchobj.group(1))
		# An empty suffix finds 'b' at index 0, i.e. a multiplier of 1.
		multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
		return long(round(number * multiplier))

	@staticmethod
	def verify_url(url):
		"""Verify a URL is valid and data could be downloaded. Return real data URL."""
		request = urllib2.Request(url, None, std_headers)
		data = urllib2.urlopen(request)
		# Read a single byte to make sure the server actually serves data.
		data.read(1)
		url = data.geturl()
		data.close()
		return url

	def add_info_extractor(self, ie):
		"""Add an InfoExtractor object to the end of the list."""
		self._ies.append(ie)
		ie.set_downloader(self)

	def add_post_processor(self, pp):
		"""Add a PostProcessor object to the end of the chain."""
		self._pps.append(pp)
		pp.set_downloader(self)

	def to_stdout(self, message, skip_eol=False, ignore_encoding_errors=False):
		"""Print message to stdout if not in quiet mode."""
		try:
			if not self.params.get('quiet', False):
				# The trailing comma suppresses print's own newline; the
				# selected suffix adds one back unless skip_eol is set.
				print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(preferredencoding()),
			sys.stdout.flush()
		except (UnicodeEncodeError), err:
			if not ignore_encoding_errors:
				raise

	def to_stderr(self, message):
		"""Print message to stderr."""
		print >>sys.stderr, message.encode(preferredencoding())

	def fixed_template(self):
		"""Checks if the output template is fixed.

		Fixed means it contains no "%(field)s"-style placeholders, so it
		can only ever produce a single filename.
		"""
		return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)

	def trouble(self, message=None):
		"""Determine action to take when a download problem appears.

		Depending on if the downloader has been configured to ignore
		download errors or not, this method may throw an exception or
		not when errors are found, after printing the message.
		"""
		if message is not None:
			self.to_stderr(message)
		if not self.params.get('ignoreerrors', False):
			raise DownloadError(message)
		self._download_retcode = 1

	def slow_down(self, start_time, byte_counter):
		"""Sleep if the download speed is over the rate limit."""
		rate_limit = self.params.get('ratelimit', None)
		if rate_limit is None or byte_counter == 0:
			return
		now = time.time()
		elapsed = now - start_time
		if elapsed <= 0.0:
			return
		speed = float(byte_counter) / elapsed
		if speed > rate_limit:
			# Sleep exactly long enough for the average speed to drop
			# back down to the configured limit.
			time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)

	def report_destination(self, filename):
		"""Report destination filename."""
		self.to_stdout(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)

	def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
		"""Report download progress."""
		if self.params.get('noprogress', False):
			return
		# The leading "\r" rewrites the current terminal line in place.
		self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
				(percent_str, data_len_str, speed_str, eta_str), skip_eol=True)

	def report_resuming_byte(self, resume_len):
		"""Report attempt to resume at given byte."""
		self.to_stdout(u'[download] Resuming download at byte %s' % resume_len)

	def report_retry(self, count, retries):
		"""Report retry in case of HTTP error 503"""
		self.to_stdout(u'[download] Got HTTP error 503. Retrying (attempt %d of %d)...' % (count, retries))

	def report_file_already_downloaded(self, file_name):
		"""Report file has already been fully downloaded."""
		try:
			self.to_stdout(u'[download] %s has already been downloaded' % file_name)
		except (UnicodeEncodeError), err:
			# The filename may not be representable in the terminal
			# encoding; fall back to a generic message.
			self.to_stdout(u'[download] The file has already been downloaded')

	def report_unable_to_resume(self):
		"""Report it was impossible to resume download."""
		self.to_stdout(u'[download] Unable to resume')

	def report_finish(self):
		"""Report download finished."""
		if self.params.get('noprogress', False):
			self.to_stdout(u'[download] Download completed')
		else:
			# End the "\r"-rewritten progress line with a newline.
			self.to_stdout(u'')

	def process_info(self, info_dict):
		"""Process a single dictionary returned by an InfoExtractor."""
		# Do nothing else if in simulate mode
		if self.params.get('simulate', False):
			# Verify URL if it's an HTTP one
			if info_dict['url'].startswith('http'):
				try:
					self.verify_url(info_dict['url'].encode('utf-8')).decode('utf-8')
				except (OSError, IOError, urllib2.URLError, httplib.HTTPException, socket.error), err:
					raise UnavailableFormatError

			# Forced printings
			if self.params.get('forcetitle', False):
				print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forceurl', False):
				print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
				print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forcedescription', False) and 'description' in info_dict:
				print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')

			return
			
		try:
			template_dict = dict(info_dict)
			template_dict['epoch'] = unicode(long(time.time()))
			template_dict['ord'] = unicode('%05d' % self._num_downloads)
			filename = self.params['outtmpl'] % template_dict
		except (ValueError, KeyError), err:
			self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
			# NOTE(review): when 'ignoreerrors' is set, trouble() returns
			# here and 'filename' is left unbound, so the very next line
			# raises NameError instead of skipping this video.
		if self.params.get('nooverwrites', False) and os.path.exists(filename):
			self.to_stderr(u'WARNING: file exists: %s; skipping' % filename)
			return

		try:
			self.pmkdir(filename)
		except (OSError, IOError), err:
			self.trouble('ERROR: unable to create directories: %s' % str(err))
			return

		try:
			success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
		except (OSError, IOError), err:
			# Local file errors are treated as "this format is unavailable"
			# so callers can try the next format.
			raise UnavailableFormatError
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self.trouble('ERROR: unable to download video data: %s' % str(err))
			return
		except (ContentTooShortError, ), err:
			self.trouble('ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
			return

		if success:
			try:
				self.post_process(filename, info_dict)
			except (PostProcessingError), err:
				self.trouble('ERROR: postprocessing: %s' % str(err))
				return

	def download(self, url_list):
		"""Download a given list of URLs."""
		# A fixed template can only produce one filename; refuse to
		# write several downloads over the same file.
		if len(url_list) > 1 and self.fixed_template():
			raise SameFileError(self.params['outtmpl'])

		for url in url_list:
			suitable_found = False
			for ie in self._ies:
				# Go to next InfoExtractor if not suitable
				if not ie.suitable(url):
					continue

				# Suitable InfoExtractor found
				suitable_found = True

				# Extract information from URL and process it
				ie.extract(url)

				# Suitable InfoExtractor had been found; go to next URL
				break

			if not suitable_found:
				self.trouble('ERROR: no suitable InfoExtractor: %s' % url)

		return self._download_retcode

	def post_process(self, filename, ie_info):
		"""Run the postprocessing chain on the given file."""
		info = dict(ie_info)
		info['filepath'] = filename
		for pp in self._pps:
			info = pp.run(info)
			# A post processor returning None stops the chain.
			if info is None:
				break

	def _download_with_rtmpdump(self, filename, url, player_url):
		"""Download an RTMP stream by shelling out to the rtmpdump binary."""
		self.report_destination(filename)

		# Check for rtmpdump first
		try:
			subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
		except (OSError, IOError):
			self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
			return False

		# Download using rtmpdump. rtmpdump returns exit code 2 when
		# the connection was interrupted and resuming appears to be
		# possible. This is part of rtmpdump's normal usage, AFAIK.
		# The [[], [...]][condition] construct selects the extra argument
		# list only when the condition holds (a pre-ternary idiom).
		basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', filename]
		retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
		while retval == 2 or retval == 1:
			prevsize = os.path.getsize(filename)
			self.to_stdout(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
			time.sleep(5.0) # This seems to be needed
			retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
			cursize = os.path.getsize(filename)
			# Stop retrying once a failing run makes no progress at all.
			if prevsize == cursize and retval == 1:
				break
		if retval == 0:
			self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename))
			return True
		else:
			self.trouble('\nERROR: rtmpdump exited with code %d' % retval)
			return False

	def _do_download(self, filename, url, player_url):
		"""Download url into filename; returns True on success.

		HTTP downloads support resuming, 503 retries and rate limiting;
		rtmp URLs are delegated to _download_with_rtmpdump(). May raise
		ContentTooShortError or network-level exceptions.
		"""
		# Attempt to download using rtmpdump
		if url.startswith('rtmp'):
			return self._download_with_rtmpdump(filename, url, player_url)

		stream = None
		open_mode = 'wb'
		basic_request = urllib2.Request(url, None, std_headers)
		request = urllib2.Request(url, None, std_headers)

		# Establish possible resume length
		if os.path.isfile(filename):
			resume_len = os.path.getsize(filename)
		else:
			resume_len = 0

		# Request parameters in case of being able to resume
		if self.params.get('continuedl', False) and resume_len != 0:
			self.report_resuming_byte(resume_len)
			request.add_header('Range','bytes=%d-' % resume_len)
			open_mode = 'ab'

		count = 0
		retries = self.params.get('retries', 0)
		while True:
			# Establish connection
			try:
				data = urllib2.urlopen(request)
				break
			except (urllib2.HTTPError, ), err:
				if err.code == 503:
					# Retry in case of HTTP error 503
					count += 1
					if count <= retries:
						self.report_retry(count, retries)
						continue
				if err.code != 416: #  416 is 'Requested range not satisfiable'
					raise
				# Unable to resume: the Range header was rejected, so ask
				# again without it to learn the real content length.
				data = urllib2.urlopen(basic_request)
				content_length = data.info()['Content-Length']

				if content_length is not None and long(content_length) == resume_len:
					# Because the file had already been fully downloaded
					self.report_file_already_downloaded(filename)
					return True
				else:
					# Because the server didn't let us
					self.report_unable_to_resume()
					open_mode = 'wb'

		# data_len stays a string (or None) as returned by the header.
		data_len = data.info().get('Content-length', None)
		data_len_str = self.format_bytes(data_len)
		byte_counter = 0
		block_size = 1024
		start = time.time()
		while True:
			# Download and write
			before = time.time()
			data_block = data.read(block_size)
			after = time.time()
			data_block_len = len(data_block)
			if data_block_len == 0:
				break
			byte_counter += data_block_len

			# Open file just in time: this way the final (possibly
			# sanitized) filename is known and empty files are never left
			# behind for zero-byte responses.
			if stream is None:
				try:
					(stream, filename) = sanitize_open(filename, open_mode)
					self.report_destination(filename)
					self._num_downloads += 1
				except (OSError, IOError), err:
					self.trouble('ERROR: unable to open for writing: %s' % str(err))
					return False
			try:
				stream.write(data_block)
			except (IOError, OSError), err:
				self.trouble('\nERROR: unable to write data: %s' % str(err))
			block_size = self.best_block_size(after - before, data_block_len)

			# Progress message
			percent_str = self.calc_percent(byte_counter, data_len)
			eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
			speed_str = self.calc_speed(start, time.time(), byte_counter)
			self.report_progress(percent_str, data_len_str, speed_str, eta_str)

			# Apply rate limit
			self.slow_down(start, byte_counter)

		self.report_finish()
		# NOTE(review): data_len is still the raw header string here, so the
		# counter is stringified for the comparison; a header like "0123"
		# would compare unequal despite matching numerically.
		if data_len is not None and str(byte_counter) != data_len:
			raise ContentTooShortError(byte_counter, long(data_len))
		return True
607
608 class InfoExtractor(object):
609         """Information Extractor class.
610
611         Information extractors are the classes that, given a URL, extract
612         information from the video (or videos) the URL refers to. This
613         information includes the real video URL, the video title and simplified
614         title, author and others. The information is stored in a dictionary
615         which is then passed to the FileDownloader. The FileDownloader
616         processes this information possibly downloading the video to the file
617         system, among other possible outcomes. The dictionaries must include
618         the following fields:
619
620         id:             Video identifier.
621         url:            Final video URL.
622         uploader:       Nickname of the video uploader.
623         title:          Literal title.
624         stitle:         Simplified title.
625         ext:            Video filename extension.
626         format:         Video format.
627         player_url:     SWF Player URL (may be None).
628
629         The following fields are optional. Their primary purpose is to allow
630         youtube-dl to serve as the backend for a video search function, such
631         as the one in youtube2mp3.  They are only used when their respective
632         forced printing functions are called:
633
634         thumbnail:      Full URL to a video thumbnail image.
635         description:    One-line video description.
636
637         Subclasses of this one should re-define the _real_initialize() and
638         _real_extract() methods, as well as the suitable() static method.
639         Probably, they should also be instantiated and added to the main
640         downloader.
641         """
642
643         _ready = False
644         _downloader = None
645
646         def __init__(self, downloader=None):
647                 """Constructor. Receives an optional downloader."""
648                 self._ready = False
649                 self.set_downloader(downloader)
650
651         @staticmethod
652         def suitable(url):
653                 """Receives a URL and returns True if suitable for this IE."""
654                 return False
655
656         def initialize(self):
657                 """Initializes an instance (authentication, etc)."""
658                 if not self._ready:
659                         self._real_initialize()
660                         self._ready = True
661
662         def extract(self, url):
663                 """Extracts URL information and returns it in list of dicts."""
664                 self.initialize()
665                 return self._real_extract(url)
666
667         def set_downloader(self, downloader):
668                 """Sets the downloader for this IE."""
669                 self._downloader = downloader
670         
	def _real_initialize(self):
		"""Real initialization process (login, cookies, ...). Redefine in
		subclasses; the base implementation does nothing."""
		pass
674
	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses; the base
		implementation does nothing."""
		pass
678
679 class YoutubeIE(InfoExtractor):
680         """Information extractor for youtube.com."""
681
682         _VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?[\?#](?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
683         _LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
684         _LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
685         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
686         _NETRC_MACHINE = 'youtube'
687         # Listed in order of priority for the -b option
688         _available_formats = ['37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13', None]
689         _video_extensions = {
690                 '13': '3gp',
691                 '17': 'mp4',
692                 '18': 'mp4',
693                 '22': 'mp4',
694                 '37': 'mp4',
695                 '43': 'webm',
696                 '45': 'webm',
697         }
698
699         @staticmethod
700         def suitable(url):
701                 return (re.match(YoutubeIE._VALID_URL, url) is not None)
702
703         def report_lang(self):
704                 """Report attempt to set language."""
705                 self._downloader.to_stdout(u'[youtube] Setting language')
706
707         def report_login(self):
708                 """Report attempt to log in."""
709                 self._downloader.to_stdout(u'[youtube] Logging in')
710         
711         def report_age_confirmation(self):
712                 """Report attempt to confirm age."""
713                 self._downloader.to_stdout(u'[youtube] Confirming age')
714         
715         def report_video_webpage_download(self, video_id):
716                 """Report attempt to download video webpage."""
717                 self._downloader.to_stdout(u'[youtube] %s: Downloading video webpage' % video_id)
718         
719         def report_video_info_webpage_download(self, video_id):
720                 """Report attempt to download video info webpage."""
721                 self._downloader.to_stdout(u'[youtube] %s: Downloading video info webpage' % video_id)
722         
723         def report_information_extraction(self, video_id):
724                 """Report attempt to extract video information."""
725                 self._downloader.to_stdout(u'[youtube] %s: Extracting video information' % video_id)
726         
727         def report_unavailable_format(self, video_id, format):
728                 """Report extracted video URL."""
729                 self._downloader.to_stdout(u'[youtube] %s: Format %s not available' % (video_id, format))
730         
731         def report_rtmp_download(self):
732                 """Indicate the download will use the RTMP protocol."""
733                 self._downloader.to_stdout(u'[youtube] RTMP download detected')
734         
735         def _real_initialize(self):
736                 if self._downloader is None:
737                         return
738
739                 username = None
740                 password = None
741                 downloader_params = self._downloader.params
742
743                 # Attempt to use provided username and password or .netrc data
744                 if downloader_params.get('username', None) is not None:
745                         username = downloader_params['username']
746                         password = downloader_params['password']
747                 elif downloader_params.get('usenetrc', False):
748                         try:
749                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
750                                 if info is not None:
751                                         username = info[0]
752                                         password = info[2]
753                                 else:
754                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
755                         except (IOError, netrc.NetrcParseError), err:
756                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
757                                 return
758
759                 # Set language
760                 request = urllib2.Request(self._LANG_URL, None, std_headers)
761                 try:
762                         self.report_lang()
763                         urllib2.urlopen(request).read()
764                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
765                         self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
766                         return
767
768                 # No authentication to be performed
769                 if username is None:
770                         return
771
772                 # Log in
773                 login_form = {
774                                 'current_form': 'loginForm',
775                                 'next':         '/',
776                                 'action_login': 'Log In',
777                                 'username':     username,
778                                 'password':     password,
779                                 }
780                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
781                 try:
782                         self.report_login()
783                         login_results = urllib2.urlopen(request).read()
784                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
785                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
786                                 return
787                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
788                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
789                         return
790         
791                 # Confirm age
792                 age_form = {
793                                 'next_url':             '/',
794                                 'action_confirm':       'Confirm',
795                                 }
796                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
797                 try:
798                         self.report_age_confirmation()
799                         age_results = urllib2.urlopen(request).read()
800                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
801                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
802                         return
803
804         def _real_extract(self, url):
805                 # Extract video id from URL
806                 mobj = re.match(self._VALID_URL, url)
807                 if mobj is None:
808                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
809                         return
810                 video_id = mobj.group(2)
811
812                 # Downloader parameters
813                 best_quality = False
814                 all_formats = False
815                 format_param = None
816                 quality_index = 0
817                 if self._downloader is not None:
818                         params = self._downloader.params
819                         format_param = params.get('format', None)
820                         if format_param == '0':
821                                 format_param = self._available_formats[quality_index]
822                                 best_quality = True
823                         elif format_param == '-1':
824                                 format_param = self._available_formats[quality_index]
825                                 all_formats = True
826
827                 while True:
828                         # Extension
829                         video_extension = self._video_extensions.get(format_param, 'flv')
830
831                         # Get video webpage
832                         self.report_video_webpage_download(video_id)
833                         request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en' % video_id, None, std_headers)
834                         try:
835                                 video_webpage = urllib2.urlopen(request).read()
836                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
837                                 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
838                                 return
839
840                         # Attempt to extract SWF player URL
841                         mobj = re.search(r'swfConfig.*"(http://.*?watch-.*?\.swf)"', video_webpage)
842                         if mobj is not None:
843                                 player_url = mobj.group(1)
844                         else:
845                                 player_url = None
846
847                         # Get video info
848                         self.report_video_info_webpage_download(video_id)
849                         for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
850                                 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
851                                                    % (video_id, el_type))
852                                 request = urllib2.Request(video_info_url, None, std_headers)
853                                 try:
854                                         video_info_webpage = urllib2.urlopen(request).read()
855                                         video_info = parse_qs(video_info_webpage)
856                                         if 'token' in video_info:
857                                                 break
858                                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
859                                         self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
860                                         return
861                         self.report_information_extraction(video_id)
862
863                         # "t" param
864                         if 'token' not in video_info:
865                                 # Attempt to see if YouTube has issued an error message
866                                 if 'reason' not in video_info:
867                                         self._downloader.trouble(u'ERROR: unable to extract "t" parameter for unknown reason')
868                                         stream = open('reportme-ydl-%s.dat' % time.time(), 'wb')
869                                         stream.write(video_info_webpage)
870                                         stream.close()
871                                 else:
872                                         reason = urllib.unquote_plus(video_info['reason'][0])
873                                         self._downloader.trouble(u'ERROR: YouTube said: %s' % reason.decode('utf-8'))
874                                 return
875                         token = urllib.unquote_plus(video_info['token'][0])
876                         video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=detailpage&ps=default&gl=US&hl=en' % (video_id, token)
877                         if format_param is not None:
878                                 video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
879
880                         # Check possible RTMP download
881                         if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
882                                 self.report_rtmp_download()
883                                 video_real_url = video_info['conn'][0]
884
885                         # uploader
886                         if 'author' not in video_info:
887                                 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
888                                 return
889                         video_uploader = urllib.unquote_plus(video_info['author'][0])
890
891                         # title
892                         if 'title' not in video_info:
893                                 self._downloader.trouble(u'ERROR: unable to extract video title')
894                                 return
895                         video_title = urllib.unquote_plus(video_info['title'][0])
896                         video_title = video_title.decode('utf-8')
897                         video_title = sanitize_title(video_title)
898
899                         # simplified title
900                         simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
901                         simple_title = simple_title.strip(ur'_')
902
903                         # thumbnail image
904                         if 'thumbnail_url' not in video_info:
905                                 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
906                                 video_thumbnail = ''
907                         else:   # don't panic if we can't find it
908                                 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
909
910                         # description
911                         video_description = 'No description available.'
912                         if self._downloader.params.get('forcedescription', False):
913                                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
914                                 if mobj is not None:
915                                         video_description = mobj.group(1)
916
917                         try:
918                                 # Process video information
919                                 self._downloader.process_info({
920                                         'id':           video_id.decode('utf-8'),
921                                         'url':          video_real_url.decode('utf-8'),
922                                         'uploader':     video_uploader.decode('utf-8'),
923                                         'title':        video_title,
924                                         'stitle':       simple_title,
925                                         'ext':          video_extension.decode('utf-8'),
926                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
927                                         'thumbnail':    video_thumbnail.decode('utf-8'),
928                                         'description':  video_description.decode('utf-8'),
929                                         'player_url':   player_url,
930                                 })
931
932                                 if all_formats:
933                                         quality_index += 1
934                                         if quality_index == len(self._available_formats):
935                                                 # None left to get
936                                                 return
937                                         else:
938                                                 format_param = self._available_formats[quality_index]
939                                                 continue
940                                 return
941
942                         except UnavailableFormatError, err:
943                                 if best_quality or all_formats:
944                                         quality_index += 1
945                                         if quality_index == len(self._available_formats):
946                                                 # I don't ever expect this to happen
947                                                 if not all_formats:
948                                                         self._downloader.trouble(u'ERROR: no known formats available for video')
949                                                 return
950                                         else:
951                                                 self.report_unavailable_format(video_id, format_param)
952                                                 format_param = self._available_formats[quality_index]
953                                                 continue
954                                 else: 
955                                         self._downloader.trouble('ERROR: format not available for video')
956                                         return
957
958
959 class MetacafeIE(InfoExtractor):
960         """Information Extractor for metacafe.com."""
961
962         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
963         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
964         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
965         _youtube_ie = None
966
967         def __init__(self, youtube_ie, downloader=None):
968                 InfoExtractor.__init__(self, downloader)
969                 self._youtube_ie = youtube_ie
970
971         @staticmethod
972         def suitable(url):
973                 return (re.match(MetacafeIE._VALID_URL, url) is not None)
974
975         def report_disclaimer(self):
976                 """Report disclaimer retrieval."""
977                 self._downloader.to_stdout(u'[metacafe] Retrieving disclaimer')
978
979         def report_age_confirmation(self):
980                 """Report attempt to confirm age."""
981                 self._downloader.to_stdout(u'[metacafe] Confirming age')
982         
983         def report_download_webpage(self, video_id):
984                 """Report webpage download."""
985                 self._downloader.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)
986         
987         def report_extraction(self, video_id):
988                 """Report information extraction."""
989                 self._downloader.to_stdout(u'[metacafe] %s: Extracting information' % video_id)
990
991         def _real_initialize(self):
992                 # Retrieve disclaimer
993                 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
994                 try:
995                         self.report_disclaimer()
996                         disclaimer = urllib2.urlopen(request).read()
997                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
998                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
999                         return
1000
1001                 # Confirm age
1002                 disclaimer_form = {
1003                         'filters': '0',
1004                         'submit': "Continue - I'm over 18",
1005                         }
1006                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
1007                 try:
1008                         self.report_age_confirmation()
1009                         disclaimer = urllib2.urlopen(request).read()
1010                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1011                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1012                         return
1013         
1014         def _real_extract(self, url):
1015                 # Extract id and simplified title from URL
1016                 mobj = re.match(self._VALID_URL, url)
1017                 if mobj is None:
1018                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1019                         return
1020
1021                 video_id = mobj.group(1)
1022
1023                 # Check if video comes from YouTube
1024                 mobj2 = re.match(r'^yt-(.*)$', video_id)
1025                 if mobj2 is not None:
1026                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1027                         return
1028
1029                 simple_title = mobj.group(2).decode('utf-8')
1030                 video_extension = 'flv'
1031
1032                 # Retrieve video webpage to extract further information
1033                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1034                 try:
1035                         self.report_download_webpage(video_id)
1036                         webpage = urllib2.urlopen(request).read()
1037                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1038                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1039                         return
1040
1041                 # Extract URL, uploader and title from webpage
1042                 self.report_extraction(video_id)
1043                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1044                 if mobj is None:
1045                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1046                         return
1047                 mediaURL = urllib.unquote(mobj.group(1))
1048
1049                 #mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1050                 #if mobj is None:
1051                 #       self._downloader.trouble(u'ERROR: unable to extract gdaKey')
1052                 #       return
1053                 #gdaKey = mobj.group(1)
1054                 #
1055                 #video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1056
1057                 video_url = mediaURL
1058
1059                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1060                 if mobj is None:
1061                         self._downloader.trouble(u'ERROR: unable to extract title')
1062                         return
1063                 video_title = mobj.group(1).decode('utf-8')
1064                 video_title = sanitize_title(video_title)
1065
1066                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1067                 if mobj is None:
1068                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1069                         return
1070                 video_uploader = mobj.group(1)
1071
1072                 try:
1073                         # Process video information
1074                         self._downloader.process_info({
1075                                 'id':           video_id.decode('utf-8'),
1076                                 'url':          video_url.decode('utf-8'),
1077                                 'uploader':     video_uploader.decode('utf-8'),
1078                                 'title':        video_title,
1079                                 'stitle':       simple_title,
1080                                 'ext':          video_extension.decode('utf-8'),
1081                                 'format':       u'NA',
1082                                 'player_url':   None,
1083                         })
1084                 except UnavailableFormatError:
1085                         self._downloader.trouble(u'ERROR: format not available for video')
1086
1087
1088 class DailymotionIE(InfoExtractor):
1089         """Information Extractor for Dailymotion"""
1090
1091         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1092         _DISCLAIMER = ''
1093         _FILTER_POST = ''
1094
1095         def __init__(self, downloader=None):
1096                 InfoExtractor.__init__(self, downloader)
1097
1098         @staticmethod
1099         def suitable(url):
1100                 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1101
1102         def report_disclaimer(self):
1103                 """Report disclaimer retrieval."""
1104                 self._downloader.to_stdout(u'[dailymotion] Retrieving disclaimer')
1105
1106         def report_age_confirmation(self):
1107                 """Report attempt to confirm age."""
1108                 self._downloader.to_stdout(u'[dailymotion] Confirming age')
1109         
1110         def report_download_webpage(self, video_id):
1111                 """Report webpage download."""
1112                 self._downloader.to_stdout(u'[dailymotion] %s: Downloading webpage' % video_id)
1113         
1114         def report_extraction(self, video_id):
1115                 """Report information extraction."""
1116                 self._downloader.to_stdout(u'[dailymotion] %s: Extracting information' % video_id)
1117
1118         def _real_initialize(self):
1119                 return
1120
1121                 # Retrieve disclaimer
1122                 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
1123                 try:
1124                         self.report_disclaimer()
1125                         disclaimer = urllib2.urlopen(request).read()
1126                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1127                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1128                         return
1129
1130                 # Confirm age
1131                 disclaimer_form = {
1132                         'filters': '0',
1133                         'submit': "Continue - I'm over 18",
1134                         }
1135                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
1136                 try:
1137                         self.report_age_confirmation()
1138                         disclaimer = urllib2.urlopen(request).read()
1139                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1140                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1141                         return
1142         
1143         def _real_extract(self, url):
1144                 # Extract id and simplified title from URL
1145                 mobj = re.match(self._VALID_URL, url)
1146                 if mobj is None:
1147                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1148                         return
1149
1150                 video_id = mobj.group(1)
1151
1152                 simple_title = mobj.group(2).decode('utf-8')
1153                 video_extension = 'flv'
1154
1155                 # Retrieve video webpage to extract further information
1156                 request = urllib2.Request(url)
1157                 try:
1158                         self.report_download_webpage(video_id)
1159                         webpage = urllib2.urlopen(request).read()
1160                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1161                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1162                         return
1163
1164                 # Extract URL, uploader and title from webpage
1165                 self.report_extraction(video_id)
1166                 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1167                 if mobj is None:
1168                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1169                         return
1170                 mediaURL = urllib.unquote(mobj.group(1))
1171
1172                 # if needed add http://www.dailymotion.com/ if relative URL
1173
1174                 video_url = mediaURL
1175
1176                 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1177                 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1178                 if mobj is None:
1179                         self._downloader.trouble(u'ERROR: unable to extract title')
1180                         return
1181                 video_title = mobj.group(1).decode('utf-8')
1182                 video_title = sanitize_title(video_title)
1183
1184                 mobj = re.search(r'(?im)<div class="dmco_html owner"><a class="name" href="/.+">(.+?)</a></div>', webpage)
1185                 if mobj is None:
1186                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1187                         return
1188                 video_uploader = mobj.group(1)
1189
1190                 try:
1191                         # Process video information
1192                         self._downloader.process_info({
1193                                 'id':           video_id.decode('utf-8'),
1194                                 'url':          video_url.decode('utf-8'),
1195                                 'uploader':     video_uploader.decode('utf-8'),
1196                                 'title':        video_title,
1197                                 'stitle':       simple_title,
1198                                 'ext':          video_extension.decode('utf-8'),
1199                                 'format':       u'NA',
1200                                 'player_url':   None,
1201                         })
1202                 except UnavailableFormatError:
1203                         self._downloader.trouble(u'ERROR: format not available for video')
1204
1205 class GoogleIE(InfoExtractor):
1206         """Information extractor for video.google.com."""
1207
	# docid-style Google Video URLs across the various national domains.
	_VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1209
1210         def __init__(self, downloader=None):
1211                 InfoExtractor.__init__(self, downloader)
1212
1213         @staticmethod
1214         def suitable(url):
1215                 return (re.match(GoogleIE._VALID_URL, url) is not None)
1216
1217         def report_download_webpage(self, video_id):
1218                 """Report webpage download."""
1219                 self._downloader.to_stdout(u'[video.google] %s: Downloading webpage' % video_id)
1220
1221         def report_extraction(self, video_id):
1222                 """Report information extraction."""
1223                 self._downloader.to_stdout(u'[video.google] %s: Extracting information' % video_id)
1224
	def _real_initialize(self):
		# Google Video requires no login or age confirmation.
		return
1227
1228         def _real_extract(self, url):
1229                 # Extract id from URL
1230                 mobj = re.match(self._VALID_URL, url)
1231                 if mobj is None:
1232                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1233                         return
1234
1235                 video_id = mobj.group(1)
1236
1237                 video_extension = 'mp4'
1238
1239                 # Retrieve video webpage to extract further information
1240                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1241                 try:
1242                         self.report_download_webpage(video_id)
1243                         webpage = urllib2.urlopen(request).read()
1244                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1245                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1246                         return
1247
1248                 # Extract URL, uploader, and title from webpage
1249                 self.report_extraction(video_id)
1250                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1251                 if mobj is None:
1252                         video_extension = 'flv'
1253                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1254                 if mobj is None:
1255                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1256                         return
1257                 mediaURL = urllib.unquote(mobj.group(1))
1258                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1259                 mediaURL = mediaURL.replace('\\x26', '\x26')
1260
1261                 video_url = mediaURL
1262
1263                 mobj = re.search(r'<title>(.*)</title>', webpage)
1264                 if mobj is None:
1265                         self._downloader.trouble(u'ERROR: unable to extract title')
1266                         return
1267                 video_title = mobj.group(1).decode('utf-8')
1268                 video_title = sanitize_title(video_title)
1269                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1270
1271                 # Extract video description
1272                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1273                 if mobj is None:
1274                         self._downloader.trouble(u'ERROR: unable to extract video description')
1275                         return
1276                 video_description = mobj.group(1).decode('utf-8')
1277                 if not video_description:
1278                         video_description = 'No description available.'
1279
1280                 # Extract video thumbnail
1281                 if self._downloader.params.get('forcethumbnail', False):
1282                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1283                         try:
1284                                 webpage = urllib2.urlopen(request).read()
1285                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1286                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1287                                 return
1288                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1289                         if mobj is None:
1290                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1291                                 return
1292                         video_thumbnail = mobj.group(1)
1293                 else:   # we need something to pass to process_info
1294                         video_thumbnail = ''
1295
1296
1297                 try:
1298                         # Process video information
1299                         self._downloader.process_info({
1300                                 'id':           video_id.decode('utf-8'),
1301                                 'url':          video_url.decode('utf-8'),
1302                                 'uploader':     u'NA',
1303                                 'title':        video_title,
1304                                 'stitle':       simple_title,
1305                                 'ext':          video_extension.decode('utf-8'),
1306                                 'format':       u'NA',
1307                                 'player_url':   None,
1308                         })
1309                 except UnavailableFormatError:
1310                         self._downloader.trouble(u'ERROR: format not available for video')
1311
1312
1313 class PhotobucketIE(InfoExtractor):
1314         """Information extractor for photobucket.com."""
1315
1316         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1317
1318         def __init__(self, downloader=None):
1319                 InfoExtractor.__init__(self, downloader)
1320
1321         @staticmethod
1322         def suitable(url):
1323                 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1324
1325         def report_download_webpage(self, video_id):
1326                 """Report webpage download."""
1327                 self._downloader.to_stdout(u'[photobucket] %s: Downloading webpage' % video_id)
1328
1329         def report_extraction(self, video_id):
1330                 """Report information extraction."""
1331                 self._downloader.to_stdout(u'[photobucket] %s: Extracting information' % video_id)
1332
1333         def _real_initialize(self):
1334                 return
1335
1336         def _real_extract(self, url):
1337                 # Extract id from URL
1338                 mobj = re.match(self._VALID_URL, url)
1339                 if mobj is None:
1340                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1341                         return
1342
1343                 video_id = mobj.group(1)
1344
1345                 video_extension = 'flv'
1346
1347                 # Retrieve video webpage to extract further information
1348                 request = urllib2.Request(url)
1349                 try:
1350                         self.report_download_webpage(video_id)
1351                         webpage = urllib2.urlopen(request).read()
1352                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1353                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1354                         return
1355
1356                 # Extract URL, uploader, and title from webpage
1357                 self.report_extraction(video_id)
1358                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1359                 if mobj is None:
1360                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1361                         return
1362                 mediaURL = urllib.unquote(mobj.group(1))
1363
1364                 video_url = mediaURL
1365
1366                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1367                 if mobj is None:
1368                         self._downloader.trouble(u'ERROR: unable to extract title')
1369                         return
1370                 video_title = mobj.group(1).decode('utf-8')
1371                 video_title = sanitize_title(video_title)
1372                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1373
1374                 video_uploader = mobj.group(2).decode('utf-8')
1375
1376                 try:
1377                         # Process video information
1378                         self._downloader.process_info({
1379                                 'id':           video_id.decode('utf-8'),
1380                                 'url':          video_url.decode('utf-8'),
1381                                 'uploader':     video_uploader,
1382                                 'title':        video_title,
1383                                 'stitle':       simple_title,
1384                                 'ext':          video_extension.decode('utf-8'),
1385                                 'format':       u'NA',
1386                                 'player_url':   None,
1387                         })
1388                 except UnavailableFormatError:
1389                         self._downloader.trouble(u'ERROR: format not available for video')
1390
1391
1392 class YahooIE(InfoExtractor):
1393         """Information extractor for video.yahoo.com."""
1394
1395         # _VALID_URL matches all Yahoo! Video URLs
1396         # _VPAGE_URL matches only the extractable '/watch/' URLs
1397         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1398         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1399
1400         def __init__(self, downloader=None):
1401                 InfoExtractor.__init__(self, downloader)
1402
1403         @staticmethod
1404         def suitable(url):
1405                 return (re.match(YahooIE._VALID_URL, url) is not None)
1406
1407         def report_download_webpage(self, video_id):
1408                 """Report webpage download."""
1409                 self._downloader.to_stdout(u'[video.yahoo] %s: Downloading webpage' % video_id)
1410
1411         def report_extraction(self, video_id):
1412                 """Report information extraction."""
1413                 self._downloader.to_stdout(u'[video.yahoo] %s: Extracting information' % video_id)
1414
1415         def _real_initialize(self):
1416                 return
1417
1418         def _real_extract(self, url):
1419                 # Extract ID from URL
1420                 mobj = re.match(self._VALID_URL, url)
1421                 if mobj is None:
1422                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1423                         return
1424
1425                 video_id = mobj.group(2)
1426                 video_extension = 'flv'
1427
1428                 # Rewrite valid but non-extractable URLs as
1429                 # extractable English language /watch/ URLs
1430                 if re.match(self._VPAGE_URL, url) is None:
1431                         request = urllib2.Request(url)
1432                         try:
1433                                 webpage = urllib2.urlopen(request).read()
1434                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1435                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1436                                 return
1437
1438                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1439                         if mobj is None:
1440                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1441                                 return
1442                         yahoo_id = mobj.group(1)
1443
1444                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1445                         if mobj is None:
1446                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1447                                 return
1448                         yahoo_vid = mobj.group(1)
1449
1450                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1451                         return self._real_extract(url)
1452
1453                 # Retrieve video webpage to extract further information
1454                 request = urllib2.Request(url)
1455                 try:
1456                         self.report_download_webpage(video_id)
1457                         webpage = urllib2.urlopen(request).read()
1458                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1459                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1460                         return
1461
1462                 # Extract uploader and title from webpage
1463                 self.report_extraction(video_id)
1464                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1465                 if mobj is None:
1466                         self._downloader.trouble(u'ERROR: unable to extract video title')
1467                         return
1468                 video_title = mobj.group(1).decode('utf-8')
1469                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1470
1471                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1472                 if mobj is None:
1473                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1474                         return
1475                 video_uploader = mobj.group(1).decode('utf-8')
1476
1477                 # Extract video thumbnail
1478                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1479                 if mobj is None:
1480                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1481                         return
1482                 video_thumbnail = mobj.group(1).decode('utf-8')
1483
1484                 # Extract video description
1485                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1486                 if mobj is None:
1487                         self._downloader.trouble(u'ERROR: unable to extract video description')
1488                         return
1489                 video_description = mobj.group(1).decode('utf-8')
1490                 if not video_description: video_description = 'No description available.'
1491
1492                 # Extract video height and width
1493                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1494                 if mobj is None:
1495                         self._downloader.trouble(u'ERROR: unable to extract video height')
1496                         return
1497                 yv_video_height = mobj.group(1)
1498
1499                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1500                 if mobj is None:
1501                         self._downloader.trouble(u'ERROR: unable to extract video width')
1502                         return
1503                 yv_video_width = mobj.group(1)
1504
1505                 # Retrieve video playlist to extract media URL
1506                 # I'm not completely sure what all these options are, but we
1507                 # seem to need most of them, otherwise the server sends a 401.
1508                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1509                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1510                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1511                                           '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1512                                           '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1513                 try:
1514                         self.report_download_webpage(video_id)
1515                         webpage = urllib2.urlopen(request).read()
1516                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1517                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1518                         return
1519
1520                 # Extract media URL from playlist XML
1521                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1522                 if mobj is None:
1523                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1524                         return
1525                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1526                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1527
1528                 try:
1529                         # Process video information
1530                         self._downloader.process_info({
1531                                 'id':           video_id.decode('utf-8'),
1532                                 'url':          video_url,
1533                                 'uploader':     video_uploader,
1534                                 'title':        video_title,
1535                                 'stitle':       simple_title,
1536                                 'ext':          video_extension.decode('utf-8'),
1537                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1538                                 'description':  video_description,
1539                                 'thumbnail':    video_thumbnail,
1540                                 'description':  video_description,
1541                                 'player_url':   None,
1542                         })
1543                 except UnavailableFormatError:
1544                         self._downloader.trouble(u'ERROR: format not available for video')
1545
1546
1547 class GenericIE(InfoExtractor):
1548         """Generic last-resort information extractor."""
1549
1550         def __init__(self, downloader=None):
1551                 InfoExtractor.__init__(self, downloader)
1552
1553         @staticmethod
1554         def suitable(url):
1555                 return True
1556
1557         def report_download_webpage(self, video_id):
1558                 """Report webpage download."""
1559                 self._downloader.to_stdout(u'WARNING: Falling back on generic information extractor.')
1560                 self._downloader.to_stdout(u'[generic] %s: Downloading webpage' % video_id)
1561
1562         def report_extraction(self, video_id):
1563                 """Report information extraction."""
1564                 self._downloader.to_stdout(u'[generic] %s: Extracting information' % video_id)
1565
1566         def _real_initialize(self):
1567                 return
1568
1569         def _real_extract(self, url):
1570                 video_id = url.split('/')[-1]
1571                 request = urllib2.Request(url)
1572                 try:
1573                         self.report_download_webpage(video_id)
1574                         webpage = urllib2.urlopen(request).read()
1575                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1576                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1577                         return
1578                 except ValueError, err:
1579                         # since this is the last-resort InfoExtractor, if
1580                         # this error is thrown, it'll be thrown here
1581                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1582                         return
1583
1584                 # Start with something easy: JW Player in SWFObject
1585                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1586                 if mobj is None:
1587                         # Broaden the search a little bit
1588                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1589                 if mobj is None:
1590                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1591                         return
1592
1593                 # It's possible that one of the regexes
1594                 # matched, but returned an empty group:
1595                 if mobj.group(1) is None:
1596                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1597                         return
1598
1599                 video_url = urllib.unquote(mobj.group(1))
1600                 video_id  = os.path.basename(video_url)
1601
1602                 # here's a fun little line of code for you:
1603                 video_extension = os.path.splitext(video_id)[1][1:]
1604                 video_id        = os.path.splitext(video_id)[0]
1605
1606                 # it's tempting to parse this further, but you would
1607                 # have to take into account all the variations like
1608                 #   Video Title - Site Name
1609                 #   Site Name | Video Title
1610                 #   Video Title - Tagline | Site Name
1611                 # and so on and so forth; it's just not practical
1612                 mobj = re.search(r'<title>(.*)</title>', webpage)
1613                 if mobj is None:
1614                         self._downloader.trouble(u'ERROR: unable to extract title')
1615                         return
1616                 video_title = mobj.group(1).decode('utf-8')
1617                 video_title = sanitize_title(video_title)
1618                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1619
1620                 # video uploader is domain name
1621                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1622                 if mobj is None:
1623                         self._downloader.trouble(u'ERROR: unable to extract title')
1624                         return
1625                 video_uploader = mobj.group(1).decode('utf-8')
1626
1627                 try:
1628                         # Process video information
1629                         self._downloader.process_info({
1630                                 'id':           video_id.decode('utf-8'),
1631                                 'url':          video_url.decode('utf-8'),
1632                                 'uploader':     video_uploader,
1633                                 'title':        video_title,
1634                                 'stitle':       simple_title,
1635                                 'ext':          video_extension.decode('utf-8'),
1636                                 'format':       u'NA',
1637                                 'player_url':   None,
1638                         })
1639                 except UnavailableFormatError:
1640                         self._downloader.trouble(u'ERROR: format not available for video')
1641
1642
1643 class YoutubeSearchIE(InfoExtractor):
1644         """Information Extractor for YouTube search queries."""
1645         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1646         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1647         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1648         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1649         _youtube_ie = None
1650         _max_youtube_results = 1000
1651
1652         def __init__(self, youtube_ie, downloader=None):
1653                 InfoExtractor.__init__(self, downloader)
1654                 self._youtube_ie = youtube_ie
1655         
1656         @staticmethod
1657         def suitable(url):
1658                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1659
1660         def report_download_page(self, query, pagenum):
1661                 """Report attempt to download playlist page with given number."""
1662                 query = query.decode(preferredencoding())
1663                 self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1664
1665         def _real_initialize(self):
1666                 self._youtube_ie.initialize()
1667         
1668         def _real_extract(self, query):
1669                 mobj = re.match(self._VALID_QUERY, query)
1670                 if mobj is None:
1671                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1672                         return
1673
1674                 prefix, query = query.split(':')
1675                 prefix = prefix[8:]
1676                 query  = query.encode('utf-8')
1677                 if prefix == '':
1678                         self._download_n_results(query, 1)
1679                         return
1680                 elif prefix == 'all':
1681                         self._download_n_results(query, self._max_youtube_results)
1682                         return
1683                 else:
1684                         try:
1685                                 n = long(prefix)
1686                                 if n <= 0:
1687                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1688                                         return
1689                                 elif n > self._max_youtube_results:
1690                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
1691                                         n = self._max_youtube_results
1692                                 self._download_n_results(query, n)
1693                                 return
1694                         except ValueError: # parsing prefix as integer fails
1695                                 self._download_n_results(query, 1)
1696                                 return
1697
1698         def _download_n_results(self, query, n):
1699                 """Downloads a specified number of results for a query"""
1700
1701                 video_ids = []
1702                 already_seen = set()
1703                 pagenum = 1
1704
1705                 while True:
1706                         self.report_download_page(query, pagenum)
1707                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1708                         request = urllib2.Request(result_url, None, std_headers)
1709                         try:
1710                                 page = urllib2.urlopen(request).read()
1711                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1712                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1713                                 return
1714
1715                         # Extract video identifiers
1716                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1717                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1718                                 if video_id not in already_seen:
1719                                         video_ids.append(video_id)
1720                                         already_seen.add(video_id)
1721                                         if len(video_ids) == n:
1722                                                 # Specified n videos reached
1723                                                 for id in video_ids:
1724                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1725                                                 return
1726
1727                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1728                                 for id in video_ids:
1729                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1730                                 return
1731
1732                         pagenum = pagenum + 1
1733
1734 class GoogleSearchIE(InfoExtractor):
1735         """Information Extractor for Google Video search queries."""
1736         _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
1737         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1738         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
1739         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
1740         _google_ie = None
1741         _max_google_results = 1000
1742
1743         def __init__(self, google_ie, downloader=None):
1744                 InfoExtractor.__init__(self, downloader)
1745                 self._google_ie = google_ie
1746         
1747         @staticmethod
1748         def suitable(url):
1749                 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
1750
1751         def report_download_page(self, query, pagenum):
1752                 """Report attempt to download playlist page with given number."""
1753                 query = query.decode(preferredencoding())
1754                 self._downloader.to_stdout(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1755
1756         def _real_initialize(self):
1757                 self._google_ie.initialize()
1758         
1759         def _real_extract(self, query):
1760                 mobj = re.match(self._VALID_QUERY, query)
1761                 if mobj is None:
1762                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1763                         return
1764
1765                 prefix, query = query.split(':')
1766                 prefix = prefix[8:]
1767                 query  = query.encode('utf-8')
1768                 if prefix == '':
1769                         self._download_n_results(query, 1)
1770                         return
1771                 elif prefix == 'all':
1772                         self._download_n_results(query, self._max_google_results)
1773                         return
1774                 else:
1775                         try:
1776                                 n = long(prefix)
1777                                 if n <= 0:
1778                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1779                                         return
1780                                 elif n > self._max_google_results:
1781                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)'  % (self._max_google_results, n))
1782                                         n = self._max_google_results
1783                                 self._download_n_results(query, n)
1784                                 return
1785                         except ValueError: # parsing prefix as integer fails
1786                                 self._download_n_results(query, 1)
1787                                 return
1788
1789         def _download_n_results(self, query, n):
1790                 """Downloads a specified number of results for a query"""
1791
1792                 video_ids = []
1793                 already_seen = set()
1794                 pagenum = 1
1795
1796                 while True:
1797                         self.report_download_page(query, pagenum)
1798                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1799                         request = urllib2.Request(result_url, None, std_headers)
1800                         try:
1801                                 page = urllib2.urlopen(request).read()
1802                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1803                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1804                                 return
1805
1806                         # Extract video identifiers
1807                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1808                                 video_id = mobj.group(1)
1809                                 if video_id not in already_seen:
1810                                         video_ids.append(video_id)
1811                                         already_seen.add(video_id)
1812                                         if len(video_ids) == n:
1813                                                 # Specified n videos reached
1814                                                 for id in video_ids:
1815                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1816                                                 return
1817
1818                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1819                                 for id in video_ids:
1820                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1821                                 return
1822
1823                         pagenum = pagenum + 1
1824
1825 class YahooSearchIE(InfoExtractor):
1826         """Information Extractor for Yahoo! Video search queries."""
1827         _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
1828         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1829         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1830         _MORE_PAGES_INDICATOR = r'\s*Next'
1831         _yahoo_ie = None
1832         _max_yahoo_results = 1000
1833
1834         def __init__(self, yahoo_ie, downloader=None):
1835                 InfoExtractor.__init__(self, downloader)
1836                 self._yahoo_ie = yahoo_ie
1837         
1838         @staticmethod
1839         def suitable(url):
1840                 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
1841
1842         def report_download_page(self, query, pagenum):
1843                 """Report attempt to download playlist page with given number."""
1844                 query = query.decode(preferredencoding())
1845                 self._downloader.to_stdout(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1846
1847         def _real_initialize(self):
1848                 self._yahoo_ie.initialize()
1849         
1850         def _real_extract(self, query):
1851                 mobj = re.match(self._VALID_QUERY, query)
1852                 if mobj is None:
1853                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1854                         return
1855
1856                 prefix, query = query.split(':')
1857                 prefix = prefix[8:]
1858                 query  = query.encode('utf-8')
1859                 if prefix == '':
1860                         self._download_n_results(query, 1)
1861                         return
1862                 elif prefix == 'all':
1863                         self._download_n_results(query, self._max_yahoo_results)
1864                         return
1865                 else:
1866                         try:
1867                                 n = long(prefix)
1868                                 if n <= 0:
1869                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1870                                         return
1871                                 elif n > self._max_yahoo_results:
1872                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)'  % (self._max_yahoo_results, n))
1873                                         n = self._max_yahoo_results
1874                                 self._download_n_results(query, n)
1875                                 return
1876                         except ValueError: # parsing prefix as integer fails
1877                                 self._download_n_results(query, 1)
1878                                 return
1879
1880         def _download_n_results(self, query, n):
1881                 """Downloads a specified number of results for a query"""
1882
1883                 video_ids = []
1884                 already_seen = set()
1885                 pagenum = 1
1886
1887                 while True:
1888                         self.report_download_page(query, pagenum)
1889                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1890                         request = urllib2.Request(result_url, None, std_headers)
1891                         try:
1892                                 page = urllib2.urlopen(request).read()
1893                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1894                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1895                                 return
1896
1897                         # Extract video identifiers
1898                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1899                                 video_id = mobj.group(1)
1900                                 if video_id not in already_seen:
1901                                         video_ids.append(video_id)
1902                                         already_seen.add(video_id)
1903                                         if len(video_ids) == n:
1904                                                 # Specified n videos reached
1905                                                 for id in video_ids:
1906                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1907                                                 return
1908
1909                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1910                                 for id in video_ids:
1911                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1912                                 return
1913
1914                         pagenum = pagenum + 1
1915
1916 class YoutubePlaylistIE(InfoExtractor):
1917         """Information Extractor for YouTube playlists."""
1918
1919         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/)([^&]+).*'
1920         _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
1921         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
1922         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1923         _youtube_ie = None
1924
1925         def __init__(self, youtube_ie, downloader=None):
1926                 InfoExtractor.__init__(self, downloader)
1927                 self._youtube_ie = youtube_ie
1928         
1929         @staticmethod
1930         def suitable(url):
1931                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
1932
1933         def report_download_page(self, playlist_id, pagenum):
1934                 """Report attempt to download playlist page with given number."""
1935                 self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1936
1937         def _real_initialize(self):
1938                 self._youtube_ie.initialize()
1939         
1940         def _real_extract(self, url):
1941                 # Extract playlist id
1942                 mobj = re.match(self._VALID_URL, url)
1943                 if mobj is None:
1944                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1945                         return
1946
1947                 # Download playlist pages
1948                 playlist_id = mobj.group(1)
1949                 video_ids = []
1950                 pagenum = 1
1951
1952                 while True:
1953                         self.report_download_page(playlist_id, pagenum)
1954                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
1955                         try:
1956                                 page = urllib2.urlopen(request).read()
1957                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1958                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1959                                 return
1960
1961                         # Extract video identifiers
1962                         ids_in_page = []
1963                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1964                                 if mobj.group(1) not in ids_in_page:
1965                                         ids_in_page.append(mobj.group(1))
1966                         video_ids.extend(ids_in_page)
1967
1968                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1969                                 break
1970                         pagenum = pagenum + 1
1971
1972                 for id in video_ids:
1973                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1974                 return
1975
1976 class YoutubeUserIE(InfoExtractor):
1977         """Information Extractor for YouTube users."""
1978
1979         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
1980         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1981         _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
1982         _youtube_ie = None
1983
1984         def __init__(self, youtube_ie, downloader=None):
1985                 InfoExtractor.__init__(self, downloader)
1986                 self._youtube_ie = youtube_ie
1987         
1988         @staticmethod
1989         def suitable(url):
1990                 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
1991
1992         def report_download_page(self, username):
1993                 """Report attempt to download user page."""
1994                 self._downloader.to_stdout(u'[youtube] user %s: Downloading page ' % (username))
1995
1996         def _real_initialize(self):
1997                 self._youtube_ie.initialize()
1998         
1999         def _real_extract(self, url):
2000                 # Extract username
2001                 mobj = re.match(self._VALID_URL, url)
2002                 if mobj is None:
2003                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2004                         return
2005
2006                 # Download user page
2007                 username = mobj.group(1)
2008                 video_ids = []
2009                 pagenum = 1
2010
2011                 self.report_download_page(username)
2012                 request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
2013                 try:
2014                         page = urllib2.urlopen(request).read()
2015                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2016                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2017                         return
2018
2019                 # Extract video identifiers
2020                 ids_in_page = []
2021
2022                 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2023                         if mobj.group(1) not in ids_in_page:
2024                                 ids_in_page.append(mobj.group(1))
2025                 video_ids.extend(ids_in_page)
2026
2027                 for id in video_ids:
2028                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2029                 return
2030
class PostProcessor(object):
	"""Base class for objects that transform downloaded files.

	Instances are registered on a downloader through its
	add_post_processor() method. After every successful download the
	downloader calls run() on each registered processor in turn, feeding
	the first one an initial information dictionary and each subsequent
	one whatever the previous processor returned.

	Returning None from run() halts the chain; reaching the last
	processor ends it normally.

	Like InfoExtractor objects, PostProcessors take part in a "mutual
	registration" scheme with their downloader.
	"""

	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Attach this post processor to the given downloader."""
		self._downloader = downloader

	def run(self, information):
		"""Process one downloaded file.

		"information" is a dictionary shaped like the ones produced by
		InfoExtractors, extended with a "filepath" key that names the
		downloaded file.

		Returning None stops the postprocessing chain; returning a
		dictionary (possibly the received one with some fields changed)
		passes it to the next processor in the chain.

		A PostProcessingError may also be raised; the downloader that
		invoked this method takes it into account.
		"""
		return information # by default, do nothing
2076         
### MAIN PROGRAM ###
if __name__ == '__main__':
	try:
		# Modules needed only when running the main program
		import getpass
		import optparse

		# Function to update the program file with the latest version from bitbucket.org
		def update_self(downloader, filename):
			"""Overwrite `filename` in place with the latest stable release fetched from bitbucket."""
			# Note: downloader only used for options
			if not os.access (filename, os.W_OK):
				sys.exit('ERROR: no write permissions on %s' % filename)

			downloader.to_stdout('Updating to latest stable version...')
			# LATEST_VERSION names the tag whose raw script is then downloaded
			latest_url = 'http://bitbucket.org/rg3/youtube-dl/raw/tip/LATEST_VERSION'
			latest_version = urllib.urlopen(latest_url).read().strip()
			prog_url = 'http://bitbucket.org/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
			newcontent = urllib.urlopen(prog_url).read()
			stream = open(filename, 'w')
			stream.write(newcontent)
			stream.close()
			downloader.to_stdout('Updated to version %s' % latest_version)

		# General configuration
		# NOTE(review): the second install_opener() replaces the first; this only
		# works because build_opener() includes a ProxyHandler by default -- confirm.
		urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
		urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
		socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

		# Parse command line
		parser = optparse.OptionParser(
			usage='Usage: %prog [options] url...',
			version='2010.06.06',
			conflict_handler='resolve',
		)

		parser.add_option('-h', '--help',
				action='help', help='print this help text and exit')
		parser.add_option('-v', '--version',
				action='version', help='print program version and exit')
		parser.add_option('-U', '--update',
				action='store_true', dest='update_self', help='update this program to latest stable version')
		parser.add_option('-i', '--ignore-errors',
				action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
		parser.add_option('-r', '--rate-limit',
				dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
		parser.add_option('-R', '--retries',
				dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)

		authentication = optparse.OptionGroup(parser, 'Authentication Options')
		authentication.add_option('-u', '--username',
				dest='username', metavar='USERNAME', help='account username')
		authentication.add_option('-p', '--password',
				dest='password', metavar='PASSWORD', help='account password')
		authentication.add_option('-n', '--netrc',
				action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
		parser.add_option_group(authentication)

		video_format = optparse.OptionGroup(parser, 'Video Format Options')
		video_format.add_option('-f', '--format',
				action='store', dest='format', metavar='FORMAT', help='video format code')
		video_format.add_option('-b', '--best-quality',
				action='store_const', dest='format', help='download the best quality video possible', const='0')
		video_format.add_option('-m', '--mobile-version',
				action='store_const', dest='format', help='alias for -f 17', const='17')
		video_format.add_option('-d', '--high-def',
				action='store_const', dest='format', help='alias for -f 22', const='22')
		video_format.add_option('--all-formats',
				action='store_const', dest='format', help='download all available video formats', const='-1')
		parser.add_option_group(video_format)

		verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
		verbosity.add_option('-q', '--quiet',
				action='store_true', dest='quiet', help='activates quiet mode', default=False)
		verbosity.add_option('-s', '--simulate',
				action='store_true', dest='simulate', help='do not download video', default=False)
		verbosity.add_option('-g', '--get-url',
				action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
		verbosity.add_option('-e', '--get-title',
				action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
		verbosity.add_option('--get-thumbnail',
				action='store_true', dest='getthumbnail', help='simulate, quiet but print thumbnail URL', default=False)
		verbosity.add_option('--get-description',
				action='store_true', dest='getdescription', help='simulate, quiet but print video description', default=False)
		verbosity.add_option('--no-progress',
				action='store_true', dest='noprogress', help='do not print progress bar', default=False)
		parser.add_option_group(verbosity)

		filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
		filesystem.add_option('-t', '--title',
				action='store_true', dest='usetitle', help='use title in file name', default=False)
		filesystem.add_option('-l', '--literal',
				action='store_true', dest='useliteral', help='use literal title in file name', default=False)
		filesystem.add_option('-o', '--output',
				dest='outtmpl', metavar='TEMPLATE', help='output filename template')
		filesystem.add_option('-a', '--batch-file',
				dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
		filesystem.add_option('-w', '--no-overwrites',
				action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
		filesystem.add_option('-c', '--continue',
				action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
		parser.add_option_group(filesystem)

		(opts, args) = parser.parse_args()

		# Batch file verification
		batchurls = []
		if opts.batchfile is not None:
			try:
				if opts.batchfile == '-':
					batchfd = sys.stdin
				else:
					batchfd = open(opts.batchfile, 'r')
				batchurls = batchfd.readlines()
				batchurls = [x.strip() for x in batchurls]
				# Drop blank lines
				batchurls = [x for x in batchurls if len(x) > 0]
			except IOError:
				sys.exit(u'ERROR: batch file could not be read')
		all_urls = batchurls + args

		# Conflicting, missing and erroneous options
		if opts.usenetrc and (opts.username is not None or opts.password is not None):
			parser.error(u'using .netrc conflicts with giving username/password')
		if opts.password is not None and opts.username is None:
			parser.error(u'account username missing')
		if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
			parser.error(u'using output template conflicts with using title or literal title')
		if opts.usetitle and opts.useliteral:
			parser.error(u'using title conflicts with using literal title')
		if opts.username is not None and opts.password is None:
			# Username given without password: prompt interactively (not echoed)
			opts.password = getpass.getpass(u'Type account password and press return:')
		if opts.ratelimit is not None:
			numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
			if numeric_limit is None:
				parser.error(u'invalid rate limit specified')
			opts.ratelimit = numeric_limit
		if opts.retries is not None:
			try:
				opts.retries = long(opts.retries)
			except (TypeError, ValueError), err:
				parser.error(u'invalid retry count specified')

		# Information extractors
		youtube_ie = YoutubeIE()
		metacafe_ie = MetacafeIE(youtube_ie)
		dailymotion_ie = DailymotionIE()
		youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
		youtube_user_ie = YoutubeUserIE(youtube_ie)
		youtube_search_ie = YoutubeSearchIE(youtube_ie)
		google_ie = GoogleIE()
		google_search_ie = GoogleSearchIE(google_ie)
		photobucket_ie = PhotobucketIE()
		yahoo_ie = YahooIE()
		yahoo_search_ie = YahooSearchIE(yahoo_ie)
		generic_ie = GenericIE()

		# File downloader
		fd = FileDownloader({
			'usenetrc': opts.usenetrc,
			'username': opts.username,
			'password': opts.password,
			# Any "print X and exit" option implies quiet + simulate
			'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
			'forceurl': opts.geturl,
			'forcetitle': opts.gettitle,
			'forcethumbnail': opts.getthumbnail,
			'forcedescription': opts.getdescription,
			'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
			'format': opts.format,
			# outtmpl: first truthy alternative wins -- explicit template,
			# then all-formats/title-specific defaults, then plain id.ext
			'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
				or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
				or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
				or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
				or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
				or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
				or u'%(id)s.%(ext)s'),
			'ignoreerrors': opts.ignoreerrors,
			'ratelimit': opts.ratelimit,
			'nooverwrites': opts.nooverwrites,
			'retries': opts.retries,
			'continuedl': opts.continue_dl,
			'noprogress': opts.noprogress,
			})
		fd.add_info_extractor(youtube_search_ie)
		fd.add_info_extractor(youtube_pl_ie)
		fd.add_info_extractor(youtube_user_ie)
		fd.add_info_extractor(metacafe_ie)
		fd.add_info_extractor(dailymotion_ie)
		fd.add_info_extractor(youtube_ie)
		fd.add_info_extractor(google_ie)
		fd.add_info_extractor(google_search_ie)
		fd.add_info_extractor(photobucket_ie)
		fd.add_info_extractor(yahoo_ie)
		fd.add_info_extractor(yahoo_search_ie)

		# This must come last since it's the
		# fallback if none of the others work
		fd.add_info_extractor(generic_ie)

		# Update version
		if opts.update_self:
			# sys.argv[0] is the path of this running script
			update_self(fd, sys.argv[0])

		# Maybe do nothing
		if len(all_urls) < 1:
			if not opts.update_self:
				parser.error(u'you must provide at least one URL')
			else:
				# -U alone is a valid invocation: update and exit
				sys.exit()
		retcode = fd.download(all_urls)
		sys.exit(retcode)

	except DownloadError:
		sys.exit(1)
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')