Use HTTPS for the login URL (fixes issue #163)
[youtube-dl] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # License: Public domain code
7 import htmlentitydefs
8 import httplib
9 import locale
10 import math
11 import netrc
12 import os
13 import os.path
14 import re
15 import socket
16 import string
17 import subprocess
18 import sys
19 import time
20 import urllib
21 import urllib2
22
23 # parse_qs was moved from the cgi module to the urlparse module recently.
24 try:
25         from urlparse import parse_qs
26 except ImportError:
27         from cgi import parse_qs
28
# HTTP headers sent with every request; they mimic a contemporary Firefox
# browser so that websites serve the same pages they would to a regular user.
std_headers = {
        'User-Agent': 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.8) Gecko/20100723 Firefox/3.6.8',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-us,en;q=0.5',
}

# Characters allowed in "simplified" titles, as a unicode string
# (str.decode on a Python 2 byte string yields unicode).
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
37
def preferredencoding():
        """Get preferred encoding.

        Returns the best encoding scheme for the system, based on
        locale.getpreferredencoding() and some further tweaks.
        """
        try:
                pref = locale.getpreferredencoding()
                # Verify the reported encoding actually works before trusting it
                u'TEST'.encode(pref)
        except:
                # Locale missing or unusable: fall back to a safe default
                pref = 'UTF-8'
        return pref
53
54 def htmlentity_transform(matchobj):
55         """Transforms an HTML entity to a Unicode character.
56         
57         This function receives a match object and is intended to be used with
58         the re.sub() function.
59         """
60         entity = matchobj.group(1)
61
62         # Known non-numeric HTML entity
63         if entity in htmlentitydefs.name2codepoint:
64                 return unichr(htmlentitydefs.name2codepoint[entity])
65
66         # Unicode character
67         mobj = re.match(ur'(?u)#(x?\d+)', entity)
68         if mobj is not None:
69                 numstr = mobj.group(1)
70                 if numstr.startswith(u'x'):
71                         base = 16
72                         numstr = u'0%s' % numstr
73                 else:
74                         base = 10
75                 return unichr(long(numstr, base))
76
77         # Unknown entity in name, return its literal representation
78         return (u'&%s;' % entity)
79
80 def sanitize_title(utitle):
81         """Sanitizes a video title so it could be used as part of a filename."""
82         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
83         return utitle.replace(unicode(os.sep), u'%')
84
85 def sanitize_open(filename, open_mode):
86         """Try to open the given filename, and slightly tweak it if this fails.
87
88         Attempts to open the given filename. If this fails, it tries to change
89         the filename slightly, step by step, until it's either able to open it
90         or it fails and raises a final exception, like the standard open()
91         function.
92
93         It returns the tuple (stream, definitive_file_name).
94         """
95         try:
96                 if filename == u'-':
97                         return (sys.stdout, filename)
98                 stream = open(filename, open_mode)
99                 return (stream, filename)
100         except (IOError, OSError), err:
101                 # In case of error, try to remove win32 forbidden chars
102                 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
103
104                 # An exception here should be caught in the caller
105                 stream = open(filename, open_mode)
106                 return (stream, filename)
107
108
class DownloadError(Exception):
        """Download Error exception.

        Thrown by FileDownloader objects that are not configured to continue
        on errors; carries the appropriate error message.
        """
        pass
117
class SameFileError(Exception):
        """Same File exception.

        Thrown by FileDownloader objects when they detect that several files
        would have to be written to the same path on disk.
        """
        pass
125
class PostProcessingError(Exception):
        """Post Processing exception.

        Raised by a PostProcessor's .run() method to signal that the
        postprocessing task failed.
        """
        pass
133
class UnavailableVideoError(Exception):
        """Unavailable Format exception.

        Thrown when a video is requested in a format that the site does not
        offer for that video.
        """
        pass
141
class ContentTooShortError(Exception):
        """Content Too Short exception.

        Raised by FileDownloader objects when the file they download turns
        out smaller than the size the server announced, which usually means
        the connection was interrupted.
        """
        # Byte counts: actually received vs. announced by the server
        downloaded = None
        expected = None

        def __init__(self, downloaded, expected):
                self.expected = expected
                self.downloaded = downloaded
156
class FileDownloader(object):
        """File Downloader class.

        File downloader objects are the ones responsible of downloading the
        actual video file and writing it to disk if the user has requested
        it, among some other tasks. In most cases there should be one per
        program. As, given a video URL, the downloader doesn't know how to
        extract all the needed information, task that InfoExtractors do, it
        has to pass the URL to one of them.

        For this, file downloader objects have a method that allows
        InfoExtractors to be registered in a given order. When it is passed
        a URL, the file downloader handles it to the first InfoExtractor it
        finds that reports being able to handle it. The InfoExtractor extracts
        all the information about the video or videos the URL refers to, and
        asks the FileDownloader to process the video information, possibly
        downloading the video.

        File downloaders accept a lot of parameters. In order not to saturate
        the object constructor with arguments, it receives a dictionary of
        options instead. These options are available through the params
        attribute for the InfoExtractors to use. The FileDownloader also
        registers itself as the downloader in charge for the InfoExtractors
        that are added to it, so this is a "mutual registration".

        Available options:

        username:       Username for authentication purposes.
        password:       Password for authentication purposes.
        usenetrc:       Use netrc for authentication instead.
        quiet:          Do not print messages to stdout.
        forceurl:       Force printing final URL.
        forcetitle:     Force printing title.
        simulate:       Do not download the video files.
        format:         Video format code.
        format_limit:   Highest quality format to try.
        outtmpl:        Template for output names.
        ignoreerrors:   Do not stop on download errors.
        ratelimit:      Download speed limit, in bytes/sec.
        nooverwrites:   Prevent overwriting files.
        retries:        Number of times to retry for HTTP error 5xx
        continuedl:     Try to continue downloads if possible.
        noprogress:     Do not print the progress bar.
        """

        # Option dictionary passed to the constructor (keys listed above)
        params = None
        # Registered InfoExtractor objects, consulted in registration order
        _ies = []
        # Registered PostProcessor objects, run as a chain after download
        _pps = []
        # Value returned by download(): 0 on success, 1 if any error occurred
        _download_retcode = None
        # Ordinal of the current download; feeds the 'ord' output-template field
        _num_downloads = None

        def __init__(self, params):
                """Create a FileDownloader object with the given options."""
                self._ies = []
                self._pps = []
                self._download_retcode = 0
                self._num_downloads = 0
                self.params = params

        @staticmethod
        def pmkdir(filename):
                """Create directory components in filename. Similar to Unix "mkdir -p"."""
                components = filename.split(os.sep)
                # Build the list of ancestor directories, e.g. 'a', 'a/b', 'a/b/c';
                # the final path component (the file itself) is deliberately excluded.
                aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
                aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
                for dir in aggregate:
                        if not os.path.exists(dir):
                                os.mkdir(dir)

        @staticmethod
        def format_bytes(bytes):
                """Return a human-readable string ('12.05M') for a byte count."""
                if bytes is None:
                        return 'N/A'
                if type(bytes) is str:
                        bytes = float(bytes)
                if bytes == 0.0:
                        exponent = 0
                else:
                        # Magnitude as a power of 1024 selects the unit suffix
                        exponent = long(math.log(bytes, 1024.0))
                suffix = 'bkMGTPEZY'[exponent]
                converted = float(bytes) / float(1024**exponent)
                return '%.2f%s' % (converted, suffix)

        @staticmethod
        def calc_percent(byte_counter, data_len):
                """Return a fixed-width percentage string, or '---.-%' if the total is unknown."""
                if data_len is None:
                        return '---.-%'
                return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))

        @staticmethod
        def calc_eta(start, now, total, current):
                """Estimate remaining time as 'MM:SS'; '--:--' when it cannot be computed."""
                if total is None:
                        return '--:--'
                dif = now - start
                if current == 0 or dif < 0.001: # One millisecond
                        return '--:--'
                rate = float(current) / dif
                eta = long((float(total) - float(current)) / rate)
                (eta_mins, eta_secs) = divmod(eta, 60)
                # The two-digit display cannot represent estimates beyond 99 minutes
                if eta_mins > 99:
                        return '--:--'
                return '%02d:%02d' % (eta_mins, eta_secs)

        @staticmethod
        def calc_speed(start, now, bytes):
                """Return the average download speed so far as a padded string."""
                dif = now - start
                if bytes == 0 or dif < 0.001: # One millisecond
                        return '%10s' % '---b/s'
                return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

        @staticmethod
        def best_block_size(elapsed_time, bytes):
                """Pick the next read size, aiming to keep each read around one second."""
                # Never shrink below half nor grow beyond double the last block
                new_min = max(bytes / 2.0, 1.0)
                new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
                if elapsed_time < 0.001:
                        return long(new_max)
                rate = bytes / elapsed_time
                if rate > new_max:
                        return long(new_max)
                if rate < new_min:
                        return long(new_min)
                return long(rate)

        @staticmethod
        def parse_bytes(bytestr):
                """Parse a string indicating a byte quantity into a long integer."""
                matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
                if matchobj is None:
                        return None
                number = float(matchobj.group(1))
                # The suffix's position in the unit list gives the power of 1024
                multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
                return long(round(number * multiplier))

        def add_info_extractor(self, ie):
                """Add an InfoExtractor object to the end of the list."""
                self._ies.append(ie)
                # Mutual registration: the IE learns which downloader owns it
                ie.set_downloader(self)

        def add_post_processor(self, pp):
                """Add a PostProcessor object to the end of the chain."""
                self._pps.append(pp)
                pp.set_downloader(self)

        def to_stdout(self, message, skip_eol=False, ignore_encoding_errors=False):
                """Print message to stdout if not in quiet mode."""
                try:
                        if not self.params.get('quiet', False):
                                # skip_eol selects u'' over u'\n'; the trailing comma on the
                                # py2 print statement suppresses its own newline as well.
                                print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(preferredencoding()),
                        sys.stdout.flush()
                except (UnicodeEncodeError), err:
                        if not ignore_encoding_errors:
                                raise

        def to_stderr(self, message):
                """Print message to stderr."""
                print >>sys.stderr, message.encode(preferredencoding())

        def fixed_template(self):
                """Checks if the output template is fixed (contains no %(...)s fields)."""
                return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)

        def trouble(self, message=None):
                """Determine action to take when a download problem appears.

                Depending on if the downloader has been configured to ignore
                download errors or not, this method may throw an exception or
                not when errors are found, after printing the message.
                """
                if message is not None:
                        self.to_stderr(message)
                if not self.params.get('ignoreerrors', False):
                        raise DownloadError(message)
                # When errors are ignored, remember them in the final return code
                self._download_retcode = 1

        def slow_down(self, start_time, byte_counter):
                """Sleep if the download speed is over the rate limit."""
                rate_limit = self.params.get('ratelimit', None)
                if rate_limit is None or byte_counter == 0:
                        return
                now = time.time()
                elapsed = now - start_time
                if elapsed <= 0.0:
                        return
                speed = float(byte_counter) / elapsed
                if speed > rate_limit:
                        # Sleep just long enough for the average speed to drop to the limit
                        time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)

        def report_destination(self, filename):
                """Report destination filename."""
                self.to_stdout(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)

        def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
                """Report download progress (the leading \\r redraws the same line)."""
                if self.params.get('noprogress', False):
                        return
                self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
                                (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)

        def report_resuming_byte(self, resume_len):
                """Report attempt to resume at given byte."""
                self.to_stdout(u'[download] Resuming download at byte %s' % resume_len)

        def report_retry(self, count, retries):
                """Report retry in case of HTTP error 5xx"""
                self.to_stdout(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))

        def report_file_already_downloaded(self, file_name):
                """Report file has already been fully downloaded."""
                try:
                        self.to_stdout(u'[download] %s has already been downloaded' % file_name)
                except (UnicodeEncodeError), err:
                        # Fall back to a message without the (unencodable) file name
                        self.to_stdout(u'[download] The file has already been downloaded')

        def report_unable_to_resume(self):
                """Report it was impossible to resume download."""
                self.to_stdout(u'[download] Unable to resume')

        def report_finish(self):
                """Report download finished."""
                if self.params.get('noprogress', False):
                        self.to_stdout(u'[download] Download completed')
                else:
                        # End the \r progress line with a real newline
                        self.to_stdout(u'')

        def increment_downloads(self):
                """Increment the ordinal that assigns a number to each file."""
                self._num_downloads += 1

        def process_info(self, info_dict):
                """Process a single dictionary returned by an InfoExtractor."""
                # Do nothing else if in simulate mode
                if self.params.get('simulate', False):
                        # Forced printings
                        if self.params.get('forcetitle', False):
                                print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
                        if self.params.get('forceurl', False):
                                print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
                        if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
                                print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
                        if self.params.get('forcedescription', False) and 'description' in info_dict:
                                print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')

                        return

                # Expand the output template with the video info plus the
                # special fields 'epoch' (current time) and 'ord' (download number)
                try:
                        template_dict = dict(info_dict)
                        template_dict['epoch'] = unicode(long(time.time()))
                        template_dict['ord'] = unicode('%05d' % self._num_downloads)
                        filename = self.params['outtmpl'] % template_dict
                except (ValueError, KeyError), err:
                        self.trouble(u'ERROR: invalid system charset or erroneous output template')
                        return
                if self.params.get('nooverwrites', False) and os.path.exists(filename):
                        self.to_stderr(u'WARNING: file exists and will be skipped')
                        return

                try:
                        self.pmkdir(filename)
                except (OSError, IOError), err:
                        self.trouble(u'ERROR: unable to create directories: %s' % str(err))
                        return

                try:
                        success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
                except (OSError, IOError), err:
                        raise UnavailableVideoError
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self.trouble(u'ERROR: unable to download video data: %s' % str(err))
                        return
                except (ContentTooShortError, ), err:
                        self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
                        return

                if success:
                        try:
                                self.post_process(filename, info_dict)
                        except (PostProcessingError), err:
                                self.trouble(u'ERROR: postprocessing: %s' % str(err))
                                return

        def download(self, url_list):
                """Download a given list of URLs."""
                # A fixed template would make every URL overwrite the same file
                if len(url_list) > 1 and self.fixed_template():
                        raise SameFileError(self.params['outtmpl'])

                for url in url_list:
                        suitable_found = False
                        for ie in self._ies:
                                # Go to next InfoExtractor if not suitable
                                if not ie.suitable(url):
                                        continue

                                # Suitable InfoExtractor found
                                suitable_found = True

                                # Extract information from URL and process it
                                ie.extract(url)

                                # Suitable InfoExtractor had been found; go to next URL
                                break

                        if not suitable_found:
                                self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)

                return self._download_retcode

        def post_process(self, filename, ie_info):
                """Run the postprocessing chain on the given file."""
                info = dict(ie_info)
                info['filepath'] = filename
                for pp in self._pps:
                        info = pp.run(info)
                        # A postprocessor returning None stops the chain
                        if info is None:
                                break

        def _download_with_rtmpdump(self, filename, url, player_url):
                """Download an rtmp:// URL by driving the external rtmpdump tool."""
                self.report_destination(filename)

                # Check for rtmpdump first
                try:
                        subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
                except (OSError, IOError):
                        self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
                        return False

                # Download using rtmpdump. rtmpdump returns exit code 2 when
                # the connection was interrumpted and resuming appears to be
                # possible. This is part of rtmpdump's normal usage, AFAIK.
                basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', filename]
                retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
                while retval == 2 or retval == 1:
                        prevsize = os.path.getsize(filename)
                        self.to_stdout(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
                        time.sleep(5.0) # This seems to be needed
                        retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
                        cursize = os.path.getsize(filename)
                        # No progress since the last attempt: give up on resuming
                        if prevsize == cursize and retval == 1:
                                break
                if retval == 0:
                        self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename))
                        return True
                else:
                        self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
                        return False

        def _do_download(self, filename, url, player_url):
                """Download url to filename over HTTP; returns True on success."""
                # Attempt to download using rtmpdump
                if url.startswith('rtmp'):
                        return self._download_with_rtmpdump(filename, url, player_url)

                stream = None
                open_mode = 'wb'
                # basic_request has no Range header; used to probe the full size
                # when a resume attempt is rejected with HTTP 416
                basic_request = urllib2.Request(url, None, std_headers)
                request = urllib2.Request(url, None, std_headers)

                # Establish possible resume length
                if os.path.isfile(filename):
                        resume_len = os.path.getsize(filename)
                else:
                        resume_len = 0

                # Request parameters in case of being able to resume
                if self.params.get('continuedl', False) and resume_len != 0:
                        self.report_resuming_byte(resume_len)
                        request.add_header('Range','bytes=%d-' % resume_len)
                        open_mode = 'ab'

                count = 0
                retries = self.params.get('retries', 0)
                while count <= retries:
                        # Establish connection
                        try:
                                data = urllib2.urlopen(request)
                                break
                        except (urllib2.HTTPError, ), err:
                                if (err.code < 500 or err.code >= 600) and err.code != 416:
                                        # Unexpected HTTP error
                                        raise
                                elif err.code == 416:
                                        # Unable to resume (requested range not satisfiable)
                                        try:
                                                # Open the connection again without the range header
                                                data = urllib2.urlopen(basic_request)
                                                content_length = data.info()['Content-Length']
                                        except (urllib2.HTTPError, ), err:
                                                if err.code < 500 or err.code >= 600:
                                                        raise
                                        else:
                                                # Examine the reported length
                                                if (content_length is not None and
                                                    (resume_len - 100 < long(content_length) < resume_len + 100)):
                                                        # The file had already been fully downloaded.
                                                        # Explanation to the above condition: in issue #175 it was revealed that
                                                        # YouTube sometimes adds or removes a few bytes from the end of the file,
                                                        # changing the file size slightly and causing problems for some users. So
                                                        # I decided to implement a suggested change and consider the file
                                                        # completely downloaded if the file size differs less than 100 bytes from
                                                        # the one in the hard drive.
                                                        self.report_file_already_downloaded(filename)
                                                        return True
                                                else:
                                                        # The length does not match, we start the download over
                                                        self.report_unable_to_resume()
                                                        open_mode = 'wb'
                                                        break
                        # Retry
                        count += 1
                        if count <= retries:
                                self.report_retry(count, retries)

                if count > retries:
                        self.trouble(u'ERROR: giving up after %s retries' % retries)
                        return False

                data_len = data.info().get('Content-length', None)
                data_len_str = self.format_bytes(data_len)
                byte_counter = 0
                block_size = 1024
                start = time.time()
                while True:
                        # Download and write
                        before = time.time()
                        data_block = data.read(block_size)
                        after = time.time()
                        data_block_len = len(data_block)
                        if data_block_len == 0:
                                break
                        byte_counter += data_block_len

                        # Open file just in time (so a failed connection leaves no file)
                        if stream is None:
                                try:
                                        (stream, filename) = sanitize_open(filename, open_mode)
                                        self.report_destination(filename)
                                except (OSError, IOError), err:
                                        self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
                                        return False
                        try:
                                stream.write(data_block)
                        except (IOError, OSError), err:
                                self.trouble(u'\nERROR: unable to write data: %s' % str(err))
                                return False
                        # Adapt the block size to the observed throughput
                        block_size = self.best_block_size(after - before, data_block_len)

                        # Progress message
                        percent_str = self.calc_percent(byte_counter, data_len)
                        eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
                        speed_str = self.calc_speed(start, time.time(), byte_counter)
                        self.report_progress(percent_str, data_len_str, speed_str, eta_str)

                        # Apply rate limit
                        self.slow_down(start, byte_counter)

                self.report_finish()
                # data_len is the raw header string, hence the str() comparison
                if data_len is not None and str(byte_counter) != data_len:
                        raise ContentTooShortError(byte_counter, long(data_len))
                return True
614
class InfoExtractor(object):
	"""Base class for all information extractors.

	An information extractor ("IE") takes a URL and gathers everything the
	FileDownloader needs to handle the video (or videos) behind it: the
	real media URL, the literal and simplified titles, the uploader and so
	on. The collected data is handed to the downloader as a dictionary,
	which the downloader then processes (possibly saving the video to the
	file system, among other outcomes). Every dictionary must provide:

	id:             Video identifier.
	url:            Final video URL.
	uploader:       Nickname of the video uploader.
	title:          Literal title.
	stitle:         Simplified title.
	ext:            Video filename extension.
	format:         Video format.
	player_url:     SWF Player URL (may be None).

	Two further keys are optional; they mainly let youtube-dl act as the
	backend of a video search front end (such as youtube2mp3) and are read
	only by the forced printing functions:

	thumbnail:      Full URL to a video thumbnail image.
	description:    One-line video description.

	Concrete extractors should override _real_initialize() and
	_real_extract(), as well as the suitable() static method, and are
	normally instantiated and registered with the main downloader.
	"""

	# True once _real_initialize() has run for this instance.
	_ready = False
	# The FileDownloader this IE reports to (may be None).
	_downloader = None

	def __init__(self, downloader=None):
		"""Create the extractor, optionally attaching a downloader."""
		self._ready = False
		self.set_downloader(downloader)

	@staticmethod
	def suitable(url):
		"""Return True when this IE can handle the given URL."""
		return False

	def set_downloader(self, downloader):
		"""Attach the downloader this IE should report to."""
		self._downloader = downloader

	def initialize(self):
		"""Perform one-time setup (authentication, etc) exactly once."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Initialize if needed, then extract and return URL information."""
		self.initialize()
		return self._real_extract(url)

	def _real_initialize(self):
		"""Actual initialization process; redefined in subclasses."""
		pass

	def _real_extract(self, url):
		"""Actual extraction process; redefined in subclasses."""
		pass
685
class YoutubeIE(InfoExtractor):
	"""Information extractor for youtube.com."""

	# Matches bare video IDs, youtu.be short links and the various
	# watch/embed URL shapes; group 2 captures the video identifier.
	_VALID_URL = r'^((?:http://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/(?:(?:v/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))))?([0-9A-Za-z_-]+)(?(1).+)?$'
	_LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
	# Login goes over HTTPS so credentials are not sent in the clear.
	_LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
	_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
	# Machine name looked up in the user's .netrc for stored credentials.
	_NETRC_MACHINE = 'youtube'
	# Listed in order of quality
	_available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
	# Maps format codes to file extensions; codes not listed here fall
	# back to 'flv' (see _real_extract).
	_video_extensions = {
		'13': '3gp',
		'17': 'mp4',
		'18': 'mp4',
		'22': 'mp4',
		'37': 'mp4',
		'38': 'video', # You actually don't know if this will be MOV, AVI or whatever
		'43': 'webm',
		'45': 'webm',
	}

	@staticmethod
	def suitable(url):
		"""Return True if this IE can handle the given URL."""
		return (re.match(YoutubeIE._VALID_URL, url) is not None)

	def report_lang(self):
		"""Report attempt to set language."""
		self._downloader.to_stdout(u'[youtube] Setting language')

	def report_login(self):
		"""Report attempt to log in."""
		self._downloader.to_stdout(u'[youtube] Logging in')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_stdout(u'[youtube] Confirming age')

	def report_video_webpage_download(self, video_id):
		"""Report attempt to download video webpage."""
		self._downloader.to_stdout(u'[youtube] %s: Downloading video webpage' % video_id)

	def report_video_info_webpage_download(self, video_id):
		"""Report attempt to download video info webpage."""
		self._downloader.to_stdout(u'[youtube] %s: Downloading video info webpage' % video_id)

	def report_information_extraction(self, video_id):
		"""Report attempt to extract video information."""
		self._downloader.to_stdout(u'[youtube] %s: Extracting video information' % video_id)

	def report_unavailable_format(self, video_id, format):
		"""Report extracted video URL."""
		self._downloader.to_stdout(u'[youtube] %s: Format %s not available' % (video_id, format))

	def report_rtmp_download(self):
		"""Indicate the download will use the RTMP protocol."""
		self._downloader.to_stdout(u'[youtube] RTMP download detected')

	def _real_initialize(self):
		"""Set the site language, log in (if credentials exist) and confirm age.

		All failures here are reported through the downloader; login and
		language problems are warnings, age confirmation failure is an error.
		"""
		if self._downloader is None:
			return

		username = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					username = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				# A missing or malformed .netrc is not fatal; warn and
				# continue without authentication.
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		# Set language
		request = urllib2.Request(self._LANG_URL, None, std_headers)
		try:
			self.report_lang()
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
			return

		# No authentication to be performed
		if username is None:
			return

		# Log in
		login_form = {
				'current_form': 'loginForm',
				'next':		'/',
				'action_login':	'Log In',
				'username':	username,
				'password':	password,
				}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# If the login form is still present in the response, the
			# credentials were rejected.
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return

		# Confirm age
		age_form = {
				'next_url':		'/',
				'action_confirm':	'Confirm',
				}
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
		try:
			self.report_age_confirmation()
			age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return

	def _real_extract(self, url):
		"""Extract video information from a YouTube URL and hand each
		selected format to the downloader via process_info()."""
		# Extract video id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = mobj.group(2)

		# Get video webpage
		self.report_video_webpage_download(video_id)
		request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en' % video_id, None, std_headers)
		try:
			video_webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
			return

		# Attempt to extract SWF player URL
		mobj = re.search(r'swfConfig.*"(http://.*?watch.*?-.*?\.swf)"', video_webpage)
		if mobj is not None:
			player_url = mobj.group(1)
		else:
			player_url = None

		# Get video info: try several 'el' parameter values and stop at
		# the first response that contains a 'token' field.
		self.report_video_info_webpage_download(video_id)
		for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
			video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
					   % (video_id, el_type))
			request = urllib2.Request(video_info_url, None, std_headers)
			try:
				video_info_webpage = urllib2.urlopen(request).read()
				video_info = parse_qs(video_info_webpage)
				if 'token' in video_info:
					break
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
				return
		if 'token' not in video_info:
			if 'reason' in video_info:
				self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
			else:
				self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
			return

		# Start extracting information
		self.report_information_extraction(video_id)

		# uploader
		if 'author' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = urllib.unquote_plus(video_info['author'][0])

		# title
		if 'title' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract video title')
			return
		video_title = urllib.unquote_plus(video_info['title'][0])
		video_title = video_title.decode('utf-8')
		video_title = sanitize_title(video_title)

		# simplified title: collapse runs of non-alphanumerics to '_'
		simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
		simple_title = simple_title.strip(ur'_')

		# thumbnail image
		if 'thumbnail_url' not in video_info:
			self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
			video_thumbnail = ''
		else:	# don't panic if we can't find it
			video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

		# description (only extracted when the user forces printing it)
		video_description = 'No description available.'
		if self._downloader.params.get('forcedescription', False):
			mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
			if mobj is not None:
				video_description = mobj.group(1)

		# token
		video_token = urllib.unquote_plus(video_info['token'][0])

		# Decide which formats to download
		requested_format = self._downloader.params.get('format', None)
		get_video_template = 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=&ps=&asv=&fmt=%%s' % (video_id, video_token)

		if 'fmt_url_map' in video_info:
			# fmt_url_map is a comma-separated list of 'format|url' pairs.
			url_map = dict(tuple(pair.split('|')) for pair in video_info['fmt_url_map'][0].split(','))
			format_limit = self._downloader.params.get('format_limit', None)
			if format_limit is not None and format_limit in self._available_formats:
				# Only accept formats at or below the requested quality.
				format_list = self._available_formats[self._available_formats.index(format_limit):]
			else:
				format_list = self._available_formats
			existing_formats = [x for x in format_list if x in url_map]
			if len(existing_formats) == 0:
				self._downloader.trouble(u'ERROR: no known formats available for video')
				return
			if requested_format is None:
				video_url_list = [(existing_formats[0], get_video_template % existing_formats[0])] # Best quality
			elif requested_format == '-1':
				video_url_list = [(f, get_video_template % f) for f in existing_formats] # All formats
			else:
				video_url_list = [(requested_format, get_video_template % requested_format)] # Specific format

		elif 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
			self.report_rtmp_download()
			video_url_list = [(None, video_info['conn'][0])]

		else:
			self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')
			return

		for format_param, video_real_url in video_url_list:
			# At this point we have a new video
			self._downloader.increment_downloads()

			# Extension
			video_extension = self._video_extensions.get(format_param, 'flv')

			# Find the video URL in fmt_url_map or conn paramters
			try:
				# Process video information
				self._downloader.process_info({
					'id':		video_id.decode('utf-8'),
					'url':		video_real_url.decode('utf-8'),
					'uploader':	video_uploader.decode('utf-8'),
					'title':	video_title,
					'stitle':	simple_title,
					'ext':		video_extension.decode('utf-8'),
					'format':	(format_param is None and u'NA' or format_param.decode('utf-8')),
					'thumbnail':	video_thumbnail.decode('utf-8'),
					'description':	video_description.decode('utf-8'),
					'player_url':	player_url,
				})
			except UnavailableVideoError, err:
				self._downloader.trouble(u'ERROR: unable to download video (format may not be available)')
949
950
class MetacafeIE(InfoExtractor):
	"""Information Extractor for metacafe.com."""

	# Group 1 is the video id, group 2 the URL-embedded simplified title.
	_VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
	_DISCLAIMER = 'http://www.metacafe.com/family_filter/'
	_FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
	# YoutubeIE instance used to delegate Metacafe's mirrored 'yt-' videos.
	_youtube_ie = None

	def __init__(self, youtube_ie, downloader=None):
		"""Constructor. Keeps a YoutubeIE to handle 'yt-' video ids."""
		InfoExtractor.__init__(self, downloader)
		self._youtube_ie = youtube_ie

	@staticmethod
	def suitable(url):
		"""Return True if this IE can handle the given URL."""
		return (re.match(MetacafeIE._VALID_URL, url) is not None)

	def report_disclaimer(self):
		"""Report disclaimer retrieval."""
		self._downloader.to_stdout(u'[metacafe] Retrieving disclaimer')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_stdout(u'[metacafe] Confirming age')

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_stdout(u'[metacafe] %s: Extracting information' % video_id)

	def _real_initialize(self):
		"""Fetch the family-filter disclaimer page and post the age
		confirmation so later video pages are served unfiltered."""
		# Retrieve disclaimer
		request = urllib2.Request(self._DISCLAIMER, None, std_headers)
		try:
			self.report_disclaimer()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
			return

		# Confirm age
		disclaimer_form = {
			'filters': '0',
			'submit': "Continue - I'm over 18",
			}
		request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
		try:
			self.report_age_confirmation()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return

	def _real_extract(self, url):
		"""Extract video information from a Metacafe URL and hand it to
		the downloader via process_info()."""
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		video_id = mobj.group(1)

		# Check if video comes from YouTube ('yt-<id>' ids are mirrors);
		# if so, delegate the whole extraction to the YouTube IE.
		mobj2 = re.match(r'^yt-(.*)$', video_id)
		if mobj2 is not None:
			self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
			return

		# At this point we have a new video
		self._downloader.increment_downloads()

		simple_title = mobj.group(2).decode('utf-8')

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
		if mobj is not None:
			mediaURL = urllib.unquote(mobj.group(1))
			# NOTE: assumes a three-letter extension at the end of mediaURL
			video_extension = mediaURL[-3:]

			# Extract gdaKey if available
			mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
			if mobj is None:
				video_url = mediaURL
			else:
				gdaKey = mobj.group(1)
				video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
		else:
			# Fallback: pull mediaURL and key out of the flashvars blob
			mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			vardict = parse_qs(mobj.group(1))
			if 'mediaData' not in vardict:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			# The URL is JSON-escaped; undo the escaped slashes
			mediaURL = mobj.group(1).replace('\\/', '/')
			video_extension = mediaURL[-3:]
			video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

		mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)

		mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = mobj.group(1)

		try:
			# Process video information
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	video_uploader.decode('utf-8'),
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	u'NA',
				'player_url':	None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'ERROR: unable to download video')
1093
1094
class DailymotionIE(InfoExtractor):
	"""Information Extractor for Dailymotion"""

	# Group 1 is the video id, group 2 the URL-embedded simplified title.
	_VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		InfoExtractor.__init__(self, downloader)

	@staticmethod
	def suitable(url):
		"""Return True if this IE can handle the given URL."""
		return (re.match(DailymotionIE._VALID_URL, url) is not None)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_stdout(u'[dailymotion] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_stdout(u'[dailymotion] %s: Extracting information' % video_id)

	def _real_initialize(self):
		# Dailymotion needs no authentication or session setup.
		return

	def _real_extract(self, url):
		"""Extract video information from a Dailymotion URL and hand it
		to the downloader via process_info()."""
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(1)

		simple_title = mobj.group(2).decode('utf-8')
		video_extension = 'flv'

		# Retrieve video webpage to extract further information
		request = urllib2.Request(url)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		mediaURL = urllib.unquote(mobj.group(1))

		# if needed add http://www.dailymotion.com/ if relative URL

		video_url = mediaURL

		# '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
		mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)

		mobj = re.search(r'(?im)<div class="dmco_html owner">.*?<a class="name" href="/.+?">(.+?)</a>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = mobj.group(1)

		try:
			# Process video information
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	video_uploader.decode('utf-8'),
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	u'NA',
				'player_url':	None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'ERROR: unable to download video')
1181
1182 class GoogleIE(InfoExtractor):
1183         """Information extractor for video.google.com."""
1184
1185         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1186
1187         def __init__(self, downloader=None):
1188                 InfoExtractor.__init__(self, downloader)
1189
1190         @staticmethod
1191         def suitable(url):
1192                 return (re.match(GoogleIE._VALID_URL, url) is not None)
1193
1194         def report_download_webpage(self, video_id):
1195                 """Report webpage download."""
1196                 self._downloader.to_stdout(u'[video.google] %s: Downloading webpage' % video_id)
1197
1198         def report_extraction(self, video_id):
1199                 """Report information extraction."""
1200                 self._downloader.to_stdout(u'[video.google] %s: Extracting information' % video_id)
1201
1202         def _real_initialize(self):
1203                 return
1204
	def _real_extract(self, url):
		"""Extract metadata for a Google Video URL and hand it to the downloader.

		On any failure, reports through self._downloader.trouble() and
		returns None instead of raising.
		"""
		# Extract id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
			return

		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(1)

		# Default to MP4; downgraded to FLV below if no direct MP4 link exists.
		video_extension = 'mp4'

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader, and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r"download_url:'([^']+)'", webpage)
		if mobj is None:
			# No MP4 download link; fall back to the embedded Flash
			# stream URL, which appears with JavaScript \xNN escapes.
			video_extension = 'flv'
			mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		mediaURL = urllib.unquote(mobj.group(1))
		# Undo the JavaScript hex escaping: '\x3d' -> '=', '\x26' -> '&'.
		mediaURL = mediaURL.replace('\\x3d', '\x3d')
		mediaURL = mediaURL.replace('\\x26', '\x26')

		video_url = mediaURL

		mobj = re.search(r'<title>(.*)</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)
		# Collapse every run of non-alphanumeric characters into '_'.
		simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

		# Extract video description
		mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract video description')
			return
		video_description = mobj.group(1).decode('utf-8')
		if not video_description:
			video_description = 'No description available.'

		# Extract video thumbnail
		if self._downloader.params.get('forcethumbnail', False):
			# The thumbnail is only available from a search-results
			# page, so an extra request is made only on demand.
			request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
			try:
				webpage = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
				return
			mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
				return
			video_thumbnail = mobj.group(1)
		else:	# we need something to pass to process_info
			video_thumbnail = ''


		try:
			# Process video information
			# NOTE(review): video_thumbnail and video_description are
			# extracted above but not included in this dict — looks
			# like an oversight; confirm before relying on them.
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	u'NA',
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	u'NA',
				'player_url':	None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'ERROR: unable to download video')
1290
1291
1292 class PhotobucketIE(InfoExtractor):
1293         """Information extractor for photobucket.com."""
1294
1295         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1296
1297         def __init__(self, downloader=None):
1298                 InfoExtractor.__init__(self, downloader)
1299
1300         @staticmethod
1301         def suitable(url):
1302                 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1303
1304         def report_download_webpage(self, video_id):
1305                 """Report webpage download."""
1306                 self._downloader.to_stdout(u'[photobucket] %s: Downloading webpage' % video_id)
1307
1308         def report_extraction(self, video_id):
1309                 """Report information extraction."""
1310                 self._downloader.to_stdout(u'[photobucket] %s: Extracting information' % video_id)
1311
1312         def _real_initialize(self):
1313                 return
1314
1315         def _real_extract(self, url):
1316                 # Extract id from URL
1317                 mobj = re.match(self._VALID_URL, url)
1318                 if mobj is None:
1319                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1320                         return
1321
1322                 # At this point we have a new video
1323                 self._downloader.increment_downloads()
1324                 video_id = mobj.group(1)
1325
1326                 video_extension = 'flv'
1327
1328                 # Retrieve video webpage to extract further information
1329                 request = urllib2.Request(url)
1330                 try:
1331                         self.report_download_webpage(video_id)
1332                         webpage = urllib2.urlopen(request).read()
1333                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1334                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1335                         return
1336
1337                 # Extract URL, uploader, and title from webpage
1338                 self.report_extraction(video_id)
1339                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1340                 if mobj is None:
1341                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1342                         return
1343                 mediaURL = urllib.unquote(mobj.group(1))
1344
1345                 video_url = mediaURL
1346
1347                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1348                 if mobj is None:
1349                         self._downloader.trouble(u'ERROR: unable to extract title')
1350                         return
1351                 video_title = mobj.group(1).decode('utf-8')
1352                 video_title = sanitize_title(video_title)
1353                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1354
1355                 video_uploader = mobj.group(2).decode('utf-8')
1356
1357                 try:
1358                         # Process video information
1359                         self._downloader.process_info({
1360                                 'id':           video_id.decode('utf-8'),
1361                                 'url':          video_url.decode('utf-8'),
1362                                 'uploader':     video_uploader,
1363                                 'title':        video_title,
1364                                 'stitle':       simple_title,
1365                                 'ext':          video_extension.decode('utf-8'),
1366                                 'format':       u'NA',
1367                                 'player_url':   None,
1368                         })
1369                 except UnavailableVideoError:
1370                         self._downloader.trouble(u'ERROR: unable to download video')
1371
1372
1373 class YahooIE(InfoExtractor):
1374         """Information extractor for video.yahoo.com."""
1375
1376         # _VALID_URL matches all Yahoo! Video URLs
1377         # _VPAGE_URL matches only the extractable '/watch/' URLs
1378         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1379         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1380
1381         def __init__(self, downloader=None):
1382                 InfoExtractor.__init__(self, downloader)
1383
1384         @staticmethod
1385         def suitable(url):
1386                 return (re.match(YahooIE._VALID_URL, url) is not None)
1387
1388         def report_download_webpage(self, video_id):
1389                 """Report webpage download."""
1390                 self._downloader.to_stdout(u'[video.yahoo] %s: Downloading webpage' % video_id)
1391
1392         def report_extraction(self, video_id):
1393                 """Report information extraction."""
1394                 self._downloader.to_stdout(u'[video.yahoo] %s: Extracting information' % video_id)
1395
1396         def _real_initialize(self):
1397                 return
1398
1399         def _real_extract(self, url, new_video=True):
1400                 # Extract ID from URL
1401                 mobj = re.match(self._VALID_URL, url)
1402                 if mobj is None:
1403                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1404                         return
1405
1406                 # At this point we have a new video
1407                 self._downloader.increment_downloads()
1408                 video_id = mobj.group(2)
1409                 video_extension = 'flv'
1410
1411                 # Rewrite valid but non-extractable URLs as
1412                 # extractable English language /watch/ URLs
1413                 if re.match(self._VPAGE_URL, url) is None:
1414                         request = urllib2.Request(url)
1415                         try:
1416                                 webpage = urllib2.urlopen(request).read()
1417                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1418                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1419                                 return
1420
1421                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1422                         if mobj is None:
1423                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1424                                 return
1425                         yahoo_id = mobj.group(1)
1426
1427                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1428                         if mobj is None:
1429                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1430                                 return
1431                         yahoo_vid = mobj.group(1)
1432
1433                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1434                         return self._real_extract(url, new_video=False)
1435
1436                 # Retrieve video webpage to extract further information
1437                 request = urllib2.Request(url)
1438                 try:
1439                         self.report_download_webpage(video_id)
1440                         webpage = urllib2.urlopen(request).read()
1441                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1442                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1443                         return
1444
1445                 # Extract uploader and title from webpage
1446                 self.report_extraction(video_id)
1447                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1448                 if mobj is None:
1449                         self._downloader.trouble(u'ERROR: unable to extract video title')
1450                         return
1451                 video_title = mobj.group(1).decode('utf-8')
1452                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1453
1454                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1455                 if mobj is None:
1456                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1457                         return
1458                 video_uploader = mobj.group(1).decode('utf-8')
1459
1460                 # Extract video thumbnail
1461                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1462                 if mobj is None:
1463                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1464                         return
1465                 video_thumbnail = mobj.group(1).decode('utf-8')
1466
1467                 # Extract video description
1468                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1469                 if mobj is None:
1470                         self._downloader.trouble(u'ERROR: unable to extract video description')
1471                         return
1472                 video_description = mobj.group(1).decode('utf-8')
1473                 if not video_description: video_description = 'No description available.'
1474
1475                 # Extract video height and width
1476                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1477                 if mobj is None:
1478                         self._downloader.trouble(u'ERROR: unable to extract video height')
1479                         return
1480                 yv_video_height = mobj.group(1)
1481
1482                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1483                 if mobj is None:
1484                         self._downloader.trouble(u'ERROR: unable to extract video width')
1485                         return
1486                 yv_video_width = mobj.group(1)
1487
1488                 # Retrieve video playlist to extract media URL
1489                 # I'm not completely sure what all these options are, but we
1490                 # seem to need most of them, otherwise the server sends a 401.
1491                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1492                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1493                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1494                                           '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1495                                           '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1496                 try:
1497                         self.report_download_webpage(video_id)
1498                         webpage = urllib2.urlopen(request).read()
1499                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1500                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1501                         return
1502
1503                 # Extract media URL from playlist XML
1504                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1505                 if mobj is None:
1506                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1507                         return
1508                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1509                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1510
1511                 try:
1512                         # Process video information
1513                         self._downloader.process_info({
1514                                 'id':           video_id.decode('utf-8'),
1515                                 'url':          video_url,
1516                                 'uploader':     video_uploader,
1517                                 'title':        video_title,
1518                                 'stitle':       simple_title,
1519                                 'ext':          video_extension.decode('utf-8'),
1520                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1521                                 'description':  video_description,
1522                                 'thumbnail':    video_thumbnail,
1523                                 'description':  video_description,
1524                                 'player_url':   None,
1525                         })
1526                 except UnavailableVideoError:
1527                         self._downloader.trouble(u'ERROR: unable to download video')
1528
1529
1530 class GenericIE(InfoExtractor):
1531         """Generic last-resort information extractor."""
1532
1533         def __init__(self, downloader=None):
1534                 InfoExtractor.__init__(self, downloader)
1535
1536         @staticmethod
1537         def suitable(url):
1538                 return True
1539
1540         def report_download_webpage(self, video_id):
1541                 """Report webpage download."""
1542                 self._downloader.to_stdout(u'WARNING: Falling back on generic information extractor.')
1543                 self._downloader.to_stdout(u'[generic] %s: Downloading webpage' % video_id)
1544
1545         def report_extraction(self, video_id):
1546                 """Report information extraction."""
1547                 self._downloader.to_stdout(u'[generic] %s: Extracting information' % video_id)
1548
1549         def _real_initialize(self):
1550                 return
1551
1552         def _real_extract(self, url):
1553                 # At this point we have a new video
1554                 self._downloader.increment_downloads()
1555
1556                 video_id = url.split('/')[-1]
1557                 request = urllib2.Request(url)
1558                 try:
1559                         self.report_download_webpage(video_id)
1560                         webpage = urllib2.urlopen(request).read()
1561                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1562                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1563                         return
1564                 except ValueError, err:
1565                         # since this is the last-resort InfoExtractor, if
1566                         # this error is thrown, it'll be thrown here
1567                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1568                         return
1569
1570                 # Start with something easy: JW Player in SWFObject
1571                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1572                 if mobj is None:
1573                         # Broaden the search a little bit
1574                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1575                 if mobj is None:
1576                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1577                         return
1578
1579                 # It's possible that one of the regexes
1580                 # matched, but returned an empty group:
1581                 if mobj.group(1) is None:
1582                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1583                         return
1584
1585                 video_url = urllib.unquote(mobj.group(1))
1586                 video_id  = os.path.basename(video_url)
1587
1588                 # here's a fun little line of code for you:
1589                 video_extension = os.path.splitext(video_id)[1][1:]
1590                 video_id        = os.path.splitext(video_id)[0]
1591
1592                 # it's tempting to parse this further, but you would
1593                 # have to take into account all the variations like
1594                 #   Video Title - Site Name
1595                 #   Site Name | Video Title
1596                 #   Video Title - Tagline | Site Name
1597                 # and so on and so forth; it's just not practical
1598                 mobj = re.search(r'<title>(.*)</title>', webpage)
1599                 if mobj is None:
1600                         self._downloader.trouble(u'ERROR: unable to extract title')
1601                         return
1602                 video_title = mobj.group(1).decode('utf-8')
1603                 video_title = sanitize_title(video_title)
1604                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1605
1606                 # video uploader is domain name
1607                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1608                 if mobj is None:
1609                         self._downloader.trouble(u'ERROR: unable to extract title')
1610                         return
1611                 video_uploader = mobj.group(1).decode('utf-8')
1612
1613                 try:
1614                         # Process video information
1615                         self._downloader.process_info({
1616                                 'id':           video_id.decode('utf-8'),
1617                                 'url':          video_url.decode('utf-8'),
1618                                 'uploader':     video_uploader,
1619                                 'title':        video_title,
1620                                 'stitle':       simple_title,
1621                                 'ext':          video_extension.decode('utf-8'),
1622                                 'format':       u'NA',
1623                                 'player_url':   None,
1624                         })
1625                 except UnavailableVideoError, err:
1626                         self._downloader.trouble(u'ERROR: unable to download video')
1627
1628
1629 class YoutubeSearchIE(InfoExtractor):
1630         """Information Extractor for YouTube search queries."""
1631         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1632         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1633         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1634         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1635         _youtube_ie = None
1636         _max_youtube_results = 1000
1637
1638         def __init__(self, youtube_ie, downloader=None):
1639                 InfoExtractor.__init__(self, downloader)
1640                 self._youtube_ie = youtube_ie
1641         
1642         @staticmethod
1643         def suitable(url):
1644                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1645
1646         def report_download_page(self, query, pagenum):
1647                 """Report attempt to download playlist page with given number."""
1648                 query = query.decode(preferredencoding())
1649                 self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1650
1651         def _real_initialize(self):
1652                 self._youtube_ie.initialize()
1653         
1654         def _real_extract(self, query):
1655                 mobj = re.match(self._VALID_QUERY, query)
1656                 if mobj is None:
1657                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1658                         return
1659
1660                 prefix, query = query.split(':')
1661                 prefix = prefix[8:]
1662                 query  = query.encode('utf-8')
1663                 if prefix == '':
1664                         self._download_n_results(query, 1)
1665                         return
1666                 elif prefix == 'all':
1667                         self._download_n_results(query, self._max_youtube_results)
1668                         return
1669                 else:
1670                         try:
1671                                 n = long(prefix)
1672                                 if n <= 0:
1673                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1674                                         return
1675                                 elif n > self._max_youtube_results:
1676                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
1677                                         n = self._max_youtube_results
1678                                 self._download_n_results(query, n)
1679                                 return
1680                         except ValueError: # parsing prefix as integer fails
1681                                 self._download_n_results(query, 1)
1682                                 return
1683
1684         def _download_n_results(self, query, n):
1685                 """Downloads a specified number of results for a query"""
1686
1687                 video_ids = []
1688                 already_seen = set()
1689                 pagenum = 1
1690
1691                 while True:
1692                         self.report_download_page(query, pagenum)
1693                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1694                         request = urllib2.Request(result_url, None, std_headers)
1695                         try:
1696                                 page = urllib2.urlopen(request).read()
1697                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1698                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1699                                 return
1700
1701                         # Extract video identifiers
1702                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1703                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1704                                 if video_id not in already_seen:
1705                                         video_ids.append(video_id)
1706                                         already_seen.add(video_id)
1707                                         if len(video_ids) == n:
1708                                                 # Specified n videos reached
1709                                                 for id in video_ids:
1710                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1711                                                 return
1712
1713                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1714                                 for id in video_ids:
1715                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1716                                 return
1717
1718                         pagenum = pagenum + 1
1719
1720 class GoogleSearchIE(InfoExtractor):
1721         """Information Extractor for Google Video search queries."""
1722         _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
1723         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1724         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
1725         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
1726         _google_ie = None
1727         _max_google_results = 1000
1728
1729         def __init__(self, google_ie, downloader=None):
1730                 InfoExtractor.__init__(self, downloader)
1731                 self._google_ie = google_ie
1732         
1733         @staticmethod
1734         def suitable(url):
1735                 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
1736
1737         def report_download_page(self, query, pagenum):
1738                 """Report attempt to download playlist page with given number."""
1739                 query = query.decode(preferredencoding())
1740                 self._downloader.to_stdout(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1741
1742         def _real_initialize(self):
1743                 self._google_ie.initialize()
1744         
1745         def _real_extract(self, query):
1746                 mobj = re.match(self._VALID_QUERY, query)
1747                 if mobj is None:
1748                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1749                         return
1750
1751                 prefix, query = query.split(':')
1752                 prefix = prefix[8:]
1753                 query  = query.encode('utf-8')
1754                 if prefix == '':
1755                         self._download_n_results(query, 1)
1756                         return
1757                 elif prefix == 'all':
1758                         self._download_n_results(query, self._max_google_results)
1759                         return
1760                 else:
1761                         try:
1762                                 n = long(prefix)
1763                                 if n <= 0:
1764                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1765                                         return
1766                                 elif n > self._max_google_results:
1767                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)'  % (self._max_google_results, n))
1768                                         n = self._max_google_results
1769                                 self._download_n_results(query, n)
1770                                 return
1771                         except ValueError: # parsing prefix as integer fails
1772                                 self._download_n_results(query, 1)
1773                                 return
1774
1775         def _download_n_results(self, query, n):
1776                 """Downloads a specified number of results for a query"""
1777
1778                 video_ids = []
1779                 already_seen = set()
1780                 pagenum = 1
1781
1782                 while True:
1783                         self.report_download_page(query, pagenum)
1784                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1785                         request = urllib2.Request(result_url, None, std_headers)
1786                         try:
1787                                 page = urllib2.urlopen(request).read()
1788                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1789                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1790                                 return
1791
1792                         # Extract video identifiers
1793                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1794                                 video_id = mobj.group(1)
1795                                 if video_id not in already_seen:
1796                                         video_ids.append(video_id)
1797                                         already_seen.add(video_id)
1798                                         if len(video_ids) == n:
1799                                                 # Specified n videos reached
1800                                                 for id in video_ids:
1801                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1802                                                 return
1803
1804                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1805                                 for id in video_ids:
1806                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1807                                 return
1808
1809                         pagenum = pagenum + 1
1810
class YahooSearchIE(InfoExtractor):
	"""Information Extractor for Yahoo! Video search queries."""
	# Queries look like "yvsearch:cats", "yvsearch5:cats" or "yvsearchall:cats".
	_VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
	# %s slots: quoted query text, page number.
	_TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
	_VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
	# Presence of a "Next" link means there are more result pages.
	_MORE_PAGES_INDICATOR = r'\s*Next'
	_yahoo_ie = None
	_max_yahoo_results = 1000

	def __init__(self, yahoo_ie, downloader=None):
		"""Keep a reference to the YahooIE that downloads each result."""
		InfoExtractor.__init__(self, downloader)
		self._yahoo_ie = yahoo_ie
	
	@staticmethod
	def suitable(url):
		"""Return True if this IE can handle the given URL/query string."""
		return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)

	def report_download_page(self, query, pagenum):
		"""Report attempt to download playlist page with given number."""
		# NOTE(review): query was encoded as UTF-8 in _real_extract; decoding
		# with the locale's preferred encoding presumably matches only on
		# UTF-8 locales -- verify.
		query = query.decode(preferredencoding())
		self._downloader.to_stdout(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

	def _real_initialize(self):
		"""Initialization delegated to the wrapped YahooIE."""
		self._yahoo_ie.initialize()
	
	def _real_extract(self, query):
		"""Parse the yvsearch prefix and download the requested results."""
		mobj = re.match(self._VALID_QUERY, query)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
			return

		# Split the "yvsearch<N>" prefix from the query text proper.
		prefix, query = query.split(':')
		prefix = prefix[8:]
		query  = query.encode('utf-8')
		if prefix == '':
			# Bare "yvsearch:" downloads the first result only.
			self._download_n_results(query, 1)
			return
		elif prefix == 'all':
			self._download_n_results(query, self._max_yahoo_results)
			return
		else:
			try:
				n = long(prefix)
				if n <= 0:
					self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
					return
				elif n > self._max_yahoo_results:
					# Clamp to the service maximum instead of failing.
					self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)'  % (self._max_yahoo_results, n))
					n = self._max_yahoo_results
				self._download_n_results(query, n)
				return
			except ValueError: # parsing prefix as integer fails
				self._download_n_results(query, 1)
				return

	def _download_n_results(self, query, n):
		"""Downloads a specified number of results for a query"""

		video_ids = []
		already_seen = set()
		pagenum = 1

		while True:
			self.report_download_page(query, pagenum)
			result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
			request = urllib2.Request(result_url, None, std_headers)
			try:
				page = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
				return

			# Extract video identifiers
			for mobj in re.finditer(self._VIDEO_INDICATOR, page):
				video_id = mobj.group(1)
				if video_id not in already_seen:
					video_ids.append(video_id)
					already_seen.add(video_id)
					if len(video_ids) == n:
						# Specified n videos reached
						for id in video_ids:
							self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
						return

			# No "Next" link: extract everything collected so far.
			if re.search(self._MORE_PAGES_INDICATOR, page) is None:
				for id in video_ids:
					self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
				return

			pagenum = pagenum + 1
1901
1902 class YoutubePlaylistIE(InfoExtractor):
1903         """Information Extractor for YouTube playlists."""
1904
1905         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/)([^&]+).*'
1906         _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
1907         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
1908         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1909         _youtube_ie = None
1910
1911         def __init__(self, youtube_ie, downloader=None):
1912                 InfoExtractor.__init__(self, downloader)
1913                 self._youtube_ie = youtube_ie
1914         
1915         @staticmethod
1916         def suitable(url):
1917                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
1918
1919         def report_download_page(self, playlist_id, pagenum):
1920                 """Report attempt to download playlist page with given number."""
1921                 self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1922
1923         def _real_initialize(self):
1924                 self._youtube_ie.initialize()
1925         
1926         def _real_extract(self, url):
1927                 # Extract playlist id
1928                 mobj = re.match(self._VALID_URL, url)
1929                 if mobj is None:
1930                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1931                         return
1932
1933                 # Download playlist pages
1934                 playlist_id = mobj.group(1)
1935                 video_ids = []
1936                 pagenum = 1
1937
1938                 while True:
1939                         self.report_download_page(playlist_id, pagenum)
1940                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
1941                         try:
1942                                 page = urllib2.urlopen(request).read()
1943                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1944                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1945                                 return
1946
1947                         # Extract video identifiers
1948                         ids_in_page = []
1949                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1950                                 if mobj.group(1) not in ids_in_page:
1951                                         ids_in_page.append(mobj.group(1))
1952                         video_ids.extend(ids_in_page)
1953
1954                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1955                                 break
1956                         pagenum = pagenum + 1
1957
1958                 playliststart = self._downloader.params.get('playliststart', 1)
1959                 playliststart -= 1 #our arrays are zero-based but the playlist is 1-based
1960                 if playliststart > 0:
1961                         video_ids = video_ids[playliststart:]
1962                         
1963                 for id in video_ids:
1964                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1965                 return
1966
1967 class YoutubeUserIE(InfoExtractor):
1968         """Information Extractor for YouTube users."""
1969
1970         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
1971         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1972         _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
1973         _youtube_ie = None
1974
1975         def __init__(self, youtube_ie, downloader=None):
1976                 InfoExtractor.__init__(self, downloader)
1977                 self._youtube_ie = youtube_ie
1978         
1979         @staticmethod
1980         def suitable(url):
1981                 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
1982
1983         def report_download_page(self, username):
1984                 """Report attempt to download user page."""
1985                 self._downloader.to_stdout(u'[youtube] user %s: Downloading page ' % (username))
1986
1987         def _real_initialize(self):
1988                 self._youtube_ie.initialize()
1989         
1990         def _real_extract(self, url):
1991                 # Extract username
1992                 mobj = re.match(self._VALID_URL, url)
1993                 if mobj is None:
1994                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1995                         return
1996
1997                 # Download user page
1998                 username = mobj.group(1)
1999                 video_ids = []
2000                 pagenum = 1
2001
2002                 self.report_download_page(username)
2003                 request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
2004                 try:
2005                         page = urllib2.urlopen(request).read()
2006                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2007                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2008                         return
2009
2010                 # Extract video identifiers
2011                 ids_in_page = []
2012
2013                 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2014                         if mobj.group(1) not in ids_in_page:
2015                                 ids_in_page.append(mobj.group(1))
2016                 video_ids.extend(ids_in_page)
2017
2018                 playliststart = self._downloader.params.get('playliststart', 1)
2019                 playliststart = playliststart-1 #our arrays are zero-based but the playlist is 1-based
2020                 if playliststart > 0:
2021                         video_ids = video_ids[playliststart:]   
2022
2023                 for id in video_ids:
2024                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2025                 return
2026
class PostProcessor(object):
	"""Base class for post-download processing steps.

	Instances are registered on a downloader through its
	add_post_processor() method. After every successful download the
	downloader invokes run() on each registered PostProcessor in turn,
	feeding each one the value returned by its predecessor (starting
	from the downloader's initial information dictionary).

	Returning None from run() aborts the rest of the chain; otherwise
	the chain continues until the last processor has run.

	Like InfoExtractor objects, PostProcessors take part in a "mutual
	registration" scheme with their downloader.
	"""

	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Attach the downloader this PP belongs to."""
		self._downloader = downloader

	def run(self, information):
		"""Process one downloaded file.

		"information" is a dictionary shaped like the ones composed by
		InfoExtractors, extended with an extra "filepath" key naming
		the file that was just downloaded.

		Return None to stop the postprocessing chain, or an information
		dictionary (possibly with some fields changed) to hand to the
		next processor in the chain. Implementations may also raise a
		PostProcessingError, which the calling downloader handles.
		"""
		return information
2072         
### MAIN PROGRAM ###
# Entry point: everything below runs only when the file is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
	try:
		# Modules needed only when running the main program
		import getpass
		import optparse
2079
2080                 # Function to update the program file with the latest version from bitbucket.org
2081                 def update_self(downloader, filename):
2082                         # Note: downloader only used for options
2083                         if not os.access (filename, os.W_OK):
2084                                 sys.exit('ERROR: no write permissions on %s' % filename)
2085
2086                         downloader.to_stdout('Updating to latest stable version...')
2087                         latest_url = 'http://bitbucket.org/rg3/youtube-dl/raw/tip/LATEST_VERSION'
2088                         latest_version = urllib.urlopen(latest_url).read().strip()
2089                         prog_url = 'http://bitbucket.org/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
2090                         newcontent = urllib.urlopen(prog_url).read()
2091                         stream = open(filename, 'w')
2092                         stream.write(newcontent)
2093                         stream.close()
2094                         downloader.to_stdout('Updated to version %s' % latest_version)
2095
		# General configuration
		# NOTE(review): each install_opener() call replaces the global opener;
		# build_opener() adds a ProxyHandler by default, so the second call
		# (cookie support) presumably keeps proxy handling too -- confirm.
		urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
		urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
		socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

		# Parse command line
		parser = optparse.OptionParser(
			usage='Usage: %prog [options] url...',
			version='2010.10.03',
			conflict_handler='resolve',
		)

		# -h/-v are re-registered here; conflict_handler='resolve' above lets
		# them override optparse's built-in definitions.
		parser.add_option('-h', '--help',
				action='help', help='print this help text and exit')
		parser.add_option('-v', '--version',
				action='version', help='print program version and exit')
		parser.add_option('-U', '--update',
				action='store_true', dest='update_self', help='update this program to latest stable version')
		parser.add_option('-i', '--ignore-errors',
				action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
		parser.add_option('-r', '--rate-limit',
				dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
		parser.add_option('-R', '--retries',
				dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
		parser.add_option('--playlist-start',
				dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)

		authentication = optparse.OptionGroup(parser, 'Authentication Options')
		authentication.add_option('-u', '--username',
				dest='username', metavar='USERNAME', help='account username')
		authentication.add_option('-p', '--password',
				dest='password', metavar='PASSWORD', help='account password')
		authentication.add_option('-n', '--netrc',
				action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
		parser.add_option_group(authentication)

		video_format = optparse.OptionGroup(parser, 'Video Format Options')
		video_format.add_option('-f', '--format',
				action='store', dest='format', metavar='FORMAT', help='video format code')
		video_format.add_option('-m', '--mobile-version',
				action='store_const', dest='format', help='alias for -f 17', const='17')
		video_format.add_option('--all-formats',
				action='store_const', dest='format', help='download all available video formats', const='-1')
		video_format.add_option('--max-quality',
				action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
		video_format.add_option('-b', '--best-quality',
				action='store_true', dest='bestquality', help='download the best video quality (DEPRECATED)')
		parser.add_option_group(video_format)

		verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
		verbosity.add_option('-q', '--quiet',
				action='store_true', dest='quiet', help='activates quiet mode', default=False)
		verbosity.add_option('-s', '--simulate',
				action='store_true', dest='simulate', help='do not download video', default=False)
		verbosity.add_option('-g', '--get-url',
				action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
		verbosity.add_option('-e', '--get-title',
				action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
		verbosity.add_option('--get-thumbnail',
				action='store_true', dest='getthumbnail', help='simulate, quiet but print thumbnail URL', default=False)
		verbosity.add_option('--get-description',
				action='store_true', dest='getdescription', help='simulate, quiet but print video description', default=False)
		verbosity.add_option('--no-progress',
				action='store_true', dest='noprogress', help='do not print progress bar', default=False)
		parser.add_option_group(verbosity)

		filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
		filesystem.add_option('-t', '--title',
				action='store_true', dest='usetitle', help='use title in file name', default=False)
		filesystem.add_option('-l', '--literal',
				action='store_true', dest='useliteral', help='use literal title in file name', default=False)
		filesystem.add_option('-o', '--output',
				dest='outtmpl', metavar='TEMPLATE', help='output filename template')
		filesystem.add_option('-a', '--batch-file',
				dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
		filesystem.add_option('-w', '--no-overwrites',
				action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
		filesystem.add_option('-c', '--continue',
				action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
		parser.add_option_group(filesystem)
2176
		(opts, args) = parser.parse_args()

		# Batch file verification
		batchurls = []
		if opts.batchfile is not None:
			try:
				if opts.batchfile == '-':
					# '-' means read the URL list from standard input.
					batchfd = sys.stdin
				else:
					batchfd = open(opts.batchfile, 'r')
				batchurls = batchfd.readlines()
				batchurls = [x.strip() for x in batchurls]
				# Drop blank lines.
				batchurls = [x for x in batchurls if len(x) > 0]
			except IOError:
				sys.exit(u'ERROR: batch file could not be read')
		all_urls = batchurls + args

		# Conflicting, missing and erroneous options
		if opts.bestquality:
			print >>sys.stderr, u'\nWARNING: -b/--best-quality IS DEPRECATED AS IT IS THE DEFAULT BEHAVIOR NOW\n'
		if opts.usenetrc and (opts.username is not None or opts.password is not None):
			parser.error(u'using .netrc conflicts with giving username/password')
		if opts.password is not None and opts.username is None:
			parser.error(u'account username missing')
		if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
			parser.error(u'using output template conflicts with using title or literal title')
		if opts.usetitle and opts.useliteral:
			parser.error(u'using title conflicts with using literal title')
		# Only a username was given: ask for the password interactively.
		if opts.username is not None and opts.password is None:
			opts.password = getpass.getpass(u'Type account password and press return:')
		if opts.ratelimit is not None:
			numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
			if numeric_limit is None:
				parser.error(u'invalid rate limit specified')
			opts.ratelimit = numeric_limit
		# Numeric options arrive as strings; convert and validate them here.
		if opts.retries is not None:
			try:
				opts.retries = long(opts.retries)
			except (TypeError, ValueError), err:
				parser.error(u'invalid retry count specified')
		if opts.playliststart is not None:
			try:
				opts.playliststart = long(opts.playliststart)
			except (TypeError, ValueError), err:
				parser.error(u'invalid playlist page specified')
2222
		# Information extractors
		youtube_ie = YoutubeIE()
		metacafe_ie = MetacafeIE(youtube_ie)
		dailymotion_ie = DailymotionIE()
		youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
		youtube_user_ie = YoutubeUserIE(youtube_ie)
		youtube_search_ie = YoutubeSearchIE(youtube_ie)
		google_ie = GoogleIE()
		google_search_ie = GoogleSearchIE(google_ie)
		photobucket_ie = PhotobucketIE()
		yahoo_ie = YahooIE()
		yahoo_search_ie = YahooSearchIE(yahoo_ie)
		generic_ie = GenericIE()

		# File downloader
		fd = FileDownloader({
			'usenetrc': opts.usenetrc,
			'username': opts.username,
			'password': opts.password,
			# Any "print X" flag implies quiet mode and simulation.
			'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
			'forceurl': opts.geturl,
			'forcetitle': opts.gettitle,
			'forcethumbnail': opts.getthumbnail,
			'forcedescription': opts.getdescription,
			'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
			'format': opts.format,
			'format_limit': opts.format_limit,
			# First truthy alternative wins: explicit -o template, then the
			# --all-formats variants (which embed the format code), then the
			# title-based names, then the bare video id.
			'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
				or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
				or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
				or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
				or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
				or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
				or u'%(id)s.%(ext)s'),
			'ignoreerrors': opts.ignoreerrors,
			'ratelimit': opts.ratelimit,
			'nooverwrites': opts.nooverwrites,
			'retries': opts.retries,
			'continuedl': opts.continue_dl,
			'noprogress': opts.noprogress,
			'playliststart': opts.playliststart,
			})
		# Registration order matters: more specific extractors come first.
		fd.add_info_extractor(youtube_search_ie)
		fd.add_info_extractor(youtube_pl_ie)
		fd.add_info_extractor(youtube_user_ie)
		fd.add_info_extractor(metacafe_ie)
		fd.add_info_extractor(dailymotion_ie)
		fd.add_info_extractor(youtube_ie)
		fd.add_info_extractor(google_ie)
		fd.add_info_extractor(google_search_ie)
		fd.add_info_extractor(photobucket_ie)
		fd.add_info_extractor(yahoo_ie)
		fd.add_info_extractor(yahoo_search_ie)

		# This must come last since it's the
		# fallback if none of the others work
		fd.add_info_extractor(generic_ie)

		# Update version
		if opts.update_self:
			update_self(fd, sys.argv[0])

		# Maybe do nothing
		if len(all_urls) < 1:
			if not opts.update_self:
				parser.error(u'you must provide at least one URL')
			else:
				sys.exit()
		# Download everything and exit with the downloader's return code.
		retcode = fd.download(all_urls)
		sys.exit(retcode)

	except DownloadError:
		sys.exit(1)
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')