Remove dead disclaimer/confirmation code.
[youtube-dl] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # License: Public domain code
7 import htmlentitydefs
8 import httplib
9 import locale
10 import math
11 import netrc
12 import os
13 import os.path
14 import re
15 import socket
16 import string
17 import subprocess
18 import sys
19 import time
20 import urllib
21 import urllib2
22
23 # parse_qs was moved from the cgi module to the urlparse module recently.
24 try:
25         from urlparse import parse_qs
26 except ImportError:
27         from cgi import parse_qs
28
# HTTP headers sent with every request; they imitate a contemporary
# Firefox browser so sites serve the same pages they would serve to it.
std_headers = {
	'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2) Gecko/20100115 Firefox/3.6',
	'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
	'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
	'Accept-Language': 'en-us,en;q=0.5',
}
35
36 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
37
def preferredencoding():
	"""Get preferred encoding.

	Returns the best encoding scheme for the system, based on
	locale.getpreferredencoding() and some further tweaks.
	"""
	try:
		pref = locale.getpreferredencoding()
		# Sanity-check the reported encoding: a misconfigured locale can
		# return a name the codecs machinery does not actually know.
		u'TEST'.encode(pref)
	except Exception:
		# Fall back to a safe default when the locale lookup or the
		# trial encode fails.
		pref = 'UTF-8'
	return pref
53
54 def htmlentity_transform(matchobj):
55         """Transforms an HTML entity to a Unicode character.
56         
57         This function receives a match object and is intended to be used with
58         the re.sub() function.
59         """
60         entity = matchobj.group(1)
61
62         # Known non-numeric HTML entity
63         if entity in htmlentitydefs.name2codepoint:
64                 return unichr(htmlentitydefs.name2codepoint[entity])
65
66         # Unicode character
67         mobj = re.match(ur'(?u)#(x?\d+)', entity)
68         if mobj is not None:
69                 numstr = mobj.group(1)
70                 if numstr.startswith(u'x'):
71                         base = 16
72                         numstr = u'0%s' % numstr
73                 else:
74                         base = 10
75                 return unichr(long(numstr, base))
76
77         # Unknown entity in name, return its literal representation
78         return (u'&%s;' % entity)
79
80 def sanitize_title(utitle):
81         """Sanitizes a video title so it could be used as part of a filename."""
82         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
83         return utitle.replace(unicode(os.sep), u'%')
84
def sanitize_open(filename, open_mode):
	"""Try to open the given filename, and slightly tweak it if this fails.

	Attempts to open the given filename. If this fails, it tries to change
	the filename slightly, step by step, until it's either able to open it
	or it fails and raises a final exception, like the standard open()
	function.

	It returns the tuple (stream, definitive_file_name).
	"""
	try:
		if filename == u'-':
			# Special name meaning standard output; nothing to open.
			return (sys.stdout, filename)
		stream = open(filename, open_mode)
		return (stream, filename)
	except (IOError, OSError):
		# In case of error, try to remove win32 forbidden chars.
		# (The escapes in the old ur'[/<>:"\|\?\*]' were redundant
		# inside a character class; this is the same set of chars.)
		filename = re.sub(u'[/<>:"|?*]', u'#', filename)

		# An exception here should be caught in the caller
		stream = open(filename, open_mode)
		return (stream, filename)
107
108
class DownloadError(Exception):
	"""Signals a fatal download problem.

	FileDownloader objects raise this when a download fails and they
	have not been configured to keep going on errors; it carries the
	corresponding error message.
	"""
	pass
117
class SameFileError(Exception):
	"""Signals a filename collision between downloads.

	FileDownloader objects raise this when several files would end up
	being written to the very same path on disk.
	"""
	pass
125
class PostProcessingError(Exception):
	"""Signals a failure while postprocessing.

	A PostProcessor's .run() method raises this to report an error in
	the postprocessing task.
	"""
	pass
133
class UnavailableFormatError(Exception):
	"""Signals a request for a format the video does not offer.

	Raised when the user asks for a video in a format that is not
	available for that particular video.
	"""
	pass
141
class ContentTooShortError(Exception):
	"""Signals that a download delivered less data than announced.

	FileDownloader objects raise this when the file they received is
	smaller than what the server originally advertised, which usually
	means the connection was interrupted.
	"""
	# Byte counts: what was actually received vs. what was announced.
	downloaded = None
	expected = None

	def __init__(self, downloaded, expected):
		self.downloaded = downloaded
		self.expected = expected
156
157 class FileDownloader(object):
158         """File Downloader class.
159
160         File downloader objects are the ones responsible of downloading the
161         actual video file and writing it to disk if the user has requested
162         it, among some other tasks. In most cases there should be one per
163         program. As, given a video URL, the downloader doesn't know how to
164         extract all the needed information, task that InfoExtractors do, it
165         has to pass the URL to one of them.
166
167         For this, file downloader objects have a method that allows
168         InfoExtractors to be registered in a given order. When it is passed
169         a URL, the file downloader handles it to the first InfoExtractor it
170         finds that reports being able to handle it. The InfoExtractor extracts
171         all the information about the video or videos the URL refers to, and
172         asks the FileDownloader to process the video information, possibly
173         downloading the video.
174
175         File downloaders accept a lot of parameters. In order not to saturate
176         the object constructor with arguments, it receives a dictionary of
177         options instead. These options are available through the params
178         attribute for the InfoExtractors to use. The FileDownloader also
179         registers itself as the downloader in charge for the InfoExtractors
180         that are added to it, so this is a "mutual registration".
181
182         Available options:
183
184         username:       Username for authentication purposes.
185         password:       Password for authentication purposes.
186         usenetrc:       Use netrc for authentication instead.
187         quiet:          Do not print messages to stdout.
188         forceurl:       Force printing final URL.
189         forcetitle:     Force printing title.
190         simulate:       Do not download the video files.
191         format:         Video format code.
192         outtmpl:        Template for output names.
193         ignoreerrors:   Do not stop on download errors.
194         ratelimit:      Download speed limit, in bytes/sec.
195         nooverwrites:   Prevent overwriting files.
196         retries:        Number of times to retry for HTTP error 503
197         continuedl:     Try to continue downloads if possible.
198         noprogress:     Do not print the progress bar.
199         """
200
201         params = None
202         _ies = []
203         _pps = []
204         _download_retcode = None
205         _num_downloads = None
206
207         def __init__(self, params):
208                 """Create a FileDownloader object with the given options."""
209                 self._ies = []
210                 self._pps = []
211                 self._download_retcode = 0
212                 self._num_downloads = 0
213                 self.params = params
214         
215         @staticmethod
216         def pmkdir(filename):
217                 """Create directory components in filename. Similar to Unix "mkdir -p"."""
218                 components = filename.split(os.sep)
219                 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
220                 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
221                 for dir in aggregate:
222                         if not os.path.exists(dir):
223                                 os.mkdir(dir)
224         
225         @staticmethod
226         def format_bytes(bytes):
227                 if bytes is None:
228                         return 'N/A'
229                 if type(bytes) is str:
230                         bytes = float(bytes)
231                 if bytes == 0.0:
232                         exponent = 0
233                 else:
234                         exponent = long(math.log(bytes, 1024.0))
235                 suffix = 'bkMGTPEZY'[exponent]
236                 converted = float(bytes) / float(1024**exponent)
237                 return '%.2f%s' % (converted, suffix)
238
239         @staticmethod
240         def calc_percent(byte_counter, data_len):
241                 if data_len is None:
242                         return '---.-%'
243                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
244
245         @staticmethod
246         def calc_eta(start, now, total, current):
247                 if total is None:
248                         return '--:--'
249                 dif = now - start
250                 if current == 0 or dif < 0.001: # One millisecond
251                         return '--:--'
252                 rate = float(current) / dif
253                 eta = long((float(total) - float(current)) / rate)
254                 (eta_mins, eta_secs) = divmod(eta, 60)
255                 if eta_mins > 99:
256                         return '--:--'
257                 return '%02d:%02d' % (eta_mins, eta_secs)
258
259         @staticmethod
260         def calc_speed(start, now, bytes):
261                 dif = now - start
262                 if bytes == 0 or dif < 0.001: # One millisecond
263                         return '%10s' % '---b/s'
264                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
265
266         @staticmethod
267         def best_block_size(elapsed_time, bytes):
268                 new_min = max(bytes / 2.0, 1.0)
269                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
270                 if elapsed_time < 0.001:
271                         return long(new_max)
272                 rate = bytes / elapsed_time
273                 if rate > new_max:
274                         return long(new_max)
275                 if rate < new_min:
276                         return long(new_min)
277                 return long(rate)
278
279         @staticmethod
280         def parse_bytes(bytestr):
281                 """Parse a string indicating a byte quantity into a long integer."""
282                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
283                 if matchobj is None:
284                         return None
285                 number = float(matchobj.group(1))
286                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
287                 return long(round(number * multiplier))
288
289         @staticmethod
290         def verify_url(url):
291                 """Verify a URL is valid and data could be downloaded. Return real data URL."""
292                 request = urllib2.Request(url, None, std_headers)
293                 data = urllib2.urlopen(request)
294                 data.read(1)
295                 url = data.geturl()
296                 data.close()
297                 return url
298
299         def add_info_extractor(self, ie):
300                 """Add an InfoExtractor object to the end of the list."""
301                 self._ies.append(ie)
302                 ie.set_downloader(self)
303         
304         def add_post_processor(self, pp):
305                 """Add a PostProcessor object to the end of the chain."""
306                 self._pps.append(pp)
307                 pp.set_downloader(self)
308         
309         def to_stdout(self, message, skip_eol=False, ignore_encoding_errors=False):
310                 """Print message to stdout if not in quiet mode."""
311                 try:
312                         if not self.params.get('quiet', False):
313                                 print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(preferredencoding()),
314                         sys.stdout.flush()
315                 except (UnicodeEncodeError), err:
316                         if not ignore_encoding_errors:
317                                 raise
318         
319         def to_stderr(self, message):
320                 """Print message to stderr."""
321                 print >>sys.stderr, message.encode(preferredencoding())
322         
323         def fixed_template(self):
324                 """Checks if the output template is fixed."""
325                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
326
327         def trouble(self, message=None):
328                 """Determine action to take when a download problem appears.
329
330                 Depending on if the downloader has been configured to ignore
331                 download errors or not, this method may throw an exception or
332                 not when errors are found, after printing the message.
333                 """
334                 if message is not None:
335                         self.to_stderr(message)
336                 if not self.params.get('ignoreerrors', False):
337                         raise DownloadError(message)
338                 self._download_retcode = 1
339
340         def slow_down(self, start_time, byte_counter):
341                 """Sleep if the download speed is over the rate limit."""
342                 rate_limit = self.params.get('ratelimit', None)
343                 if rate_limit is None or byte_counter == 0:
344                         return
345                 now = time.time()
346                 elapsed = now - start_time
347                 if elapsed <= 0.0:
348                         return
349                 speed = float(byte_counter) / elapsed
350                 if speed > rate_limit:
351                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
352
353         def report_destination(self, filename):
354                 """Report destination filename."""
355                 self.to_stdout(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
356         
357         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
358                 """Report download progress."""
359                 if self.params.get('noprogress', False):
360                         return
361                 self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
362                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
363
364         def report_resuming_byte(self, resume_len):
365                 """Report attemtp to resume at given byte."""
366                 self.to_stdout(u'[download] Resuming download at byte %s' % resume_len)
367         
368         def report_retry(self, count, retries):
369                 """Report retry in case of HTTP error 503"""
370                 self.to_stdout(u'[download] Got HTTP error 503. Retrying (attempt %d of %d)...' % (count, retries))
371         
372         def report_file_already_downloaded(self, file_name):
373                 """Report file has already been fully downloaded."""
374                 try:
375                         self.to_stdout(u'[download] %s has already been downloaded' % file_name)
376                 except (UnicodeEncodeError), err:
377                         self.to_stdout(u'[download] The file has already been downloaded')
378         
379         def report_unable_to_resume(self):
380                 """Report it was impossible to resume download."""
381                 self.to_stdout(u'[download] Unable to resume')
382         
383         def report_finish(self):
384                 """Report download finished."""
385                 if self.params.get('noprogress', False):
386                         self.to_stdout(u'[download] Download completed')
387                 else:
388                         self.to_stdout(u'')
389
390         def process_info(self, info_dict):
391                 """Process a single dictionary returned by an InfoExtractor."""
392                 # Do nothing else if in simulate mode
393                 if self.params.get('simulate', False):
394                         # Verify URL if it's an HTTP one
395                         if info_dict['url'].startswith('http'):
396                                 try:
397                                         self.verify_url(info_dict['url'].encode('utf-8')).decode('utf-8')
398                                 except (OSError, IOError, urllib2.URLError, httplib.HTTPException, socket.error), err:
399                                         raise UnavailableFormatError
400
401                         # Forced printings
402                         if self.params.get('forcetitle', False):
403                                 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
404                         if self.params.get('forceurl', False):
405                                 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
406                         if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
407                                 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
408                         if self.params.get('forcedescription', False) and 'description' in info_dict:
409                                 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
410
411                         return
412                         
413                 try:
414                         template_dict = dict(info_dict)
415                         template_dict['epoch'] = unicode(long(time.time()))
416                         template_dict['ord'] = unicode('%05d' % self._num_downloads)
417                         filename = self.params['outtmpl'] % template_dict
418                 except (ValueError, KeyError), err:
419                         self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
420                 if self.params.get('nooverwrites', False) and os.path.exists(filename):
421                         self.to_stderr(u'WARNING: file exists: %s; skipping' % filename)
422                         return
423
424                 try:
425                         self.pmkdir(filename)
426                 except (OSError, IOError), err:
427                         self.trouble('ERROR: unable to create directories: %s' % str(err))
428                         return
429
430                 try:
431                         success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
432                 except (OSError, IOError), err:
433                         raise UnavailableFormatError
434                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
435                         self.trouble('ERROR: unable to download video data: %s' % str(err))
436                         return
437                 except (ContentTooShortError, ), err:
438                         self.trouble('ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
439                         return
440
441                 if success:
442                         try:
443                                 self.post_process(filename, info_dict)
444                         except (PostProcessingError), err:
445                                 self.trouble('ERROR: postprocessing: %s' % str(err))
446                                 return
447
448         def download(self, url_list):
449                 """Download a given list of URLs."""
450                 if len(url_list) > 1 and self.fixed_template():
451                         raise SameFileError(self.params['outtmpl'])
452
453                 for url in url_list:
454                         suitable_found = False
455                         for ie in self._ies:
456                                 # Go to next InfoExtractor if not suitable
457                                 if not ie.suitable(url):
458                                         continue
459
460                                 # Suitable InfoExtractor found
461                                 suitable_found = True
462
463                                 # Extract information from URL and process it
464                                 ie.extract(url)
465
466                                 # Suitable InfoExtractor had been found; go to next URL
467                                 break
468
469                         if not suitable_found:
470                                 self.trouble('ERROR: no suitable InfoExtractor: %s' % url)
471
472                 return self._download_retcode
473
474         def post_process(self, filename, ie_info):
475                 """Run the postprocessing chain on the given file."""
476                 info = dict(ie_info)
477                 info['filepath'] = filename
478                 for pp in self._pps:
479                         info = pp.run(info)
480                         if info is None:
481                                 break
482         
483         def _download_with_rtmpdump(self, filename, url, player_url):
484                 self.report_destination(filename)
485
486                 # Check for rtmpdump first
487                 try:
488                         subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
489                 except (OSError, IOError):
490                         self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
491                         return False
492
493                 # Download using rtmpdump. rtmpdump returns exit code 2 when
494                 # the connection was interrumpted and resuming appears to be
495                 # possible. This is part of rtmpdump's normal usage, AFAIK.
496                 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', filename]
497                 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
498                 while retval == 2 or retval == 1:
499                         prevsize = os.path.getsize(filename)
500                         self.to_stdout(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
501                         time.sleep(5.0) # This seems to be needed
502                         retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
503                         cursize = os.path.getsize(filename)
504                         if prevsize == cursize and retval == 1:
505                                 break
506                 if retval == 0:
507                         self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename))
508                         return True
509                 else:
510                         self.trouble('\nERROR: rtmpdump exited with code %d' % retval)
511                         return False
512
513         def _do_download(self, filename, url, player_url):
514                 # Attempt to download using rtmpdump
515                 if url.startswith('rtmp'):
516                         return self._download_with_rtmpdump(filename, url, player_url)
517
518                 stream = None
519                 open_mode = 'wb'
520                 basic_request = urllib2.Request(url, None, std_headers)
521                 request = urllib2.Request(url, None, std_headers)
522
523                 # Establish possible resume length
524                 if os.path.isfile(filename):
525                         resume_len = os.path.getsize(filename)
526                 else:
527                         resume_len = 0
528
529                 # Request parameters in case of being able to resume
530                 if self.params.get('continuedl', False) and resume_len != 0:
531                         self.report_resuming_byte(resume_len)
532                         request.add_header('Range','bytes=%d-' % resume_len)
533                         open_mode = 'ab'
534
535                 count = 0
536                 retries = self.params.get('retries', 0)
537                 while True:
538                         # Establish connection
539                         try:
540                                 data = urllib2.urlopen(request)
541                                 break
542                         except (urllib2.HTTPError, ), err:
543                                 if err.code == 503:
544                                         # Retry in case of HTTP error 503
545                                         count += 1
546                                         if count <= retries:
547                                                 self.report_retry(count, retries)
548                                                 continue
549                                 if err.code != 416: #  416 is 'Requested range not satisfiable'
550                                         raise
551                                 # Unable to resume
552                                 data = urllib2.urlopen(basic_request)
553                                 content_length = data.info()['Content-Length']
554
555                                 if content_length is not None and long(content_length) == resume_len:
556                                         # Because the file had already been fully downloaded
557                                         self.report_file_already_downloaded(filename)
558                                         return True
559                                 else:
560                                         # Because the server didn't let us
561                                         self.report_unable_to_resume()
562                                         open_mode = 'wb'
563
564                 data_len = data.info().get('Content-length', None)
565                 data_len_str = self.format_bytes(data_len)
566                 byte_counter = 0
567                 block_size = 1024
568                 start = time.time()
569                 while True:
570                         # Download and write
571                         before = time.time()
572                         data_block = data.read(block_size)
573                         after = time.time()
574                         data_block_len = len(data_block)
575                         if data_block_len == 0:
576                                 break
577                         byte_counter += data_block_len
578
579                         # Open file just in time
580                         if stream is None:
581                                 try:
582                                         (stream, filename) = sanitize_open(filename, open_mode)
583                                         self.report_destination(filename)
584                                         self._num_downloads += 1
585                                 except (OSError, IOError), err:
586                                         self.trouble('ERROR: unable to open for writing: %s' % str(err))
587                                         return False
588                         try:
589                                 stream.write(data_block)
590                         except (IOError, OSError), err:
591                                 self.trouble('\nERROR: unable to write data: %s' % str(err))
592                         block_size = self.best_block_size(after - before, data_block_len)
593
594                         # Progress message
595                         percent_str = self.calc_percent(byte_counter, data_len)
596                         eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
597                         speed_str = self.calc_speed(start, time.time(), byte_counter)
598                         self.report_progress(percent_str, data_len_str, speed_str, eta_str)
599
600                         # Apply rate limit
601                         self.slow_down(start, byte_counter)
602
603                 self.report_finish()
604                 if data_len is not None and str(byte_counter) != data_len:
605                         raise ContentTooShortError(byte_counter, long(data_len))
606                 return True
607
class InfoExtractor(object):
	"""Information Extractor class.

	An information extractor takes a URL and pulls out everything needed
	to download the video (or videos) it points at: the real media URL,
	the title, a simplified title, the uploader and so on. The collected
	data is handed to the FileDownloader as a dictionary, which may then
	download the video to the file system, among other possible outcomes.
	Every dictionary must carry these fields:

	id:             Video identifier.
	url:            Final video URL.
	uploader:       Nickname of the video uploader.
	title:          Literal title.
	stitle:         Simplified title.
	ext:            Video filename extension.
	format:         Video format.
	player_url:     SWF Player URL (may be None).

	The fields below are optional. They mainly exist so youtube-dl can
	act as the backend of a video search feature (for example the one in
	youtube2mp3) and are consulted only by the corresponding forced
	printing options:

	thumbnail:      Full URL to a video thumbnail image.
	description:    One-line video description.

	Subclasses should override _real_initialize() and _real_extract(),
	as well as the suitable() static method, and will usually be
	instantiated and registered with the main downloader.
	"""

	_ready = False
	_downloader = None

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		self._ready = False
		self.set_downloader(downloader)

	@staticmethod
	def suitable(url):
		"""Receives a URL and returns True if suitable for this IE."""
		return False

	def initialize(self):
		"""Initializes an instance (authentication, etc)."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Extracts URL information and returns it in list of dicts."""
		self.initialize()
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Sets the downloader for this IE."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass
678
class YoutubeIE(InfoExtractor):
	"""Information extractor for youtube.com."""

	# Group 1 captures the URL prefix (scheme/host/path up to the ID);
	# group 2 is the video ID itself.  The conditional "(?(1).+)?" only
	# permits trailing junk after the ID when a full URL was given.
	_VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?[\?#](?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
	# Visiting this URL pins the site language to English so the regexps
	# and error-message handling below behave predictably.
	_LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
	_LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
	_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
	# Machine name looked up in ~/.netrc when --netrc is in effect.
	_NETRC_MACHINE = 'youtube'
	# Listed in order of priority for the -b option
	_available_formats = ['37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13', None]
	# Maps "fmt" codes to file extensions; codes not listed default to flv.
	_video_extensions = {
		'13': '3gp',
		'17': 'mp4',
		'18': 'mp4',
		'22': 'mp4',
		'37': 'mp4',
		'43': 'webm',
		'45': 'webm',
	}

	@staticmethod
	def suitable(url):
		"""Receives a URL and returns True if suitable for this IE."""
		return (re.match(YoutubeIE._VALID_URL, url) is not None)

	def report_lang(self):
		"""Report attempt to set language."""
		self._downloader.to_stdout(u'[youtube] Setting language')

	def report_login(self):
		"""Report attempt to log in."""
		self._downloader.to_stdout(u'[youtube] Logging in')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_stdout(u'[youtube] Confirming age')

	def report_video_webpage_download(self, video_id):
		"""Report attempt to download video webpage."""
		self._downloader.to_stdout(u'[youtube] %s: Downloading video webpage' % video_id)

	def report_video_info_webpage_download(self, video_id):
		"""Report attempt to download video info webpage."""
		self._downloader.to_stdout(u'[youtube] %s: Downloading video info webpage' % video_id)

	def report_information_extraction(self, video_id):
		"""Report attempt to extract video information."""
		self._downloader.to_stdout(u'[youtube] %s: Extracting video information' % video_id)

	def report_unavailable_format(self, video_id, format):
		"""Report extracted video URL."""
		self._downloader.to_stdout(u'[youtube] %s: Format %s not available' % (video_id, format))

	def report_rtmp_download(self):
		"""Indicate the download will use the RTMP protocol."""
		self._downloader.to_stdout(u'[youtube] RTMP download detected')

	def _real_initialize(self):
		"""Set site language, then optionally log in and confirm age.

		All network failures here are reported as warnings and abort
		the initialization early; extraction may still proceed.
		"""
		if self._downloader is None:
			return

		username = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					username = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				# .netrc trouble is not fatal: warn and stay unauthenticated.
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		# Set language
		request = urllib2.Request(self._LANG_URL, None, std_headers)
		try:
			self.report_lang()
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
			return

		# No authentication to be performed
		if username is None:
			return

		# Log in
		login_form = {
				'current_form': 'loginForm',
				'next':		'/',
				'action_login':	'Log In',
				'username':	username,
				'password':	password,
				}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# If the login form is still present, authentication failed.
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return

		# Confirm age
		age_form = {
				'next_url':		'/',
				'action_confirm':	'Confirm',
				}
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
		try:
			self.report_age_confirmation()
			age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return

	def _real_extract(self, url):
		"""Extract video information, retrying other formats on failure."""
		# Extract video id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = mobj.group(2)

		# Downloader parameters
		best_quality = False
		all_formats = False
		format_param = None
		quality_index = 0
		if self._downloader is not None:
			params = self._downloader.params
			format_param = params.get('format', None)
			# '0' means best quality (-b); '-1' means every format.
			if format_param == '0':
				format_param = self._available_formats[quality_index]
				best_quality = True
			elif format_param == '-1':
				format_param = self._available_formats[quality_index]
				all_formats = True

		# Loop so a failed format can fall back to the next entry of
		# _available_formats (-b), or so every format is fetched in
		# turn (--all-formats).  quality_index tracks the position.
		while True:
			# Extension
			video_extension = self._video_extensions.get(format_param, 'flv')

			# Get video webpage
			self.report_video_webpage_download(video_id)
			request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en' % video_id, None, std_headers)
			try:
				video_webpage = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
				return

			# Attempt to extract SWF player URL
			mobj = re.search(r'swfConfig.*"(http://.*?watch-.*?\.swf)"', video_webpage)
			if mobj is not None:
				player_url = mobj.group(1)
			else:
				player_url = None

			# Get video info
			self.report_video_info_webpage_download(video_id)
			# Try several "el" values; some videos only answer to one.
			for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
				video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
						   % (video_id, el_type))
				request = urllib2.Request(video_info_url, None, std_headers)
				try:
					video_info_webpage = urllib2.urlopen(request).read()
					video_info = parse_qs(video_info_webpage)
					if 'token' in video_info:
						break
				except (urllib2.URLError, httplib.HTTPException, socket.error), err:
					self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
					return
			self.report_information_extraction(video_id)

			# "t" param
			if 'token' not in video_info:
				# Attempt to see if YouTube has issued an error message
				if 'reason' not in video_info:
					self._downloader.trouble(u'ERROR: unable to extract "t" parameter for unknown reason')
					# Dump the raw response so users can attach it to reports.
					stream = open('reportme-ydl-%s.dat' % time.time(), 'wb')
					stream.write(video_info_webpage)
					stream.close()
				else:
					reason = urllib.unquote_plus(video_info['reason'][0])
					self._downloader.trouble(u'ERROR: YouTube said: %s' % reason.decode('utf-8'))
				return
			token = urllib.unquote_plus(video_info['token'][0])
			video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=detailpage&ps=default&gl=US&hl=en' % (video_id, token)
			if format_param is not None:
				video_real_url = '%s&fmt=%s' % (video_real_url, format_param)

			# Check possible RTMP download
			if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
				self.report_rtmp_download()
				video_real_url = video_info['conn'][0]

			# uploader
			if 'author' not in video_info:
				self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
				return
			video_uploader = urllib.unquote_plus(video_info['author'][0])

			# title
			if 'title' not in video_info:
				self._downloader.trouble(u'ERROR: unable to extract video title')
				return
			video_title = urllib.unquote_plus(video_info['title'][0])
			video_title = video_title.decode('utf-8')
			video_title = sanitize_title(video_title)

			# simplified title
			simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
			simple_title = simple_title.strip(ur'_')

			# thumbnail image
			if 'thumbnail_url' not in video_info:
				self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
				video_thumbnail = ''
			else:	# don't panic if we can't find it
				video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

			# description
			video_description = 'No description available.'
			if self._downloader.params.get('forcedescription', False):
				mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
				if mobj is not None:
					video_description = mobj.group(1)

			try:
				# Process video information
				self._downloader.process_info({
					'id':		video_id.decode('utf-8'),
					'url':		video_real_url.decode('utf-8'),
					'uploader':	video_uploader.decode('utf-8'),
					'title':	video_title,
					'stitle':	simple_title,
					'ext':		video_extension.decode('utf-8'),
					'format':	(format_param is None and u'NA' or format_param.decode('utf-8')),
					'thumbnail':	video_thumbnail.decode('utf-8'),
					'description':	video_description.decode('utf-8'),
					'player_url':	player_url,
				})

				if all_formats:
					quality_index += 1
					if quality_index == len(self._available_formats):
						# None left to get
						return
					else:
						format_param = self._available_formats[quality_index]
						continue
				return

			except UnavailableFormatError, err:
				if best_quality or all_formats:
					quality_index += 1
					if quality_index == len(self._available_formats):
						# I don't ever expect this to happen
						if not all_formats:
							self._downloader.trouble(u'ERROR: no known formats available for video')
						return
					else:
						self.report_unavailable_format(video_id, format_param)
						format_param = self._available_formats[quality_index]
						continue
				else:
					self._downloader.trouble('ERROR: format not available for video')
					return
957
958
959 class MetacafeIE(InfoExtractor):
960         """Information Extractor for metacafe.com."""
961
962         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
963         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
964         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
965         _youtube_ie = None
966
967         def __init__(self, youtube_ie, downloader=None):
968                 InfoExtractor.__init__(self, downloader)
969                 self._youtube_ie = youtube_ie
970
971         @staticmethod
972         def suitable(url):
973                 return (re.match(MetacafeIE._VALID_URL, url) is not None)
974
975         def report_disclaimer(self):
976                 """Report disclaimer retrieval."""
977                 self._downloader.to_stdout(u'[metacafe] Retrieving disclaimer')
978
979         def report_age_confirmation(self):
980                 """Report attempt to confirm age."""
981                 self._downloader.to_stdout(u'[metacafe] Confirming age')
982         
983         def report_download_webpage(self, video_id):
984                 """Report webpage download."""
985                 self._downloader.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)
986         
987         def report_extraction(self, video_id):
988                 """Report information extraction."""
989                 self._downloader.to_stdout(u'[metacafe] %s: Extracting information' % video_id)
990
991         def _real_initialize(self):
992                 # Retrieve disclaimer
993                 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
994                 try:
995                         self.report_disclaimer()
996                         disclaimer = urllib2.urlopen(request).read()
997                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
998                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
999                         return
1000
1001                 # Confirm age
1002                 disclaimer_form = {
1003                         'filters': '0',
1004                         'submit': "Continue - I'm over 18",
1005                         }
1006                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
1007                 try:
1008                         self.report_age_confirmation()
1009                         disclaimer = urllib2.urlopen(request).read()
1010                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1011                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1012                         return
1013         
1014         def _real_extract(self, url):
1015                 # Extract id and simplified title from URL
1016                 mobj = re.match(self._VALID_URL, url)
1017                 if mobj is None:
1018                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1019                         return
1020
1021                 video_id = mobj.group(1)
1022
1023                 # Check if video comes from YouTube
1024                 mobj2 = re.match(r'^yt-(.*)$', video_id)
1025                 if mobj2 is not None:
1026                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1027                         return
1028
1029                 simple_title = mobj.group(2).decode('utf-8')
1030                 video_extension = 'flv'
1031
1032                 # Retrieve video webpage to extract further information
1033                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1034                 try:
1035                         self.report_download_webpage(video_id)
1036                         webpage = urllib2.urlopen(request).read()
1037                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1038                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1039                         return
1040
1041                 # Extract URL, uploader and title from webpage
1042                 self.report_extraction(video_id)
1043                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1044                 if mobj is None:
1045                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1046                         return
1047                 mediaURL = urllib.unquote(mobj.group(1))
1048
1049                 #mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1050                 #if mobj is None:
1051                 #       self._downloader.trouble(u'ERROR: unable to extract gdaKey')
1052                 #       return
1053                 #gdaKey = mobj.group(1)
1054                 #
1055                 #video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1056
1057                 video_url = mediaURL
1058
1059                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1060                 if mobj is None:
1061                         self._downloader.trouble(u'ERROR: unable to extract title')
1062                         return
1063                 video_title = mobj.group(1).decode('utf-8')
1064                 video_title = sanitize_title(video_title)
1065
1066                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1067                 if mobj is None:
1068                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1069                         return
1070                 video_uploader = mobj.group(1)
1071
1072                 try:
1073                         # Process video information
1074                         self._downloader.process_info({
1075                                 'id':           video_id.decode('utf-8'),
1076                                 'url':          video_url.decode('utf-8'),
1077                                 'uploader':     video_uploader.decode('utf-8'),
1078                                 'title':        video_title,
1079                                 'stitle':       simple_title,
1080                                 'ext':          video_extension.decode('utf-8'),
1081                                 'format':       u'NA',
1082                                 'player_url':   None,
1083                         })
1084                 except UnavailableFormatError:
1085                         self._downloader.trouble(u'ERROR: format not available for video')
1086
1087
1088 class DailymotionIE(InfoExtractor):
1089         """Information Extractor for Dailymotion"""
1090
1091         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1092
1093         def __init__(self, downloader=None):
1094                 InfoExtractor.__init__(self, downloader)
1095
1096         @staticmethod
1097         def suitable(url):
1098                 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1099
1100         def report_disclaimer(self):
1101                 """Report disclaimer retrieval."""
1102                 self._downloader.to_stdout(u'[dailymotion] Retrieving disclaimer')
1103
1104         def report_age_confirmation(self):
1105                 """Report attempt to confirm age."""
1106                 self._downloader.to_stdout(u'[dailymotion] Confirming age')
1107         
1108         def report_download_webpage(self, video_id):
1109                 """Report webpage download."""
1110                 self._downloader.to_stdout(u'[dailymotion] %s: Downloading webpage' % video_id)
1111         
1112         def report_extraction(self, video_id):
1113                 """Report information extraction."""
1114                 self._downloader.to_stdout(u'[dailymotion] %s: Extracting information' % video_id)
1115
1116         def _real_initialize(self):
1117                 return
1118
1119         def _real_extract(self, url):
1120                 # Extract id and simplified title from URL
1121                 mobj = re.match(self._VALID_URL, url)
1122                 if mobj is None:
1123                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1124                         return
1125
1126                 video_id = mobj.group(1)
1127
1128                 simple_title = mobj.group(2).decode('utf-8')
1129                 video_extension = 'flv'
1130
1131                 # Retrieve video webpage to extract further information
1132                 request = urllib2.Request(url)
1133                 try:
1134                         self.report_download_webpage(video_id)
1135                         webpage = urllib2.urlopen(request).read()
1136                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1137                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1138                         return
1139
1140                 # Extract URL, uploader and title from webpage
1141                 self.report_extraction(video_id)
1142                 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1143                 if mobj is None:
1144                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1145                         return
1146                 mediaURL = urllib.unquote(mobj.group(1))
1147
1148                 # if needed add http://www.dailymotion.com/ if relative URL
1149
1150                 video_url = mediaURL
1151
1152                 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1153                 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1154                 if mobj is None:
1155                         self._downloader.trouble(u'ERROR: unable to extract title')
1156                         return
1157                 video_title = mobj.group(1).decode('utf-8')
1158                 video_title = sanitize_title(video_title)
1159
1160                 mobj = re.search(r'(?im)<div class="dmco_html owner"><a class="name" href="/.+">(.+?)</a></div>', webpage)
1161                 if mobj is None:
1162                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1163                         return
1164                 video_uploader = mobj.group(1)
1165
1166                 try:
1167                         # Process video information
1168                         self._downloader.process_info({
1169                                 'id':           video_id.decode('utf-8'),
1170                                 'url':          video_url.decode('utf-8'),
1171                                 'uploader':     video_uploader.decode('utf-8'),
1172                                 'title':        video_title,
1173                                 'stitle':       simple_title,
1174                                 'ext':          video_extension.decode('utf-8'),
1175                                 'format':       u'NA',
1176                                 'player_url':   None,
1177                         })
1178                 except UnavailableFormatError:
1179                         self._downloader.trouble(u'ERROR: format not available for video')
1180
1181 class GoogleIE(InfoExtractor):
1182         """Information extractor for video.google.com."""
1183
1184         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1185
	def __init__(self, downloader=None):
		# NOTE(review): redundant override -- it only delegates to the
		# base-class constructor; kept for symmetry with the other IEs.
		InfoExtractor.__init__(self, downloader)
1188
1189         @staticmethod
1190         def suitable(url):
1191                 return (re.match(GoogleIE._VALID_URL, url) is not None)
1192
1193         def report_download_webpage(self, video_id):
1194                 """Report webpage download."""
1195                 self._downloader.to_stdout(u'[video.google] %s: Downloading webpage' % video_id)
1196
1197         def report_extraction(self, video_id):
1198                 """Report information extraction."""
1199                 self._downloader.to_stdout(u'[video.google] %s: Extracting information' % video_id)
1200
1201         def _real_initialize(self):
1202                 return
1203
	def _real_extract(self, url):
		"""Extract the media URL and metadata for a Google Video page.

		Downloads the video page for *url*, pulls out the media URL,
		title and description, and hands the collected information to
		the downloader via process_info().  On any failure the error is
		reported through self._downloader.trouble() and the method
		returns None.
		"""
		# Extract id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
			return

		video_id = mobj.group(1)

		# Assume an MP4 download link; downgraded to 'flv' below if none exists.
		video_extension = 'mp4'

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader, and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r"download_url:'([^']+)'", webpage)
		if mobj is None:
			# No direct MP4 download link; fall back to the Flash video URL.
			video_extension = 'flv'
			mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		mediaURL = urllib.unquote(mobj.group(1))
		# Unescape the literal '\x3d'/'\x26' sequences the page embeds
		# in the URL ('=' and '&' respectively).
		mediaURL = mediaURL.replace('\\x3d', '\x3d')
		mediaURL = mediaURL.replace('\\x26', '\x26')

		video_url = mediaURL

		mobj = re.search(r'<title>(.*)</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)
		# Filesystem-safe title: collapse every run of non-alphanumerics to '_'.
		simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

		# Extract video description
		# NOTE(review): video_description is extracted (and validated) but is
		# not included in the process_info() dict below.
		mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract video description')
			return
		video_description = mobj.group(1).decode('utf-8')
		if not video_description:
			video_description = 'No description available.'

		# Extract video thumbnail
		# NOTE(review): like the description, the thumbnail is not passed to
		# process_info() below.
		if self._downloader.params.get('forcethumbnail', False):
			request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
			try:
				webpage = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
				return
			mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
				return
			video_thumbnail = mobj.group(1)
		else:	# we need something to pass to process_info
			video_thumbnail = ''


		try:
			# Process video information
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	u'NA',
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	u'NA',
				'player_url':	None,
			})
		except UnavailableFormatError:
			self._downloader.trouble(u'ERROR: format not available for video')
1287
1288
1289 class PhotobucketIE(InfoExtractor):
1290         """Information extractor for photobucket.com."""
1291
1292         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1293
1294         def __init__(self, downloader=None):
1295                 InfoExtractor.__init__(self, downloader)
1296
1297         @staticmethod
1298         def suitable(url):
1299                 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1300
1301         def report_download_webpage(self, video_id):
1302                 """Report webpage download."""
1303                 self._downloader.to_stdout(u'[photobucket] %s: Downloading webpage' % video_id)
1304
1305         def report_extraction(self, video_id):
1306                 """Report information extraction."""
1307                 self._downloader.to_stdout(u'[photobucket] %s: Extracting information' % video_id)
1308
1309         def _real_initialize(self):
1310                 return
1311
1312         def _real_extract(self, url):
1313                 # Extract id from URL
1314                 mobj = re.match(self._VALID_URL, url)
1315                 if mobj is None:
1316                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1317                         return
1318
1319                 video_id = mobj.group(1)
1320
1321                 video_extension = 'flv'
1322
1323                 # Retrieve video webpage to extract further information
1324                 request = urllib2.Request(url)
1325                 try:
1326                         self.report_download_webpage(video_id)
1327                         webpage = urllib2.urlopen(request).read()
1328                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1329                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1330                         return
1331
1332                 # Extract URL, uploader, and title from webpage
1333                 self.report_extraction(video_id)
1334                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1335                 if mobj is None:
1336                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1337                         return
1338                 mediaURL = urllib.unquote(mobj.group(1))
1339
1340                 video_url = mediaURL
1341
1342                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1343                 if mobj is None:
1344                         self._downloader.trouble(u'ERROR: unable to extract title')
1345                         return
1346                 video_title = mobj.group(1).decode('utf-8')
1347                 video_title = sanitize_title(video_title)
1348                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1349
1350                 video_uploader = mobj.group(2).decode('utf-8')
1351
1352                 try:
1353                         # Process video information
1354                         self._downloader.process_info({
1355                                 'id':           video_id.decode('utf-8'),
1356                                 'url':          video_url.decode('utf-8'),
1357                                 'uploader':     video_uploader,
1358                                 'title':        video_title,
1359                                 'stitle':       simple_title,
1360                                 'ext':          video_extension.decode('utf-8'),
1361                                 'format':       u'NA',
1362                                 'player_url':   None,
1363                         })
1364                 except UnavailableFormatError:
1365                         self._downloader.trouble(u'ERROR: format not available for video')
1366
1367
1368 class YahooIE(InfoExtractor):
1369         """Information extractor for video.yahoo.com."""
1370
1371         # _VALID_URL matches all Yahoo! Video URLs
1372         # _VPAGE_URL matches only the extractable '/watch/' URLs
1373         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1374         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1375
1376         def __init__(self, downloader=None):
1377                 InfoExtractor.__init__(self, downloader)
1378
1379         @staticmethod
1380         def suitable(url):
1381                 return (re.match(YahooIE._VALID_URL, url) is not None)
1382
1383         def report_download_webpage(self, video_id):
1384                 """Report webpage download."""
1385                 self._downloader.to_stdout(u'[video.yahoo] %s: Downloading webpage' % video_id)
1386
1387         def report_extraction(self, video_id):
1388                 """Report information extraction."""
1389                 self._downloader.to_stdout(u'[video.yahoo] %s: Extracting information' % video_id)
1390
1391         def _real_initialize(self):
1392                 return
1393
1394         def _real_extract(self, url):
1395                 # Extract ID from URL
1396                 mobj = re.match(self._VALID_URL, url)
1397                 if mobj is None:
1398                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1399                         return
1400
1401                 video_id = mobj.group(2)
1402                 video_extension = 'flv'
1403
1404                 # Rewrite valid but non-extractable URLs as
1405                 # extractable English language /watch/ URLs
1406                 if re.match(self._VPAGE_URL, url) is None:
1407                         request = urllib2.Request(url)
1408                         try:
1409                                 webpage = urllib2.urlopen(request).read()
1410                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1411                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1412                                 return
1413
1414                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1415                         if mobj is None:
1416                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1417                                 return
1418                         yahoo_id = mobj.group(1)
1419
1420                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1421                         if mobj is None:
1422                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1423                                 return
1424                         yahoo_vid = mobj.group(1)
1425
1426                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1427                         return self._real_extract(url)
1428
1429                 # Retrieve video webpage to extract further information
1430                 request = urllib2.Request(url)
1431                 try:
1432                         self.report_download_webpage(video_id)
1433                         webpage = urllib2.urlopen(request).read()
1434                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1435                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1436                         return
1437
1438                 # Extract uploader and title from webpage
1439                 self.report_extraction(video_id)
1440                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1441                 if mobj is None:
1442                         self._downloader.trouble(u'ERROR: unable to extract video title')
1443                         return
1444                 video_title = mobj.group(1).decode('utf-8')
1445                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1446
1447                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1448                 if mobj is None:
1449                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1450                         return
1451                 video_uploader = mobj.group(1).decode('utf-8')
1452
1453                 # Extract video thumbnail
1454                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1455                 if mobj is None:
1456                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1457                         return
1458                 video_thumbnail = mobj.group(1).decode('utf-8')
1459
1460                 # Extract video description
1461                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1462                 if mobj is None:
1463                         self._downloader.trouble(u'ERROR: unable to extract video description')
1464                         return
1465                 video_description = mobj.group(1).decode('utf-8')
1466                 if not video_description: video_description = 'No description available.'
1467
1468                 # Extract video height and width
1469                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1470                 if mobj is None:
1471                         self._downloader.trouble(u'ERROR: unable to extract video height')
1472                         return
1473                 yv_video_height = mobj.group(1)
1474
1475                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1476                 if mobj is None:
1477                         self._downloader.trouble(u'ERROR: unable to extract video width')
1478                         return
1479                 yv_video_width = mobj.group(1)
1480
1481                 # Retrieve video playlist to extract media URL
1482                 # I'm not completely sure what all these options are, but we
1483                 # seem to need most of them, otherwise the server sends a 401.
1484                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1485                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1486                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1487                                           '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1488                                           '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1489                 try:
1490                         self.report_download_webpage(video_id)
1491                         webpage = urllib2.urlopen(request).read()
1492                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1493                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1494                         return
1495
1496                 # Extract media URL from playlist XML
1497                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1498                 if mobj is None:
1499                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1500                         return
1501                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1502                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1503
1504                 try:
1505                         # Process video information
1506                         self._downloader.process_info({
1507                                 'id':           video_id.decode('utf-8'),
1508                                 'url':          video_url,
1509                                 'uploader':     video_uploader,
1510                                 'title':        video_title,
1511                                 'stitle':       simple_title,
1512                                 'ext':          video_extension.decode('utf-8'),
1513                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1514                                 'description':  video_description,
1515                                 'thumbnail':    video_thumbnail,
1516                                 'description':  video_description,
1517                                 'player_url':   None,
1518                         })
1519                 except UnavailableFormatError:
1520                         self._downloader.trouble(u'ERROR: format not available for video')
1521
1522
1523 class GenericIE(InfoExtractor):
1524         """Generic last-resort information extractor."""
1525
1526         def __init__(self, downloader=None):
1527                 InfoExtractor.__init__(self, downloader)
1528
1529         @staticmethod
1530         def suitable(url):
1531                 return True
1532
1533         def report_download_webpage(self, video_id):
1534                 """Report webpage download."""
1535                 self._downloader.to_stdout(u'WARNING: Falling back on generic information extractor.')
1536                 self._downloader.to_stdout(u'[generic] %s: Downloading webpage' % video_id)
1537
1538         def report_extraction(self, video_id):
1539                 """Report information extraction."""
1540                 self._downloader.to_stdout(u'[generic] %s: Extracting information' % video_id)
1541
1542         def _real_initialize(self):
1543                 return
1544
1545         def _real_extract(self, url):
1546                 video_id = url.split('/')[-1]
1547                 request = urllib2.Request(url)
1548                 try:
1549                         self.report_download_webpage(video_id)
1550                         webpage = urllib2.urlopen(request).read()
1551                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1552                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1553                         return
1554                 except ValueError, err:
1555                         # since this is the last-resort InfoExtractor, if
1556                         # this error is thrown, it'll be thrown here
1557                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1558                         return
1559
1560                 # Start with something easy: JW Player in SWFObject
1561                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1562                 if mobj is None:
1563                         # Broaden the search a little bit
1564                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1565                 if mobj is None:
1566                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1567                         return
1568
1569                 # It's possible that one of the regexes
1570                 # matched, but returned an empty group:
1571                 if mobj.group(1) is None:
1572                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1573                         return
1574
1575                 video_url = urllib.unquote(mobj.group(1))
1576                 video_id  = os.path.basename(video_url)
1577
1578                 # here's a fun little line of code for you:
1579                 video_extension = os.path.splitext(video_id)[1][1:]
1580                 video_id        = os.path.splitext(video_id)[0]
1581
1582                 # it's tempting to parse this further, but you would
1583                 # have to take into account all the variations like
1584                 #   Video Title - Site Name
1585                 #   Site Name | Video Title
1586                 #   Video Title - Tagline | Site Name
1587                 # and so on and so forth; it's just not practical
1588                 mobj = re.search(r'<title>(.*)</title>', webpage)
1589                 if mobj is None:
1590                         self._downloader.trouble(u'ERROR: unable to extract title')
1591                         return
1592                 video_title = mobj.group(1).decode('utf-8')
1593                 video_title = sanitize_title(video_title)
1594                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1595
1596                 # video uploader is domain name
1597                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1598                 if mobj is None:
1599                         self._downloader.trouble(u'ERROR: unable to extract title')
1600                         return
1601                 video_uploader = mobj.group(1).decode('utf-8')
1602
1603                 try:
1604                         # Process video information
1605                         self._downloader.process_info({
1606                                 'id':           video_id.decode('utf-8'),
1607                                 'url':          video_url.decode('utf-8'),
1608                                 'uploader':     video_uploader,
1609                                 'title':        video_title,
1610                                 'stitle':       simple_title,
1611                                 'ext':          video_extension.decode('utf-8'),
1612                                 'format':       u'NA',
1613                                 'player_url':   None,
1614                         })
1615                 except UnavailableFormatError:
1616                         self._downloader.trouble(u'ERROR: format not available for video')
1617
1618
1619 class YoutubeSearchIE(InfoExtractor):
1620         """Information Extractor for YouTube search queries."""
1621         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1622         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1623         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1624         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1625         _youtube_ie = None
1626         _max_youtube_results = 1000
1627
1628         def __init__(self, youtube_ie, downloader=None):
1629                 InfoExtractor.__init__(self, downloader)
1630                 self._youtube_ie = youtube_ie
1631         
1632         @staticmethod
1633         def suitable(url):
1634                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1635
1636         def report_download_page(self, query, pagenum):
1637                 """Report attempt to download playlist page with given number."""
1638                 query = query.decode(preferredencoding())
1639                 self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1640
1641         def _real_initialize(self):
1642                 self._youtube_ie.initialize()
1643         
1644         def _real_extract(self, query):
1645                 mobj = re.match(self._VALID_QUERY, query)
1646                 if mobj is None:
1647                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1648                         return
1649
1650                 prefix, query = query.split(':')
1651                 prefix = prefix[8:]
1652                 query  = query.encode('utf-8')
1653                 if prefix == '':
1654                         self._download_n_results(query, 1)
1655                         return
1656                 elif prefix == 'all':
1657                         self._download_n_results(query, self._max_youtube_results)
1658                         return
1659                 else:
1660                         try:
1661                                 n = long(prefix)
1662                                 if n <= 0:
1663                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1664                                         return
1665                                 elif n > self._max_youtube_results:
1666                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
1667                                         n = self._max_youtube_results
1668                                 self._download_n_results(query, n)
1669                                 return
1670                         except ValueError: # parsing prefix as integer fails
1671                                 self._download_n_results(query, 1)
1672                                 return
1673
1674         def _download_n_results(self, query, n):
1675                 """Downloads a specified number of results for a query"""
1676
1677                 video_ids = []
1678                 already_seen = set()
1679                 pagenum = 1
1680
1681                 while True:
1682                         self.report_download_page(query, pagenum)
1683                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1684                         request = urllib2.Request(result_url, None, std_headers)
1685                         try:
1686                                 page = urllib2.urlopen(request).read()
1687                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1688                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1689                                 return
1690
1691                         # Extract video identifiers
1692                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1693                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1694                                 if video_id not in already_seen:
1695                                         video_ids.append(video_id)
1696                                         already_seen.add(video_id)
1697                                         if len(video_ids) == n:
1698                                                 # Specified n videos reached
1699                                                 for id in video_ids:
1700                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1701                                                 return
1702
1703                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1704                                 for id in video_ids:
1705                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1706                                 return
1707
1708                         pagenum = pagenum + 1
1709
1710 class GoogleSearchIE(InfoExtractor):
1711         """Information Extractor for Google Video search queries."""
1712         _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
1713         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1714         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
1715         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
1716         _google_ie = None
1717         _max_google_results = 1000
1718
1719         def __init__(self, google_ie, downloader=None):
1720                 InfoExtractor.__init__(self, downloader)
1721                 self._google_ie = google_ie
1722         
1723         @staticmethod
1724         def suitable(url):
1725                 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
1726
1727         def report_download_page(self, query, pagenum):
1728                 """Report attempt to download playlist page with given number."""
1729                 query = query.decode(preferredencoding())
1730                 self._downloader.to_stdout(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1731
1732         def _real_initialize(self):
1733                 self._google_ie.initialize()
1734         
1735         def _real_extract(self, query):
1736                 mobj = re.match(self._VALID_QUERY, query)
1737                 if mobj is None:
1738                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1739                         return
1740
1741                 prefix, query = query.split(':')
1742                 prefix = prefix[8:]
1743                 query  = query.encode('utf-8')
1744                 if prefix == '':
1745                         self._download_n_results(query, 1)
1746                         return
1747                 elif prefix == 'all':
1748                         self._download_n_results(query, self._max_google_results)
1749                         return
1750                 else:
1751                         try:
1752                                 n = long(prefix)
1753                                 if n <= 0:
1754                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1755                                         return
1756                                 elif n > self._max_google_results:
1757                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)'  % (self._max_google_results, n))
1758                                         n = self._max_google_results
1759                                 self._download_n_results(query, n)
1760                                 return
1761                         except ValueError: # parsing prefix as integer fails
1762                                 self._download_n_results(query, 1)
1763                                 return
1764
1765         def _download_n_results(self, query, n):
1766                 """Downloads a specified number of results for a query"""
1767
1768                 video_ids = []
1769                 already_seen = set()
1770                 pagenum = 1
1771
1772                 while True:
1773                         self.report_download_page(query, pagenum)
1774                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1775                         request = urllib2.Request(result_url, None, std_headers)
1776                         try:
1777                                 page = urllib2.urlopen(request).read()
1778                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1779                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1780                                 return
1781
1782                         # Extract video identifiers
1783                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1784                                 video_id = mobj.group(1)
1785                                 if video_id not in already_seen:
1786                                         video_ids.append(video_id)
1787                                         already_seen.add(video_id)
1788                                         if len(video_ids) == n:
1789                                                 # Specified n videos reached
1790                                                 for id in video_ids:
1791                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1792                                                 return
1793
1794                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1795                                 for id in video_ids:
1796                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1797                                 return
1798
1799                         pagenum = pagenum + 1
1800
1801 class YahooSearchIE(InfoExtractor):
1802         """Information Extractor for Yahoo! Video search queries."""
1803         _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
1804         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1805         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1806         _MORE_PAGES_INDICATOR = r'\s*Next'
1807         _yahoo_ie = None
1808         _max_yahoo_results = 1000
1809
1810         def __init__(self, yahoo_ie, downloader=None):
1811                 InfoExtractor.__init__(self, downloader)
1812                 self._yahoo_ie = yahoo_ie
1813         
1814         @staticmethod
1815         def suitable(url):
1816                 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
1817
1818         def report_download_page(self, query, pagenum):
1819                 """Report attempt to download playlist page with given number."""
1820                 query = query.decode(preferredencoding())
1821                 self._downloader.to_stdout(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1822
1823         def _real_initialize(self):
1824                 self._yahoo_ie.initialize()
1825         
1826         def _real_extract(self, query):
1827                 mobj = re.match(self._VALID_QUERY, query)
1828                 if mobj is None:
1829                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1830                         return
1831
1832                 prefix, query = query.split(':')
1833                 prefix = prefix[8:]
1834                 query  = query.encode('utf-8')
1835                 if prefix == '':
1836                         self._download_n_results(query, 1)
1837                         return
1838                 elif prefix == 'all':
1839                         self._download_n_results(query, self._max_yahoo_results)
1840                         return
1841                 else:
1842                         try:
1843                                 n = long(prefix)
1844                                 if n <= 0:
1845                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1846                                         return
1847                                 elif n > self._max_yahoo_results:
1848                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)'  % (self._max_yahoo_results, n))
1849                                         n = self._max_yahoo_results
1850                                 self._download_n_results(query, n)
1851                                 return
1852                         except ValueError: # parsing prefix as integer fails
1853                                 self._download_n_results(query, 1)
1854                                 return
1855
1856         def _download_n_results(self, query, n):
1857                 """Downloads a specified number of results for a query"""
1858
1859                 video_ids = []
1860                 already_seen = set()
1861                 pagenum = 1
1862
1863                 while True:
1864                         self.report_download_page(query, pagenum)
1865                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1866                         request = urllib2.Request(result_url, None, std_headers)
1867                         try:
1868                                 page = urllib2.urlopen(request).read()
1869                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1870                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1871                                 return
1872
1873                         # Extract video identifiers
1874                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1875                                 video_id = mobj.group(1)
1876                                 if video_id not in already_seen:
1877                                         video_ids.append(video_id)
1878                                         already_seen.add(video_id)
1879                                         if len(video_ids) == n:
1880                                                 # Specified n videos reached
1881                                                 for id in video_ids:
1882                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1883                                                 return
1884
1885                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1886                                 for id in video_ids:
1887                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1888                                 return
1889
1890                         pagenum = pagenum + 1
1891
1892 class YoutubePlaylistIE(InfoExtractor):
1893         """Information Extractor for YouTube playlists."""
1894
1895         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/)([^&]+).*'
1896         _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
1897         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
1898         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1899         _youtube_ie = None
1900
1901         def __init__(self, youtube_ie, downloader=None):
1902                 InfoExtractor.__init__(self, downloader)
1903                 self._youtube_ie = youtube_ie
1904         
1905         @staticmethod
1906         def suitable(url):
1907                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
1908
1909         def report_download_page(self, playlist_id, pagenum):
1910                 """Report attempt to download playlist page with given number."""
1911                 self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1912
1913         def _real_initialize(self):
1914                 self._youtube_ie.initialize()
1915         
1916         def _real_extract(self, url):
1917                 # Extract playlist id
1918                 mobj = re.match(self._VALID_URL, url)
1919                 if mobj is None:
1920                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1921                         return
1922
1923                 # Download playlist pages
1924                 playlist_id = mobj.group(1)
1925                 video_ids = []
1926                 pagenum = 1
1927
1928                 while True:
1929                         self.report_download_page(playlist_id, pagenum)
1930                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
1931                         try:
1932                                 page = urllib2.urlopen(request).read()
1933                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1934                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1935                                 return
1936
1937                         # Extract video identifiers
1938                         ids_in_page = []
1939                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1940                                 if mobj.group(1) not in ids_in_page:
1941                                         ids_in_page.append(mobj.group(1))
1942                         video_ids.extend(ids_in_page)
1943
1944                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1945                                 break
1946                         pagenum = pagenum + 1
1947
1948                 for id in video_ids:
1949                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1950                 return
1951
1952 class YoutubeUserIE(InfoExtractor):
1953         """Information Extractor for YouTube users."""
1954
1955         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
1956         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1957         _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
1958         _youtube_ie = None
1959
1960         def __init__(self, youtube_ie, downloader=None):
1961                 InfoExtractor.__init__(self, downloader)
1962                 self._youtube_ie = youtube_ie
1963         
1964         @staticmethod
1965         def suitable(url):
1966                 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
1967
1968         def report_download_page(self, username):
1969                 """Report attempt to download user page."""
1970                 self._downloader.to_stdout(u'[youtube] user %s: Downloading page ' % (username))
1971
1972         def _real_initialize(self):
1973                 self._youtube_ie.initialize()
1974         
1975         def _real_extract(self, url):
1976                 # Extract username
1977                 mobj = re.match(self._VALID_URL, url)
1978                 if mobj is None:
1979                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1980                         return
1981
1982                 # Download user page
1983                 username = mobj.group(1)
1984                 video_ids = []
1985                 pagenum = 1
1986
1987                 self.report_download_page(username)
1988                 request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
1989                 try:
1990                         page = urllib2.urlopen(request).read()
1991                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1992                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1993                         return
1994
1995                 # Extract video identifiers
1996                 ids_in_page = []
1997
1998                 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1999                         if mobj.group(1) not in ids_in_page:
2000                                 ids_in_page.append(mobj.group(1))
2001                 video_ids.extend(ids_in_page)
2002
2003                 for id in video_ids:
2004                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2005                 return
2006
class PostProcessor(object):
	"""Base class for post-download processing steps.

	Instances are registered on a downloader through its
	add_post_processor() method. After each successful download the
	downloader walks its chain of PostProcessors, feeding the first one
	an initial information dictionary and each subsequent one the value
	returned by its predecessor. The walk stops as soon as a processor
	returns None or the chain is exhausted.

	Like InfoExtractor objects, PostProcessors take part in a "mutual
	registration" handshake with their downloader.
	"""

	# Downloader this PP is attached to (set at construction or later
	# via set_downloader()).
	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Sets the downloader for this PP."""
		self._downloader = downloader

	def run(self, information):
		"""Run the PostProcessor.

		"information" is a dictionary shaped like the ones produced by
		InfoExtractors, plus an extra "filepath" key pointing at the
		downloaded file.

		Returning None halts the postprocessing chain; returning a
		dictionary (possibly the received one with some fields changed)
		passes it on to the next processor. Implementations may also
		raise PostProcessingError, which the calling downloader handles.
		"""
		# Default implementation: pass the information through untouched.
		return information
2052         
### MAIN PROGRAM ###
if __name__ == '__main__':
	try:
		# Modules needed only when running the main program
		import getpass
		import optparse

		# Function to update the program file with the latest version from bitbucket.org
		def update_self(downloader, filename):
			# Note: downloader only used for options
			if not os.access (filename, os.W_OK):
				sys.exit('ERROR: no write permissions on %s' % filename)

			downloader.to_stdout('Updating to latest stable version...')
			latest_url = 'http://bitbucket.org/rg3/youtube-dl/raw/tip/LATEST_VERSION'
			latest_version = urllib.urlopen(latest_url).read().strip()
			prog_url = 'http://bitbucket.org/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
			newcontent = urllib.urlopen(prog_url).read()
			# Overwrites the running script in place with the downloaded release.
			stream = open(filename, 'w')
			stream.write(newcontent)
			stream.close()
			downloader.to_stdout('Updated to version %s' % latest_version)

		# General configuration
		# The second install_opener replaces the first, so the effective
		# opener is the cookie-processing one — TODO confirm this is intended.
		urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
		urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
		socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

		# Parse command line
		parser = optparse.OptionParser(
			usage='Usage: %prog [options] url...',
			version='2010.06.06',
			conflict_handler='resolve',
		)

		parser.add_option('-h', '--help',
				action='help', help='print this help text and exit')
		parser.add_option('-v', '--version',
				action='version', help='print program version and exit')
		parser.add_option('-U', '--update',
				action='store_true', dest='update_self', help='update this program to latest stable version')
		parser.add_option('-i', '--ignore-errors',
				action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
		parser.add_option('-r', '--rate-limit',
				dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
		parser.add_option('-R', '--retries',
				dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)

		authentication = optparse.OptionGroup(parser, 'Authentication Options')
		authentication.add_option('-u', '--username',
				dest='username', metavar='USERNAME', help='account username')
		authentication.add_option('-p', '--password',
				dest='password', metavar='PASSWORD', help='account password')
		authentication.add_option('-n', '--netrc',
				action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
		parser.add_option_group(authentication)

		# -b/-m/-d/--all-formats are aliases that store a format constant.
		video_format = optparse.OptionGroup(parser, 'Video Format Options')
		video_format.add_option('-f', '--format',
				action='store', dest='format', metavar='FORMAT', help='video format code')
		video_format.add_option('-b', '--best-quality',
				action='store_const', dest='format', help='download the best quality video possible', const='0')
		video_format.add_option('-m', '--mobile-version',
				action='store_const', dest='format', help='alias for -f 17', const='17')
		video_format.add_option('-d', '--high-def',
				action='store_const', dest='format', help='alias for -f 22', const='22')
		video_format.add_option('--all-formats',
				action='store_const', dest='format', help='download all available video formats', const='-1')
		parser.add_option_group(video_format)

		verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
		verbosity.add_option('-q', '--quiet',
				action='store_true', dest='quiet', help='activates quiet mode', default=False)
		verbosity.add_option('-s', '--simulate',
				action='store_true', dest='simulate', help='do not download video', default=False)
		verbosity.add_option('-g', '--get-url',
				action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
		verbosity.add_option('-e', '--get-title',
				action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
		verbosity.add_option('--get-thumbnail',
				action='store_true', dest='getthumbnail', help='simulate, quiet but print thumbnail URL', default=False)
		verbosity.add_option('--get-description',
				action='store_true', dest='getdescription', help='simulate, quiet but print video description', default=False)
		verbosity.add_option('--no-progress',
				action='store_true', dest='noprogress', help='do not print progress bar', default=False)
		parser.add_option_group(verbosity)

		filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
		filesystem.add_option('-t', '--title',
				action='store_true', dest='usetitle', help='use title in file name', default=False)
		filesystem.add_option('-l', '--literal',
				action='store_true', dest='useliteral', help='use literal title in file name', default=False)
		filesystem.add_option('-o', '--output',
				dest='outtmpl', metavar='TEMPLATE', help='output filename template')
		filesystem.add_option('-a', '--batch-file',
				dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
		filesystem.add_option('-w', '--no-overwrites',
				action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
		filesystem.add_option('-c', '--continue',
				action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
		parser.add_option_group(filesystem)

		(opts, args) = parser.parse_args()

		# Batch file verification
		batchurls = []
		if opts.batchfile is not None:
			try:
				if opts.batchfile == '-':
					batchfd = sys.stdin
				else:
					batchfd = open(opts.batchfile, 'r')
				batchurls = batchfd.readlines()
				batchurls = [x.strip() for x in batchurls]
				# Drop blank lines from the batch file.
				batchurls = [x for x in batchurls if len(x) > 0]
			except IOError:
				sys.exit(u'ERROR: batch file could not be read')
		all_urls = batchurls + args

		# Conflicting, missing and erroneous options
		if opts.usenetrc and (opts.username is not None or opts.password is not None):
			parser.error(u'using .netrc conflicts with giving username/password')
		if opts.password is not None and opts.username is None:
			parser.error(u'account username missing')
		if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
			parser.error(u'using output template conflicts with using title or literal title')
		if opts.usetitle and opts.useliteral:
			parser.error(u'using title conflicts with using literal title')
		# Username given without password: prompt interactively.
		if opts.username is not None and opts.password is None:
			opts.password = getpass.getpass(u'Type account password and press return:')
		if opts.ratelimit is not None:
			numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
			if numeric_limit is None:
				parser.error(u'invalid rate limit specified')
			opts.ratelimit = numeric_limit
		# optparse stores -R as a string; coerce it to a number here.
		if opts.retries is not None:
			try:
				opts.retries = long(opts.retries)
			except (TypeError, ValueError), err:
				parser.error(u'invalid retry count specified')

		# Information extractors
		youtube_ie = YoutubeIE()
		metacafe_ie = MetacafeIE(youtube_ie)
		dailymotion_ie = DailymotionIE()
		youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
		youtube_user_ie = YoutubeUserIE(youtube_ie)
		youtube_search_ie = YoutubeSearchIE(youtube_ie)
		google_ie = GoogleIE()
		google_search_ie = GoogleSearchIE(google_ie)
		photobucket_ie = PhotobucketIE()
		yahoo_ie = YahooIE()
		yahoo_search_ie = YahooSearchIE(yahoo_ie)
		generic_ie = GenericIE()

		# File downloader
		# Note: any of the --get-* flags forces both quiet and simulate mode.
		fd = FileDownloader({
			'usenetrc': opts.usenetrc,
			'username': opts.username,
			'password': opts.password,
			'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
			'forceurl': opts.geturl,
			'forcetitle': opts.gettitle,
			'forcethumbnail': opts.getthumbnail,
			'forcedescription': opts.getdescription,
			'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
			'format': opts.format,
			# Output template: explicit -o wins, then format/title-specific
			# defaults, finally plain "<id>.<ext>".
			'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
				or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
				or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
				or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
				or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
				or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
				or u'%(id)s.%(ext)s'),
			'ignoreerrors': opts.ignoreerrors,
			'ratelimit': opts.ratelimit,
			'nooverwrites': opts.nooverwrites,
			'retries': opts.retries,
			'continuedl': opts.continue_dl,
			'noprogress': opts.noprogress,
			})
		fd.add_info_extractor(youtube_search_ie)
		fd.add_info_extractor(youtube_pl_ie)
		fd.add_info_extractor(youtube_user_ie)
		fd.add_info_extractor(metacafe_ie)
		fd.add_info_extractor(dailymotion_ie)
		fd.add_info_extractor(youtube_ie)
		fd.add_info_extractor(google_ie)
		fd.add_info_extractor(google_search_ie)
		fd.add_info_extractor(photobucket_ie)
		fd.add_info_extractor(yahoo_ie)
		fd.add_info_extractor(yahoo_search_ie)

		# This must come last since it's the
		# fallback if none of the others work
		fd.add_info_extractor(generic_ie)

		# Update version
		if opts.update_self:
			update_self(fd, sys.argv[0])

		# Maybe do nothing
		if len(all_urls) < 1:
			if not opts.update_self:
				parser.error(u'you must provide at least one URL')
			else:
				sys.exit()
		retcode = fd.download(all_urls)
		sys.exit(retcode)

	except DownloadError:
		sys.exit(1)
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')