Bump version number
[youtube-dl] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # License: Public domain code
7 import htmlentitydefs
8 import httplib
9 import locale
10 import math
11 import netrc
12 import os
13 import os.path
14 import re
15 import socket
16 import string
17 import subprocess
18 import sys
19 import time
20 import urllib
21 import urllib2
22
23 # parse_qs was moved from the cgi module to the urlparse module recently.
24 try:
25         from urlparse import parse_qs
26 except ImportError:
27         from cgi import parse_qs
28
# Default HTTP headers sent with every request; a desktop Firefox User-Agent
# avoids being served mobile or otherwise degraded pages.
std_headers = {
	'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2) Gecko/20100115 Firefox/3.6',
	'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
	'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
	'Accept-Language': 'en-us,en;q=0.5',
}

# Characters considered safe in simplified titles: ASCII letters and digits,
# decoded to a unicode string so they can be matched against unicode titles.
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
37
def preferredencoding():
	"""Get preferred encoding.

	Returns the best encoding scheme for the system, based on
	locale.getpreferredencoding() and some further tweaks.
	Falls back to 'UTF-8' when the reported encoding is unusable.
	"""
	try:
		pref = locale.getpreferredencoding()
		# Probe the codec: some platforms report an encoding name that the
		# codecs machinery cannot actually use.
		u'TEST'.encode(pref)
	except Exception:
		pref = 'UTF-8'
	return pref
53
54 def htmlentity_transform(matchobj):
55         """Transforms an HTML entity to a Unicode character.
56         
57         This function receives a match object and is intended to be used with
58         the re.sub() function.
59         """
60         entity = matchobj.group(1)
61
62         # Known non-numeric HTML entity
63         if entity in htmlentitydefs.name2codepoint:
64                 return unichr(htmlentitydefs.name2codepoint[entity])
65
66         # Unicode character
67         mobj = re.match(ur'(?u)#(x?\d+)', entity)
68         if mobj is not None:
69                 numstr = mobj.group(1)
70                 if numstr.startswith(u'x'):
71                         base = 16
72                         numstr = u'0%s' % numstr
73                 else:
74                         base = 10
75                 return unichr(long(numstr, base))
76
77         # Unknown entity in name, return its literal representation
78         return (u'&%s;' % entity)
79
80 def sanitize_title(utitle):
81         """Sanitizes a video title so it could be used as part of a filename."""
82         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
83         return utitle.replace(unicode(os.sep), u'%')
84
85 def sanitize_open(filename, open_mode):
86         """Try to open the given filename, and slightly tweak it if this fails.
87
88         Attempts to open the given filename. If this fails, it tries to change
89         the filename slightly, step by step, until it's either able to open it
90         or it fails and raises a final exception, like the standard open()
91         function.
92
93         It returns the tuple (stream, definitive_file_name).
94         """
95         try:
96                 if filename == u'-':
97                         return (sys.stdout, filename)
98                 stream = open(filename, open_mode)
99                 return (stream, filename)
100         except (IOError, OSError), err:
101                 # In case of error, try to remove win32 forbidden chars
102                 filename = re.sub(ur'[<>:"\|\?\*]', u'#', filename)
103
104                 # An exception here should be caught in the caller
105                 stream = open(filename, open_mode)
106                 return (stream, filename)
107
108
class DownloadError(Exception):
	"""Download Error exception.

	This exception may be thrown by FileDownloader objects if they are not
	configured to continue on errors (the 'ignoreerrors' option). They will
	contain the appropriate error message.
	"""
	pass
117
class SameFileError(Exception):
	"""Same File exception.

	This exception will be thrown by FileDownloader objects if they detect
	multiple files would have to be downloaded to the same file on disk
	(i.e. several URLs with a fixed output template).
	"""
	pass
125
class PostProcessingError(Exception):
	"""Post Processing exception.

	This exception may be raised by PostProcessor's .run() method to
	indicate an error in the postprocessing task.
	"""
	pass
133
class UnavailableFormatError(Exception):
	"""Unavailable Format exception.

	This exception will be thrown when a video is requested
	in a format that is not available for that video.
	"""
	pass
141
class ContentTooShortError(Exception):
	"""Content Too Short exception.

	This exception may be raised by FileDownloader objects when a file they
	download is too small for what the server announced first, indicating
	the connection was probably interrupted.
	"""
	# Both in bytes
	downloaded = None
	expected = None

	def __init__(self, downloaded, expected):
		# Give the base Exception a message so str(err) is informative
		# instead of empty.
		Exception.__init__(self, 'Downloaded %s bytes, expected %s bytes' % (downloaded, expected))
		self.downloaded = downloaded
		self.expected = expected
156
157 class FileDownloader(object):
158         """File Downloader class.
159
160         File downloader objects are the ones responsible of downloading the
161         actual video file and writing it to disk if the user has requested
162         it, among some other tasks. In most cases there should be one per
163         program. As, given a video URL, the downloader doesn't know how to
164         extract all the needed information, task that InfoExtractors do, it
165         has to pass the URL to one of them.
166
167         For this, file downloader objects have a method that allows
168         InfoExtractors to be registered in a given order. When it is passed
169         a URL, the file downloader handles it to the first InfoExtractor it
170         finds that reports being able to handle it. The InfoExtractor extracts
171         all the information about the video or videos the URL refers to, and
172         asks the FileDownloader to process the video information, possibly
173         downloading the video.
174
175         File downloaders accept a lot of parameters. In order not to saturate
176         the object constructor with arguments, it receives a dictionary of
177         options instead. These options are available through the params
178         attribute for the InfoExtractors to use. The FileDownloader also
179         registers itself as the downloader in charge for the InfoExtractors
180         that are added to it, so this is a "mutual registration".
181
182         Available options:
183
184         username:       Username for authentication purposes.
185         password:       Password for authentication purposes.
186         usenetrc:       Use netrc for authentication instead.
187         quiet:          Do not print messages to stdout.
188         forceurl:       Force printing final URL.
189         forcetitle:     Force printing title.
190         simulate:       Do not download the video files.
191         format:         Video format code.
192         outtmpl:        Template for output names.
193         ignoreerrors:   Do not stop on download errors.
194         ratelimit:      Download speed limit, in bytes/sec.
195         nooverwrites:   Prevent overwriting files.
196         retries:        Number of times to retry for HTTP error 503
197         continuedl:     Try to continue downloads if possible.
198         noprogress:     Do not print the progress bar.
199         """
200
201         params = None
202         _ies = []
203         _pps = []
204         _download_retcode = None
205         _num_downloads = None
206
207         def __init__(self, params):
208                 """Create a FileDownloader object with the given options."""
209                 self._ies = []
210                 self._pps = []
211                 self._download_retcode = 0
212                 self._num_downloads = 0
213                 self.params = params
214         
215         @staticmethod
216         def pmkdir(filename):
217                 """Create directory components in filename. Similar to Unix "mkdir -p"."""
218                 components = filename.split(os.sep)
219                 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
220                 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
221                 for dir in aggregate:
222                         if not os.path.exists(dir):
223                                 os.mkdir(dir)
224         
225         @staticmethod
226         def format_bytes(bytes):
227                 if bytes is None:
228                         return 'N/A'
229                 if type(bytes) is str:
230                         bytes = float(bytes)
231                 if bytes == 0.0:
232                         exponent = 0
233                 else:
234                         exponent = long(math.log(bytes, 1024.0))
235                 suffix = 'bkMGTPEZY'[exponent]
236                 converted = float(bytes) / float(1024**exponent)
237                 return '%.2f%s' % (converted, suffix)
238
239         @staticmethod
240         def calc_percent(byte_counter, data_len):
241                 if data_len is None:
242                         return '---.-%'
243                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
244
245         @staticmethod
246         def calc_eta(start, now, total, current):
247                 if total is None:
248                         return '--:--'
249                 dif = now - start
250                 if current == 0 or dif < 0.001: # One millisecond
251                         return '--:--'
252                 rate = float(current) / dif
253                 eta = long((float(total) - float(current)) / rate)
254                 (eta_mins, eta_secs) = divmod(eta, 60)
255                 if eta_mins > 99:
256                         return '--:--'
257                 return '%02d:%02d' % (eta_mins, eta_secs)
258
259         @staticmethod
260         def calc_speed(start, now, bytes):
261                 dif = now - start
262                 if bytes == 0 or dif < 0.001: # One millisecond
263                         return '%10s' % '---b/s'
264                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
265
266         @staticmethod
267         def best_block_size(elapsed_time, bytes):
268                 new_min = max(bytes / 2.0, 1.0)
269                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
270                 if elapsed_time < 0.001:
271                         return long(new_max)
272                 rate = bytes / elapsed_time
273                 if rate > new_max:
274                         return long(new_max)
275                 if rate < new_min:
276                         return long(new_min)
277                 return long(rate)
278
279         @staticmethod
280         def parse_bytes(bytestr):
281                 """Parse a string indicating a byte quantity into a long integer."""
282                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
283                 if matchobj is None:
284                         return None
285                 number = float(matchobj.group(1))
286                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
287                 return long(round(number * multiplier))
288
289         @staticmethod
290         def verify_url(url):
291                 """Verify a URL is valid and data could be downloaded. Return real data URL."""
292                 request = urllib2.Request(url, None, std_headers)
293                 data = urllib2.urlopen(request)
294                 data.read(1)
295                 url = data.geturl()
296                 data.close()
297                 return url
298
299         def add_info_extractor(self, ie):
300                 """Add an InfoExtractor object to the end of the list."""
301                 self._ies.append(ie)
302                 ie.set_downloader(self)
303         
304         def add_post_processor(self, pp):
305                 """Add a PostProcessor object to the end of the chain."""
306                 self._pps.append(pp)
307                 pp.set_downloader(self)
308         
309         def to_stdout(self, message, skip_eol=False, ignore_encoding_errors=False):
310                 """Print message to stdout if not in quiet mode."""
311                 try:
312                         if not self.params.get('quiet', False):
313                                 print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(preferredencoding()),
314                         sys.stdout.flush()
315                 except (UnicodeEncodeError), err:
316                         if not ignore_encoding_errors:
317                                 raise
318         
319         def to_stderr(self, message):
320                 """Print message to stderr."""
321                 print >>sys.stderr, message.encode(preferredencoding())
322         
323         def fixed_template(self):
324                 """Checks if the output template is fixed."""
325                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
326
327         def trouble(self, message=None):
328                 """Determine action to take when a download problem appears.
329
330                 Depending on if the downloader has been configured to ignore
331                 download errors or not, this method may throw an exception or
332                 not when errors are found, after printing the message.
333                 """
334                 if message is not None:
335                         self.to_stderr(message)
336                 if not self.params.get('ignoreerrors', False):
337                         raise DownloadError(message)
338                 self._download_retcode = 1
339
340         def slow_down(self, start_time, byte_counter):
341                 """Sleep if the download speed is over the rate limit."""
342                 rate_limit = self.params.get('ratelimit', None)
343                 if rate_limit is None or byte_counter == 0:
344                         return
345                 now = time.time()
346                 elapsed = now - start_time
347                 if elapsed <= 0.0:
348                         return
349                 speed = float(byte_counter) / elapsed
350                 if speed > rate_limit:
351                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
352
353         def report_destination(self, filename):
354                 """Report destination filename."""
355                 self.to_stdout(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
356         
357         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
358                 """Report download progress."""
359                 if self.params.get('noprogress', False):
360                         return
361                 self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
362                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
363
364         def report_resuming_byte(self, resume_len):
365                 """Report attemtp to resume at given byte."""
366                 self.to_stdout(u'[download] Resuming download at byte %s' % resume_len)
367         
368         def report_retry(self, count, retries):
369                 """Report retry in case of HTTP error 503"""
370                 self.to_stdout(u'[download] Got HTTP error 503. Retrying (attempt %d of %d)...' % (count, retries))
371         
372         def report_file_already_downloaded(self, file_name):
373                 """Report file has already been fully downloaded."""
374                 try:
375                         self.to_stdout(u'[download] %s has already been downloaded' % file_name)
376                 except (UnicodeEncodeError), err:
377                         self.to_stdout(u'[download] The file has already been downloaded')
378         
379         def report_unable_to_resume(self):
380                 """Report it was impossible to resume download."""
381                 self.to_stdout(u'[download] Unable to resume')
382         
383         def report_finish(self):
384                 """Report download finished."""
385                 if self.params.get('noprogress', False):
386                         self.to_stdout(u'[download] Download completed')
387                 else:
388                         self.to_stdout(u'')
389
390         def process_info(self, info_dict):
391                 """Process a single dictionary returned by an InfoExtractor."""
392                 # Do nothing else if in simulate mode
393                 if self.params.get('simulate', False):
394                         # Verify URL if it's an HTTP one
395                         if info_dict['url'].startswith('http'):
396                                 try:
397                                         self.verify_url(info_dict['url'].encode('utf-8')).decode('utf-8')
398                                 except (OSError, IOError, urllib2.URLError, httplib.HTTPException, socket.error), err:
399                                         raise UnavailableFormatError
400
401                         # Forced printings
402                         if self.params.get('forcetitle', False):
403                                 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
404                         if self.params.get('forceurl', False):
405                                 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
406                         if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
407                                 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
408                         if self.params.get('forcedescription', False) and 'description' in info_dict:
409                                 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
410
411                         return
412                         
413                 try:
414                         template_dict = dict(info_dict)
415                         template_dict['epoch'] = unicode(long(time.time()))
416                         template_dict['ord'] = unicode('%05d' % self._num_downloads)
417                         filename = self.params['outtmpl'] % template_dict
418                 except (ValueError, KeyError), err:
419                         self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
420                 if self.params.get('nooverwrites', False) and os.path.exists(filename):
421                         self.to_stderr(u'WARNING: file exists: %s; skipping' % filename)
422                         return
423
424                 try:
425                         self.pmkdir(filename)
426                 except (OSError, IOError), err:
427                         self.trouble('ERROR: unable to create directories: %s' % str(err))
428                         return
429
430                 try:
431                         success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
432                 except (OSError, IOError), err:
433                         raise UnavailableFormatError
434                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
435                         self.trouble('ERROR: unable to download video data: %s' % str(err))
436                         return
437                 except (ContentTooShortError, ), err:
438                         self.trouble('ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
439                         return
440
441                 if success:
442                         try:
443                                 self.post_process(filename, info_dict)
444                         except (PostProcessingError), err:
445                                 self.trouble('ERROR: postprocessing: %s' % str(err))
446                                 return
447
448         def download(self, url_list):
449                 """Download a given list of URLs."""
450                 if len(url_list) > 1 and self.fixed_template():
451                         raise SameFileError(self.params['outtmpl'])
452
453                 for url in url_list:
454                         suitable_found = False
455                         for ie in self._ies:
456                                 # Go to next InfoExtractor if not suitable
457                                 if not ie.suitable(url):
458                                         continue
459
460                                 # Suitable InfoExtractor found
461                                 suitable_found = True
462
463                                 # Extract information from URL and process it
464                                 ie.extract(url)
465
466                                 # Suitable InfoExtractor had been found; go to next URL
467                                 break
468
469                         if not suitable_found:
470                                 self.trouble('ERROR: no suitable InfoExtractor: %s' % url)
471
472                 return self._download_retcode
473
474         def post_process(self, filename, ie_info):
475                 """Run the postprocessing chain on the given file."""
476                 info = dict(ie_info)
477                 info['filepath'] = filename
478                 for pp in self._pps:
479                         info = pp.run(info)
480                         if info is None:
481                                 break
482         
483         def _download_with_rtmpdump(self, filename, url, player_url):
484                 self.report_destination(filename)
485
486                 # Check for rtmpdump first
487                 try:
488                         subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
489                 except (OSError, IOError):
490                         self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
491                         return False
492
493                 # Download using rtmpdump. rtmpdump returns exit code 2 when
494                 # the connection was interrumpted and resuming appears to be
495                 # possible. This is part of rtmpdump's normal usage, AFAIK.
496                 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', filename]
497                 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
498                 while retval == 2 or retval == 1:
499                         prevsize = os.path.getsize(filename)
500                         self.to_stdout(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
501                         time.sleep(5.0) # This seems to be needed
502                         retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
503                         cursize = os.path.getsize(filename)
504                         if prevsize == cursize and retval == 1:
505                                 break
506                 if retval == 0:
507                         self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename))
508                         return True
509                 else:
510                         self.trouble('\nERROR: rtmpdump exited with code %d' % retval)
511                         return False
512
513         def _do_download(self, filename, url, player_url):
514                 # Attempt to download using rtmpdump
515                 if url.startswith('rtmp'):
516                         return self._download_with_rtmpdump(filename, url, player_url)
517
518                 stream = None
519                 open_mode = 'wb'
520                 basic_request = urllib2.Request(url, None, std_headers)
521                 request = urllib2.Request(url, None, std_headers)
522
523                 # Establish possible resume length
524                 if os.path.isfile(filename):
525                         resume_len = os.path.getsize(filename)
526                 else:
527                         resume_len = 0
528
529                 # Request parameters in case of being able to resume
530                 if self.params.get('continuedl', False) and resume_len != 0:
531                         self.report_resuming_byte(resume_len)
532                         request.add_header('Range','bytes=%d-' % resume_len)
533                         open_mode = 'ab'
534
535                 count = 0
536                 retries = self.params.get('retries', 0)
537                 while True:
538                         # Establish connection
539                         try:
540                                 data = urllib2.urlopen(request)
541                                 break
542                         except (urllib2.HTTPError, ), err:
543                                 if err.code == 503:
544                                         # Retry in case of HTTP error 503
545                                         count += 1
546                                         if count <= retries:
547                                                 self.report_retry(count, retries)
548                                                 continue
549                                 if err.code != 416: #  416 is 'Requested range not satisfiable'
550                                         raise
551                                 # Unable to resume
552                                 data = urllib2.urlopen(basic_request)
553                                 content_length = data.info()['Content-Length']
554
555                                 if content_length is not None and long(content_length) == resume_len:
556                                         # Because the file had already been fully downloaded
557                                         self.report_file_already_downloaded(filename)
558                                         return True
559                                 else:
560                                         # Because the server didn't let us
561                                         self.report_unable_to_resume()
562                                         open_mode = 'wb'
563
564                 data_len = data.info().get('Content-length', None)
565                 data_len_str = self.format_bytes(data_len)
566                 byte_counter = 0
567                 block_size = 1024
568                 start = time.time()
569                 while True:
570                         # Download and write
571                         before = time.time()
572                         data_block = data.read(block_size)
573                         after = time.time()
574                         data_block_len = len(data_block)
575                         if data_block_len == 0:
576                                 break
577                         byte_counter += data_block_len
578
579                         # Open file just in time
580                         if stream is None:
581                                 try:
582                                         (stream, filename) = sanitize_open(filename, open_mode)
583                                         self.report_destination(filename)
584                                         self._num_downloads += 1
585                                 except (OSError, IOError), err:
586                                         self.trouble('ERROR: unable to open for writing: %s' % str(err))
587                                         return False
588                         try:
589                                 stream.write(data_block)
590                         except (IOError, OSError), err:
591                                 self.trouble('\nERROR: unable to write data: %s' % str(err))
592                         block_size = self.best_block_size(after - before, data_block_len)
593
594                         # Progress message
595                         percent_str = self.calc_percent(byte_counter, data_len)
596                         eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
597                         speed_str = self.calc_speed(start, time.time(), byte_counter)
598                         self.report_progress(percent_str, data_len_str, speed_str, eta_str)
599
600                         # Apply rate limit
601                         self.slow_down(start, byte_counter)
602
603                 self.report_finish()
604                 if data_len is not None and str(byte_counter) != data_len:
605                         raise ContentTooShortError(byte_counter, long(data_len))
606                 return True
607
class InfoExtractor(object):
	"""Information Extractor class.

	Information extractors are the classes that, given a URL, extract
	information from the video (or videos) the URL refers to. This
	information includes the real video URL, the video title and simplified
	title, author and others. The information is stored in a dictionary
	which is then passed to the FileDownloader. The FileDownloader
	processes this information possibly downloading the video to the file
	system, among other possible outcomes. The dictionaries must include
	the following fields:

	id:             Video identifier.
	url:            Final video URL.
	uploader:       Nickname of the video uploader.
	title:          Literal title.
	stitle:         Simplified title.
	ext:            Video filename extension.
	format:         Video format.
	player_url:     SWF Player URL (may be None).

	The following fields are optional. Their primary purpose is to allow
	youtube-dl to serve as the backend for a video search function, such
	as the one in youtube2mp3.  They are only used when their respective
	forced printing functions are called:

	thumbnail:      Full URL to a video thumbnail image.
	description:    One-line video description.

	Subclasses of this one should re-define the _real_initialize() and
	_real_extract() methods, as well as the suitable() static method.
	Probably, they should also be instantiated and added to the main
	downloader.
	"""

	# Lazy-initialization flag and the FileDownloader in charge.
	_ready = False
	_downloader = None

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		self._ready = False
		self.set_downloader(downloader)

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass

	@staticmethod
	def suitable(url):
		"""Receives a URL and returns True if suitable for this IE."""
		return False

	def set_downloader(self, downloader):
		"""Sets the downloader for this IE."""
		self._downloader = downloader

	def initialize(self):
		"""Initializes an instance (authentication, etc). Runs only once."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Extracts URL information and returns it in list of dicts."""
		self.initialize()
		return self._real_extract(url)
678
class YoutubeIE(InfoExtractor):
	"""Information extractor for youtube.com."""

	# Group 1 matches the URL "prefix" (optional scheme, host, and the
	# /v/ or watch?...v= part); group 2 captures the video ID itself.
	# The trailing (?(1).+)? conditional only allows extra characters
	# after the ID when a full URL (group 1) was actually present.
	_VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?[\?#](?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
	_LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
	_LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
	_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
	_NETRC_MACHINE = 'youtube'
	_available_formats = ['37', '22', '35', '18', '34', '5', '17', '13', None] # listed in order of priority for -b flag
	# Map of YouTube "fmt" codes to file extensions; any code not listed
	# here falls back to 'flv' in _real_extract().
	_video_extensions = {
		'13': '3gp',
		'17': 'mp4',
		'18': 'mp4',
		'22': 'mp4',
		'37': 'mp4',
		'43': 'webm',
		'45': 'webm',
	}

	@staticmethod
	def suitable(url):
		"""Return True if *url* looks like a YouTube video URL."""
		return (re.match(YoutubeIE._VALID_URL, url) is not None)

	def report_lang(self):
		"""Report attempt to set language."""
		self._downloader.to_stdout(u'[youtube] Setting language')

	def report_login(self):
		"""Report attempt to log in."""
		self._downloader.to_stdout(u'[youtube] Logging in')
	
	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_stdout(u'[youtube] Confirming age')
	
	def report_video_webpage_download(self, video_id):
		"""Report attempt to download video webpage."""
		self._downloader.to_stdout(u'[youtube] %s: Downloading video webpage' % video_id)
	
	def report_video_info_webpage_download(self, video_id):
		"""Report attempt to download video info webpage."""
		self._downloader.to_stdout(u'[youtube] %s: Downloading video info webpage' % video_id)
	
	def report_information_extraction(self, video_id):
		"""Report attempt to extract video information."""
		self._downloader.to_stdout(u'[youtube] %s: Extracting video information' % video_id)
	
	def report_unavailable_format(self, video_id, format):
		"""Report extracted video URL."""
		self._downloader.to_stdout(u'[youtube] %s: Format %s not available' % (video_id, format))
	
	def report_rtmp_download(self):
		"""Indicate the download will use the RTMP protocol."""
		self._downloader.to_stdout(u'[youtube] RTMP download detected')
	
	def _real_initialize(self):
		"""Set the site language and, when credentials are available,
		log in and confirm age. Authentication failures are reported as
		warnings and abort initialization without raising."""
		if self._downloader is None:
			return

		username = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					username = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		# Set language
		request = urllib2.Request(self._LANG_URL, None, std_headers)
		try:
			self.report_lang()
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
			return

		# No authentication to be performed
		if username is None:
			return

		# Log in
		login_form = {
				'current_form': 'loginForm',
				'next':		'/',
				'action_login':	'Log In',
				'username':	username,
				'password':	password,
				}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# If the login form is still present in the response, the
			# credentials were rejected.
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return
	
		# Confirm age
		age_form = {
				'next_url':		'/',
				'action_confirm':	'Confirm',
				}
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
		try:
			self.report_age_confirmation()
			age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return

	def _real_extract(self, url):
		"""Extract video information for *url* and feed it to the
		downloader. format '0' means best quality with fallback;
		'-1' means download every available format in turn."""
		# Extract video id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = mobj.group(2)

		# Downloader parameters
		best_quality = False
		all_formats = False
		format_param = None
		quality_index = 0
		if self._downloader is not None:
			params = self._downloader.params
			format_param = params.get('format', None)
			if format_param == '0':
				format_param = self._available_formats[quality_index]
				best_quality = True
			elif format_param == '-1':
				format_param = self._available_formats[quality_index]
				all_formats = True

		# One pass for a fixed format; multiple passes (via 'continue')
		# when stepping through _available_formats for -b or all-formats.
		while True:
			# Extension
			video_extension = self._video_extensions.get(format_param, 'flv')

			# Get video webpage
			self.report_video_webpage_download(video_id)
			request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en' % video_id, None, std_headers)
			try:
				video_webpage = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
				return

			# Attempt to extract SWF player URL
			mobj = re.search(r'swfConfig.*"(http://.*?watch-.*?\.swf)"', video_webpage)
			if mobj is not None:
				player_url = mobj.group(1)
			else:
				player_url = None

			# Get video info
			# Try several 'el' values; stop at the first response that
			# contains a 'token' entry.
			self.report_video_info_webpage_download(video_id)
			for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
				video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
						   % (video_id, el_type))
				request = urllib2.Request(video_info_url, None, std_headers)
				try:
					video_info_webpage = urllib2.urlopen(request).read()
					video_info = parse_qs(video_info_webpage)
					if 'token' in video_info:
						break
				except (urllib2.URLError, httplib.HTTPException, socket.error), err:
					self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
					return
			self.report_information_extraction(video_id)

			# "t" param
			if 'token' not in video_info:
				# Attempt to see if YouTube has issued an error message
				if 'reason' not in video_info:
					self._downloader.trouble(u'ERROR: unable to extract "t" parameter for unknown reason')
					# Dump the raw response to a local file so the
					# user can attach it to a bug report.
					stream = open('reportme-ydl-%s.dat' % time.time(), 'wb')
					stream.write(video_info_webpage)
					stream.close()
				else:
					reason = urllib.unquote_plus(video_info['reason'][0])
					self._downloader.trouble(u'ERROR: YouTube said: %s' % reason.decode('utf-8'))
				return
			token = urllib.unquote_plus(video_info['token'][0])
			video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=detailpage&ps=default&gl=US&hl=en' % (video_id, token)
			if format_param is not None:
				video_real_url = '%s&fmt=%s' % (video_real_url, format_param)

			# Check possible RTMP download
			if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
				self.report_rtmp_download()
				video_real_url = video_info['conn'][0]

			# uploader
			if 'author' not in video_info:
				self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
				return
			video_uploader = urllib.unquote_plus(video_info['author'][0])

			# title
			if 'title' not in video_info:
				self._downloader.trouble(u'ERROR: unable to extract video title')
				return
			video_title = urllib.unquote_plus(video_info['title'][0])
			video_title = video_title.decode('utf-8')
			video_title = sanitize_title(video_title)

			# simplified title
			# Collapse every run of non-alphanumeric characters to '_'
			# and trim leading/trailing underscores.
			simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
			simple_title = simple_title.strip(ur'_')

			# thumbnail image
			if 'thumbnail_url' not in video_info:
				self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
				video_thumbnail = ''
			else:	# don't panic if we can't find it
				video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

			# description
			video_description = 'No description available.'
			if self._downloader.params.get('forcedescription', False):
				mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
				if mobj is not None:
					video_description = mobj.group(1)

			try:
				# Process video information
				self._downloader.process_info({
					'id':		video_id.decode('utf-8'),
					'url':		video_real_url.decode('utf-8'),
					'uploader':	video_uploader.decode('utf-8'),
					'title':	video_title,
					'stitle':	simple_title,
					'ext':		video_extension.decode('utf-8'),
					'format':	(format_param is None and u'NA' or format_param.decode('utf-8')),
					'thumbnail':	video_thumbnail.decode('utf-8'),
					'description':	video_description.decode('utf-8'),
					'player_url':	player_url,
				})

				if all_formats:
					quality_index += 1
					if quality_index == len(self._available_formats):
						# None left to get
						return
					else:
						format_param = self._available_formats[quality_index]
						continue
				return

			except UnavailableFormatError, err:
				# With -b or all-formats, fall back to the next format
				# in the priority list; otherwise give up.
				if best_quality or all_formats:
					quality_index += 1
					if quality_index == len(self._available_formats):
						# I don't ever expect this to happen
						if not all_formats:
							self._downloader.trouble(u'ERROR: no known formats available for video')
						return
					else:
						self.report_unavailable_format(video_id, format_param)
						format_param = self._available_formats[quality_index]
						continue
				else: 
					self._downloader.trouble('ERROR: format not available for video')
					return
956
957
958 class MetacafeIE(InfoExtractor):
959         """Information Extractor for metacafe.com."""
960
961         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
962         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
963         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
964         _youtube_ie = None
965
966         def __init__(self, youtube_ie, downloader=None):
967                 InfoExtractor.__init__(self, downloader)
968                 self._youtube_ie = youtube_ie
969
970         @staticmethod
971         def suitable(url):
972                 return (re.match(MetacafeIE._VALID_URL, url) is not None)
973
974         def report_disclaimer(self):
975                 """Report disclaimer retrieval."""
976                 self._downloader.to_stdout(u'[metacafe] Retrieving disclaimer')
977
978         def report_age_confirmation(self):
979                 """Report attempt to confirm age."""
980                 self._downloader.to_stdout(u'[metacafe] Confirming age')
981         
982         def report_download_webpage(self, video_id):
983                 """Report webpage download."""
984                 self._downloader.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)
985         
986         def report_extraction(self, video_id):
987                 """Report information extraction."""
988                 self._downloader.to_stdout(u'[metacafe] %s: Extracting information' % video_id)
989
990         def _real_initialize(self):
991                 # Retrieve disclaimer
992                 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
993                 try:
994                         self.report_disclaimer()
995                         disclaimer = urllib2.urlopen(request).read()
996                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
997                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
998                         return
999
1000                 # Confirm age
1001                 disclaimer_form = {
1002                         'filters': '0',
1003                         'submit': "Continue - I'm over 18",
1004                         }
1005                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
1006                 try:
1007                         self.report_age_confirmation()
1008                         disclaimer = urllib2.urlopen(request).read()
1009                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1010                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1011                         return
1012         
1013         def _real_extract(self, url):
1014                 # Extract id and simplified title from URL
1015                 mobj = re.match(self._VALID_URL, url)
1016                 if mobj is None:
1017                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1018                         return
1019
1020                 video_id = mobj.group(1)
1021
1022                 # Check if video comes from YouTube
1023                 mobj2 = re.match(r'^yt-(.*)$', video_id)
1024                 if mobj2 is not None:
1025                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1026                         return
1027
1028                 simple_title = mobj.group(2).decode('utf-8')
1029                 video_extension = 'flv'
1030
1031                 # Retrieve video webpage to extract further information
1032                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1033                 try:
1034                         self.report_download_webpage(video_id)
1035                         webpage = urllib2.urlopen(request).read()
1036                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1037                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1038                         return
1039
1040                 # Extract URL, uploader and title from webpage
1041                 self.report_extraction(video_id)
1042                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1043                 if mobj is None:
1044                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1045                         return
1046                 mediaURL = urllib.unquote(mobj.group(1))
1047
1048                 #mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1049                 #if mobj is None:
1050                 #       self._downloader.trouble(u'ERROR: unable to extract gdaKey')
1051                 #       return
1052                 #gdaKey = mobj.group(1)
1053                 #
1054                 #video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1055
1056                 video_url = mediaURL
1057
1058                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1059                 if mobj is None:
1060                         self._downloader.trouble(u'ERROR: unable to extract title')
1061                         return
1062                 video_title = mobj.group(1).decode('utf-8')
1063                 video_title = sanitize_title(video_title)
1064
1065                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1066                 if mobj is None:
1067                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1068                         return
1069                 video_uploader = mobj.group(1)
1070
1071                 try:
1072                         # Process video information
1073                         self._downloader.process_info({
1074                                 'id':           video_id.decode('utf-8'),
1075                                 'url':          video_url.decode('utf-8'),
1076                                 'uploader':     video_uploader.decode('utf-8'),
1077                                 'title':        video_title,
1078                                 'stitle':       simple_title,
1079                                 'ext':          video_extension.decode('utf-8'),
1080                                 'format':       u'NA',
1081                                 'player_url':   None,
1082                         })
1083                 except UnavailableFormatError:
1084                         self._downloader.trouble(u'ERROR: format not available for video')
1085
1086
class GoogleIE(InfoExtractor):
	"""Information extractor for video.google.com."""

	# Group 1 captures the docid (a signed integer in practice; see the
	# abs(int(video_id)) call in the thumbnail branch).
	_VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		InfoExtractor.__init__(self, downloader)

	@staticmethod
	def suitable(url):
		"""Return True if *url* looks like a Google Video URL."""
		return (re.match(GoogleIE._VALID_URL, url) is not None)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_stdout(u'[video.google] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_stdout(u'[video.google] %s: Extracting information' % video_id)

	def _real_initialize(self):
		# No site-wide setup (login/age check) is needed for Google Video.
		return

	def _real_extract(self, url):
		"""Extract video information for a Google Video URL and feed it
		to the downloader."""
		# Extract id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
			return

		video_id = mobj.group(1)

		video_extension = 'mp4'

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader, and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r"download_url:'([^']+)'", webpage)
		if mobj is None:
			# No direct mp4 download link; fall back to the embedded
			# flv URL, which is \x-escaped inside the page source.
			video_extension = 'flv'
			mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		mediaURL = urllib.unquote(mobj.group(1))
		# Unescape literal '\x3d' (=) and '\x26' (&) sequences.
		mediaURL = mediaURL.replace('\\x3d', '\x3d')
		mediaURL = mediaURL.replace('\\x26', '\x26')

		video_url = mediaURL

		mobj = re.search(r'<title>(.*)</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)
		# NOTE(review): unlike YoutubeIE, the simplified title is not
		# stripped of leading/trailing underscores here.
		simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

		# Extract video description
		mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract video description')
			return
		video_description = mobj.group(1).decode('utf-8')
		if not video_description:
			video_description = 'No description available.'

		# Extract video thumbnail
		if self._downloader.params.get('forcethumbnail', False):
			# The thumbnail only appears on search result pages, so a
			# second request is needed just for it.
			request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
			try:
				webpage = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
				return
			mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
				return
			video_thumbnail = mobj.group(1)
		else:	# we need something to pass to process_info
			video_thumbnail = ''


		try:
			# Process video information
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	u'NA',
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	u'NA',
				'player_url':	None,
			})
		except UnavailableFormatError:
			self._downloader.trouble(u'ERROR: format not available for video')
1192                         self._downloader.trouble(u'ERROR: format not available for video')
1193
1194
class PhotobucketIE(InfoExtractor):
	"""Information extractor for photobucket.com."""

	# Group 1 captures the .flv filename from the 'current' query
	# parameter; it doubles as the video id.
	_VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		InfoExtractor.__init__(self, downloader)

	@staticmethod
	def suitable(url):
		"""Return True if *url* looks like a Photobucket video URL."""
		return (re.match(PhotobucketIE._VALID_URL, url) is not None)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_stdout(u'[photobucket] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_stdout(u'[photobucket] %s: Extracting information' % video_id)

	def _real_initialize(self):
		# No site-wide setup (login/age check) is needed for Photobucket.
		return

	def _real_extract(self, url):
		"""Extract video information for a Photobucket URL and feed it
		to the downloader."""
		# Extract id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
			return

		video_id = mobj.group(1)

		video_extension = 'flv'

		# Retrieve video webpage to extract further information
		request = urllib2.Request(url)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader, and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		mediaURL = urllib.unquote(mobj.group(1))

		video_url = mediaURL

		# Title and uploader come from the same <title> tag.
		mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)
		# NOTE(review): unlike YoutubeIE, the simplified title is not
		# stripped of leading/trailing underscores here.
		simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

		video_uploader = mobj.group(2).decode('utf-8')

		try:
			# Process video information
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	video_uploader,
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	u'NA',
				'player_url':	None,
			})
		except UnavailableFormatError:
			self._downloader.trouble(u'ERROR: format not available for video')
1272
1273
1274 class YahooIE(InfoExtractor):
1275         """Information extractor for video.yahoo.com."""
1276
1277         # _VALID_URL matches all Yahoo! Video URLs
1278         # _VPAGE_URL matches only the extractable '/watch/' URLs
1279         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1280         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1281
1282         def __init__(self, downloader=None):
1283                 InfoExtractor.__init__(self, downloader)
1284
1285         @staticmethod
1286         def suitable(url):
1287                 return (re.match(YahooIE._VALID_URL, url) is not None)
1288
1289         def report_download_webpage(self, video_id):
1290                 """Report webpage download."""
1291                 self._downloader.to_stdout(u'[video.yahoo] %s: Downloading webpage' % video_id)
1292
1293         def report_extraction(self, video_id):
1294                 """Report information extraction."""
1295                 self._downloader.to_stdout(u'[video.yahoo] %s: Extracting information' % video_id)
1296
1297         def _real_initialize(self):
1298                 return
1299
1300         def _real_extract(self, url):
1301                 # Extract ID from URL
1302                 mobj = re.match(self._VALID_URL, url)
1303                 if mobj is None:
1304                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1305                         return
1306
1307                 video_id = mobj.group(2)
1308                 video_extension = 'flv'
1309
1310                 # Rewrite valid but non-extractable URLs as
1311                 # extractable English language /watch/ URLs
1312                 if re.match(self._VPAGE_URL, url) is None:
1313                         request = urllib2.Request(url)
1314                         try:
1315                                 webpage = urllib2.urlopen(request).read()
1316                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1317                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1318                                 return
1319
1320                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1321                         if mobj is None:
1322                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1323                                 return
1324                         yahoo_id = mobj.group(1)
1325
1326                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1327                         if mobj is None:
1328                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1329                                 return
1330                         yahoo_vid = mobj.group(1)
1331
1332                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1333                         return self._real_extract(url)
1334
1335                 # Retrieve video webpage to extract further information
1336                 request = urllib2.Request(url)
1337                 try:
1338                         self.report_download_webpage(video_id)
1339                         webpage = urllib2.urlopen(request).read()
1340                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1341                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1342                         return
1343
1344                 # Extract uploader and title from webpage
1345                 self.report_extraction(video_id)
1346                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1347                 if mobj is None:
1348                         self._downloader.trouble(u'ERROR: unable to extract video title')
1349                         return
1350                 video_title = mobj.group(1).decode('utf-8')
1351                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1352
1353                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1354                 if mobj is None:
1355                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1356                         return
1357                 video_uploader = mobj.group(1).decode('utf-8')
1358
1359                 # Extract video thumbnail
1360                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1361                 if mobj is None:
1362                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1363                         return
1364                 video_thumbnail = mobj.group(1).decode('utf-8')
1365
1366                 # Extract video description
1367                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1368                 if mobj is None:
1369                         self._downloader.trouble(u'ERROR: unable to extract video description')
1370                         return
1371                 video_description = mobj.group(1).decode('utf-8')
1372                 if not video_description: video_description = 'No description available.'
1373
1374                 # Extract video height and width
1375                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1376                 if mobj is None:
1377                         self._downloader.trouble(u'ERROR: unable to extract video height')
1378                         return
1379                 yv_video_height = mobj.group(1)
1380
1381                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1382                 if mobj is None:
1383                         self._downloader.trouble(u'ERROR: unable to extract video width')
1384                         return
1385                 yv_video_width = mobj.group(1)
1386
1387                 # Retrieve video playlist to extract media URL
1388                 # I'm not completely sure what all these options are, but we
1389                 # seem to need most of them, otherwise the server sends a 401.
1390                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1391                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1392                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1393                                           '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1394                                           '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1395                 try:
1396                         self.report_download_webpage(video_id)
1397                         webpage = urllib2.urlopen(request).read()
1398                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1399                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1400                         return
1401
1402                 # Extract media URL from playlist XML
1403                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1404                 if mobj is None:
1405                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1406                         return
1407                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1408                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1409
1410                 try:
1411                         # Process video information
1412                         self._downloader.process_info({
1413                                 'id':           video_id.decode('utf-8'),
1414                                 'url':          video_url,
1415                                 'uploader':     video_uploader,
1416                                 'title':        video_title,
1417                                 'stitle':       simple_title,
1418                                 'ext':          video_extension.decode('utf-8'),
1419                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1420                                 'description':  video_description,
1421                                 'thumbnail':    video_thumbnail,
1422                                 'description':  video_description,
1423                                 'player_url':   None,
1424                         })
1425                 except UnavailableFormatError:
1426                         self._downloader.trouble(u'ERROR: format not available for video')
1427
1428
1429 class GenericIE(InfoExtractor):
1430         """Generic last-resort information extractor."""
1431
1432         def __init__(self, downloader=None):
1433                 InfoExtractor.__init__(self, downloader)
1434
1435         @staticmethod
1436         def suitable(url):
1437                 return True
1438
1439         def report_download_webpage(self, video_id):
1440                 """Report webpage download."""
1441                 self._downloader.to_stdout(u'WARNING: Falling back on generic information extractor.')
1442                 self._downloader.to_stdout(u'[generic] %s: Downloading webpage' % video_id)
1443
1444         def report_extraction(self, video_id):
1445                 """Report information extraction."""
1446                 self._downloader.to_stdout(u'[generic] %s: Extracting information' % video_id)
1447
1448         def _real_initialize(self):
1449                 return
1450
1451         def _real_extract(self, url):
1452                 video_id = url.split('/')[-1]
1453                 request = urllib2.Request(url)
1454                 try:
1455                         self.report_download_webpage(video_id)
1456                         webpage = urllib2.urlopen(request).read()
1457                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1458                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1459                         return
1460                 except ValueError, err:
1461                         # since this is the last-resort InfoExtractor, if
1462                         # this error is thrown, it'll be thrown here
1463                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1464                         return
1465
1466                 # Start with something easy: JW Player in SWFObject
1467                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1468                 if mobj is None:
1469                         # Broaden the search a little bit
1470                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1471                 if mobj is None:
1472                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1473                         return
1474
1475                 # It's possible that one of the regexes
1476                 # matched, but returned an empty group:
1477                 if mobj.group(1) is None:
1478                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1479                         return
1480
1481                 video_url = urllib.unquote(mobj.group(1))
1482                 video_id  = os.path.basename(video_url)
1483
1484                 # here's a fun little line of code for you:
1485                 video_extension = os.path.splitext(video_id)[1][1:]
1486                 video_id        = os.path.splitext(video_id)[0]
1487
1488                 # it's tempting to parse this further, but you would
1489                 # have to take into account all the variations like
1490                 #   Video Title - Site Name
1491                 #   Site Name | Video Title
1492                 #   Video Title - Tagline | Site Name
1493                 # and so on and so forth; it's just not practical
1494                 mobj = re.search(r'<title>(.*)</title>', webpage)
1495                 if mobj is None:
1496                         self._downloader.trouble(u'ERROR: unable to extract title')
1497                         return
1498                 video_title = mobj.group(1).decode('utf-8')
1499                 video_title = sanitize_title(video_title)
1500                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1501
1502                 # video uploader is domain name
1503                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1504                 if mobj is None:
1505                         self._downloader.trouble(u'ERROR: unable to extract title')
1506                         return
1507                 video_uploader = mobj.group(1).decode('utf-8')
1508
1509                 try:
1510                         # Process video information
1511                         self._downloader.process_info({
1512                                 'id':           video_id.decode('utf-8'),
1513                                 'url':          video_url.decode('utf-8'),
1514                                 'uploader':     video_uploader,
1515                                 'title':        video_title,
1516                                 'stitle':       simple_title,
1517                                 'ext':          video_extension.decode('utf-8'),
1518                                 'format':       u'NA',
1519                                 'player_url':   None,
1520                         })
1521                 except UnavailableFormatError:
1522                         self._downloader.trouble(u'ERROR: format not available for video')
1523
1524
1525 class YoutubeSearchIE(InfoExtractor):
1526         """Information Extractor for YouTube search queries."""
1527         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1528         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1529         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1530         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1531         _youtube_ie = None
1532         _max_youtube_results = 1000
1533
1534         def __init__(self, youtube_ie, downloader=None):
1535                 InfoExtractor.__init__(self, downloader)
1536                 self._youtube_ie = youtube_ie
1537         
1538         @staticmethod
1539         def suitable(url):
1540                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1541
1542         def report_download_page(self, query, pagenum):
1543                 """Report attempt to download playlist page with given number."""
1544                 query = query.decode(preferredencoding())
1545                 self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1546
1547         def _real_initialize(self):
1548                 self._youtube_ie.initialize()
1549         
1550         def _real_extract(self, query):
1551                 mobj = re.match(self._VALID_QUERY, query)
1552                 if mobj is None:
1553                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1554                         return
1555
1556                 prefix, query = query.split(':')
1557                 prefix = prefix[8:]
1558                 query  = query.encode('utf-8')
1559                 if prefix == '':
1560                         self._download_n_results(query, 1)
1561                         return
1562                 elif prefix == 'all':
1563                         self._download_n_results(query, self._max_youtube_results)
1564                         return
1565                 else:
1566                         try:
1567                                 n = long(prefix)
1568                                 if n <= 0:
1569                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1570                                         return
1571                                 elif n > self._max_youtube_results:
1572                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
1573                                         n = self._max_youtube_results
1574                                 self._download_n_results(query, n)
1575                                 return
1576                         except ValueError: # parsing prefix as integer fails
1577                                 self._download_n_results(query, 1)
1578                                 return
1579
1580         def _download_n_results(self, query, n):
1581                 """Downloads a specified number of results for a query"""
1582
1583                 video_ids = []
1584                 already_seen = set()
1585                 pagenum = 1
1586
1587                 while True:
1588                         self.report_download_page(query, pagenum)
1589                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1590                         request = urllib2.Request(result_url, None, std_headers)
1591                         try:
1592                                 page = urllib2.urlopen(request).read()
1593                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1594                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1595                                 return
1596
1597                         # Extract video identifiers
1598                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1599                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1600                                 if video_id not in already_seen:
1601                                         video_ids.append(video_id)
1602                                         already_seen.add(video_id)
1603                                         if len(video_ids) == n:
1604                                                 # Specified n videos reached
1605                                                 for id in video_ids:
1606                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1607                                                 return
1608
1609                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1610                                 for id in video_ids:
1611                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1612                                 return
1613
1614                         pagenum = pagenum + 1
1615
1616 class GoogleSearchIE(InfoExtractor):
1617         """Information Extractor for Google Video search queries."""
1618         _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
1619         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1620         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
1621         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
1622         _google_ie = None
1623         _max_google_results = 1000
1624
1625         def __init__(self, google_ie, downloader=None):
1626                 InfoExtractor.__init__(self, downloader)
1627                 self._google_ie = google_ie
1628         
1629         @staticmethod
1630         def suitable(url):
1631                 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
1632
1633         def report_download_page(self, query, pagenum):
1634                 """Report attempt to download playlist page with given number."""
1635                 query = query.decode(preferredencoding())
1636                 self._downloader.to_stdout(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1637
1638         def _real_initialize(self):
1639                 self._google_ie.initialize()
1640         
1641         def _real_extract(self, query):
1642                 mobj = re.match(self._VALID_QUERY, query)
1643                 if mobj is None:
1644                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1645                         return
1646
1647                 prefix, query = query.split(':')
1648                 prefix = prefix[8:]
1649                 query  = query.encode('utf-8')
1650                 if prefix == '':
1651                         self._download_n_results(query, 1)
1652                         return
1653                 elif prefix == 'all':
1654                         self._download_n_results(query, self._max_google_results)
1655                         return
1656                 else:
1657                         try:
1658                                 n = long(prefix)
1659                                 if n <= 0:
1660                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1661                                         return
1662                                 elif n > self._max_google_results:
1663                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)'  % (self._max_google_results, n))
1664                                         n = self._max_google_results
1665                                 self._download_n_results(query, n)
1666                                 return
1667                         except ValueError: # parsing prefix as integer fails
1668                                 self._download_n_results(query, 1)
1669                                 return
1670
1671         def _download_n_results(self, query, n):
1672                 """Downloads a specified number of results for a query"""
1673
1674                 video_ids = []
1675                 already_seen = set()
1676                 pagenum = 1
1677
1678                 while True:
1679                         self.report_download_page(query, pagenum)
1680                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1681                         request = urllib2.Request(result_url, None, std_headers)
1682                         try:
1683                                 page = urllib2.urlopen(request).read()
1684                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1685                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1686                                 return
1687
1688                         # Extract video identifiers
1689                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1690                                 video_id = mobj.group(1)
1691                                 if video_id not in already_seen:
1692                                         video_ids.append(video_id)
1693                                         already_seen.add(video_id)
1694                                         if len(video_ids) == n:
1695                                                 # Specified n videos reached
1696                                                 for id in video_ids:
1697                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1698                                                 return
1699
1700                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1701                                 for id in video_ids:
1702                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1703                                 return
1704
1705                         pagenum = pagenum + 1
1706
1707 class YahooSearchIE(InfoExtractor):
1708         """Information Extractor for Yahoo! Video search queries."""
1709         _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
1710         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1711         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1712         _MORE_PAGES_INDICATOR = r'\s*Next'
1713         _yahoo_ie = None
1714         _max_yahoo_results = 1000
1715
1716         def __init__(self, yahoo_ie, downloader=None):
1717                 InfoExtractor.__init__(self, downloader)
1718                 self._yahoo_ie = yahoo_ie
1719         
1720         @staticmethod
1721         def suitable(url):
1722                 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
1723
1724         def report_download_page(self, query, pagenum):
1725                 """Report attempt to download playlist page with given number."""
1726                 query = query.decode(preferredencoding())
1727                 self._downloader.to_stdout(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1728
1729         def _real_initialize(self):
1730                 self._yahoo_ie.initialize()
1731         
1732         def _real_extract(self, query):
1733                 mobj = re.match(self._VALID_QUERY, query)
1734                 if mobj is None:
1735                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1736                         return
1737
1738                 prefix, query = query.split(':')
1739                 prefix = prefix[8:]
1740                 query  = query.encode('utf-8')
1741                 if prefix == '':
1742                         self._download_n_results(query, 1)
1743                         return
1744                 elif prefix == 'all':
1745                         self._download_n_results(query, self._max_yahoo_results)
1746                         return
1747                 else:
1748                         try:
1749                                 n = long(prefix)
1750                                 if n <= 0:
1751                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1752                                         return
1753                                 elif n > self._max_yahoo_results:
1754                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)'  % (self._max_yahoo_results, n))
1755                                         n = self._max_yahoo_results
1756                                 self._download_n_results(query, n)
1757                                 return
1758                         except ValueError: # parsing prefix as integer fails
1759                                 self._download_n_results(query, 1)
1760                                 return
1761
1762         def _download_n_results(self, query, n):
1763                 """Downloads a specified number of results for a query"""
1764
1765                 video_ids = []
1766                 already_seen = set()
1767                 pagenum = 1
1768
1769                 while True:
1770                         self.report_download_page(query, pagenum)
1771                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1772                         request = urllib2.Request(result_url, None, std_headers)
1773                         try:
1774                                 page = urllib2.urlopen(request).read()
1775                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1776                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1777                                 return
1778
1779                         # Extract video identifiers
1780                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1781                                 video_id = mobj.group(1)
1782                                 if video_id not in already_seen:
1783                                         video_ids.append(video_id)
1784                                         already_seen.add(video_id)
1785                                         if len(video_ids) == n:
1786                                                 # Specified n videos reached
1787                                                 for id in video_ids:
1788                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1789                                                 return
1790
1791                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1792                                 for id in video_ids:
1793                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1794                                 return
1795
1796                         pagenum = pagenum + 1
1797
1798 class YoutubePlaylistIE(InfoExtractor):
1799         """Information Extractor for YouTube playlists."""
1800
1801         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/)([^&]+).*'
1802         _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
1803         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
1804         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1805         _youtube_ie = None
1806
1807         def __init__(self, youtube_ie, downloader=None):
1808                 InfoExtractor.__init__(self, downloader)
1809                 self._youtube_ie = youtube_ie
1810         
1811         @staticmethod
1812         def suitable(url):
1813                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
1814
1815         def report_download_page(self, playlist_id, pagenum):
1816                 """Report attempt to download playlist page with given number."""
1817                 self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1818
1819         def _real_initialize(self):
1820                 self._youtube_ie.initialize()
1821         
1822         def _real_extract(self, url):
1823                 # Extract playlist id
1824                 mobj = re.match(self._VALID_URL, url)
1825                 if mobj is None:
1826                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1827                         return
1828
1829                 # Download playlist pages
1830                 playlist_id = mobj.group(1)
1831                 video_ids = []
1832                 pagenum = 1
1833
1834                 while True:
1835                         self.report_download_page(playlist_id, pagenum)
1836                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
1837                         try:
1838                                 page = urllib2.urlopen(request).read()
1839                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1840                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1841                                 return
1842
1843                         # Extract video identifiers
1844                         ids_in_page = []
1845                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1846                                 if mobj.group(1) not in ids_in_page:
1847                                         ids_in_page.append(mobj.group(1))
1848                         video_ids.extend(ids_in_page)
1849
1850                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1851                                 break
1852                         pagenum = pagenum + 1
1853
1854                 for id in video_ids:
1855                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1856                 return
1857
1858 class YoutubeUserIE(InfoExtractor):
1859         """Information Extractor for YouTube users."""
1860
1861         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
1862         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1863         _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
1864         _youtube_ie = None
1865
1866         def __init__(self, youtube_ie, downloader=None):
1867                 InfoExtractor.__init__(self, downloader)
1868                 self._youtube_ie = youtube_ie
1869         
1870         @staticmethod
1871         def suitable(url):
1872                 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
1873
1874         def report_download_page(self, username):
1875                 """Report attempt to download user page."""
1876                 self._downloader.to_stdout(u'[youtube] user %s: Downloading page ' % (username))
1877
1878         def _real_initialize(self):
1879                 self._youtube_ie.initialize()
1880         
1881         def _real_extract(self, url):
1882                 # Extract username
1883                 mobj = re.match(self._VALID_URL, url)
1884                 if mobj is None:
1885                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1886                         return
1887
1888                 # Download user page
1889                 username = mobj.group(1)
1890                 video_ids = []
1891                 pagenum = 1
1892
1893                 self.report_download_page(username)
1894                 request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
1895                 try:
1896                         page = urllib2.urlopen(request).read()
1897                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1898                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1899                         return
1900
1901                 # Extract video identifiers
1902                 ids_in_page = []
1903
1904                 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1905                         if mobj.group(1) not in ids_in_page:
1906                                 ids_in_page.append(mobj.group(1))
1907                 video_ids.extend(ids_in_page)
1908
1909                 for id in video_ids:
1910                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1911                 return
1912
class PostProcessor(object):
	"""Base class for post-processing steps.

	Instances are registered on a downloader with its
	add_post_processor() method. After a successful download, the
	downloader feeds an information dictionary through each registered
	PostProcessor in turn: every processor receives the value returned
	by the previous one, and the chain stops as soon as a processor
	returns None or the end of the chain is reached.

	Like InfoExtractor objects, PostProcessors follow a "mutual
	registration" scheme, keeping a reference back to their downloader.
	"""

	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Attach this post processor to the given downloader."""
		self._downloader = downloader

	def run(self, information):
		"""Process one downloaded file.

		"information" is a dictionary in the same format the
		InfoExtractors produce, extended with a "filepath" key that
		names the file which was just downloaded.

		Returning None halts the post-processing chain; returning an
		information dictionary (possibly the received one with some
		fields changed) passes it on to the next processor in the
		chain. Implementations may also raise PostProcessingError,
		which the calling downloader takes into account.
		"""
		# The base class is a no-op: pass the information through untouched.
		return information
1958         
### MAIN PROGRAM ###
if __name__ == '__main__':
	try:
		# Modules needed only when running the main program
		import getpass
		import optparse

		# Function to update the program file with the latest version from bitbucket.org
		def update_self(downloader, filename):
			"""Overwrite this script in place with the latest stable release.

			"downloader" is used only for its output facilities;
			"filename" is the path of the running script (sys.argv[0]).
			"""
			# Note: downloader only used for options
			if not os.access (filename, os.W_OK):
				sys.exit('ERROR: no write permissions on %s' % filename)

			downloader.to_stdout('Updating to latest stable version...')
			# LATEST_VERSION holds the tag name of the newest release
			latest_url = 'http://bitbucket.org/rg3/youtube-dl/raw/tip/LATEST_VERSION'
			latest_version = urllib.urlopen(latest_url).read().strip()
			prog_url = 'http://bitbucket.org/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
			newcontent = urllib.urlopen(prog_url).read()
			stream = open(filename, 'w')
			stream.write(newcontent)
			stream.close()
			downloader.to_stdout('Updated to version %s' % latest_version)

		# General configuration
		# NOTE(review): the second install_opener call replaces the
		# proxy-enabled opener installed on the line before it with a
		# cookie-enabled one — confirm this is intentional.
		urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
		urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
		socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

		# Parse command line
		parser = optparse.OptionParser(
			usage='Usage: %prog [options] url...',
			version='2010.06.06', # date-based release version
			conflict_handler='resolve',
		)

		parser.add_option('-h', '--help',
				action='help', help='print this help text and exit')
		parser.add_option('-v', '--version',
				action='version', help='print program version and exit')
		parser.add_option('-U', '--update',
				action='store_true', dest='update_self', help='update this program to latest stable version')
		parser.add_option('-i', '--ignore-errors',
				action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
		parser.add_option('-r', '--rate-limit',
				dest='ratelimit', metavar='L', help='download rate limit (e.g. 50k or 44.6m)')
		parser.add_option('-R', '--retries',
				dest='retries', metavar='T', help='number of retries (default is 10)', default=10)

		authentication = optparse.OptionGroup(parser, 'Authentication Options')
		authentication.add_option('-u', '--username',
				dest='username', metavar='UN', help='account username')
		authentication.add_option('-p', '--password',
				dest='password', metavar='PW', help='account password')
		authentication.add_option('-n', '--netrc',
				action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
		parser.add_option_group(authentication)

		video_format = optparse.OptionGroup(parser, 'Video Format Options')
		video_format.add_option('-f', '--format',
				action='store', dest='format', metavar='FMT', help='video format code')
		video_format.add_option('-b', '--best-quality',
				action='store_const', dest='format', help='download the best quality video possible', const='0')
		video_format.add_option('-m', '--mobile-version',
				action='store_const', dest='format', help='alias for -f 17', const='17')
		video_format.add_option('-d', '--high-def',
				action='store_const', dest='format', help='alias for -f 22', const='22')
		video_format.add_option('--all-formats',
				action='store_const', dest='format', help='download all available video formats', const='-1')
		parser.add_option_group(video_format)

		verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
		verbosity.add_option('-q', '--quiet',
				action='store_true', dest='quiet', help='activates quiet mode', default=False)
		verbosity.add_option('-s', '--simulate',
				action='store_true', dest='simulate', help='do not download video', default=False)
		verbosity.add_option('-g', '--get-url',
				action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
		verbosity.add_option('-e', '--get-title',
				action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
		verbosity.add_option('--get-thumbnail',
				action='store_true', dest='getthumbnail', help='simulate, quiet but print thumbnail URL', default=False)
		verbosity.add_option('--get-description',
				action='store_true', dest='getdescription', help='simulate, quiet but print video description', default=False)
		verbosity.add_option('--no-progress',
				action='store_true', dest='noprogress', help='do not print progress bar', default=False)
		parser.add_option_group(verbosity)

		filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
		filesystem.add_option('-t', '--title',
				action='store_true', dest='usetitle', help='use title in file name', default=False)
		filesystem.add_option('-l', '--literal',
				action='store_true', dest='useliteral', help='use literal title in file name', default=False)
		filesystem.add_option('-o', '--output',
				dest='outtmpl', metavar='TPL', help='output filename template')
		filesystem.add_option('-a', '--batch-file',
				dest='batchfile', metavar='F', help='file containing URLs to download (\'-\' for stdin)')
		filesystem.add_option('-w', '--no-overwrites',
				action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
		filesystem.add_option('-c', '--continue',
				action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
		parser.add_option_group(filesystem)

		(opts, args) = parser.parse_args()

		# Batch file verification: read one URL per line, dropping blanks
		batchurls = []
		if opts.batchfile is not None:
			try:
				if opts.batchfile == '-':
					batchfd = sys.stdin
				else:
					batchfd = open(opts.batchfile, 'r')
				batchurls = batchfd.readlines()
				batchurls = [x.strip() for x in batchurls]
				batchurls = [x for x in batchurls if len(x) > 0]
			except IOError:
				sys.exit(u'ERROR: batch file could not be read')
		all_urls = batchurls + args

		# Conflicting, missing and erroneous options
		if opts.usenetrc and (opts.username is not None or opts.password is not None):
			parser.error(u'using .netrc conflicts with giving username/password')
		if opts.password is not None and opts.username is None:
			parser.error(u'account username missing')
		if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
			parser.error(u'using output template conflicts with using title or literal title')
		if opts.usetitle and opts.useliteral:
			parser.error(u'using title conflicts with using literal title')
		if opts.username is not None and opts.password is None:
			# Prompt interactively so the password never appears in argv
			opts.password = getpass.getpass(u'Type account password and press return:')
		if opts.ratelimit is not None:
			numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
			if numeric_limit is None:
				parser.error(u'invalid rate limit specified')
			opts.ratelimit = numeric_limit
		if opts.retries is not None:
			try:
				opts.retries = long(opts.retries)
			except (TypeError, ValueError), err:
				parser.error(u'invalid retry count specified')

		# Information extractors
		youtube_ie = YoutubeIE()
		metacafe_ie = MetacafeIE(youtube_ie)
		youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
		youtube_user_ie = YoutubeUserIE(youtube_ie)
		youtube_search_ie = YoutubeSearchIE(youtube_ie)
		google_ie = GoogleIE()
		google_search_ie = GoogleSearchIE(google_ie)
		photobucket_ie = PhotobucketIE()
		yahoo_ie = YahooIE()
		yahoo_search_ie = YahooSearchIE(yahoo_ie)
		generic_ie = GenericIE()

		# File downloader
		fd = FileDownloader({
			'usenetrc': opts.usenetrc,
			'username': opts.username,
			'password': opts.password,
			'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
			'forceurl': opts.geturl,
			'forcetitle': opts.gettitle,
			'forcethumbnail': opts.getthumbnail,
			'forcedescription': opts.getdescription,
			'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
			'format': opts.format,
			# The and/or chain below picks the first applicable output
			# template (a Python 2 idiom predating conditional expressions):
			# explicit -o template, then -t/-l variants (with the format
			# code added when --all-formats is active), then plain id.ext.
			'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
				or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
				or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
				or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
				or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
				or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
				or u'%(id)s.%(ext)s'),
			'ignoreerrors': opts.ignoreerrors,
			'ratelimit': opts.ratelimit,
			'nooverwrites': opts.nooverwrites,
			'retries': opts.retries,
			'continuedl': opts.continue_dl,
			'noprogress': opts.noprogress,
			})
		# Registration order matters: more specific extractors first
		fd.add_info_extractor(youtube_search_ie)
		fd.add_info_extractor(youtube_pl_ie)
		fd.add_info_extractor(youtube_user_ie)
		fd.add_info_extractor(metacafe_ie)
		fd.add_info_extractor(youtube_ie)
		fd.add_info_extractor(google_ie)
		fd.add_info_extractor(google_search_ie)
		fd.add_info_extractor(photobucket_ie)
		fd.add_info_extractor(yahoo_ie)
		fd.add_info_extractor(yahoo_search_ie)

		# This must come last since it's the
		# fallback if none of the others work
		fd.add_info_extractor(generic_ie)

		# Update version
		if opts.update_self:
			update_self(fd, sys.argv[0])

		# Maybe do nothing
		if len(all_urls) < 1:
			if not opts.update_self:
				parser.error(u'you must provide at least one URL')
			else:
				sys.exit()
		retcode = fd.download(all_urls)
		sys.exit(retcode)

	except DownloadError:
		sys.exit(1)
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')