Fix for the all-formats exception, by Valentin Hilbig
[youtube-dl] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # License: Public domain code
7 import htmlentitydefs
8 import httplib
9 import locale
10 import math
11 import netrc
12 import os
13 import os.path
14 import re
15 import socket
16 import string
17 import subprocess
18 import sys
19 import time
20 import urllib
21 import urllib2
22
23 # parse_qs was moved from the cgi module to the urlparse module recently.
24 try:
25         from urlparse import parse_qs
26 except ImportError:
27         from cgi import parse_qs
28
# HTTP headers sent with every request so the video services treat the
# downloader like a regular browser (Firefox 3.6 at the time of writing).
std_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2) Gecko/20100115 Firefox/3.6',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
        'Accept-Language': 'en-us,en;q=0.5',
}

# Characters allowed in a "simplified title": ASCII letters and digits,
# as a unicode string (Python 2 str.decode yields unicode).
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
37
def preferredencoding():
        """Get preferred encoding.

        Returns the best encoding scheme for the system, based on
        locale.getpreferredencoding() and some further tweaks.
        """
        try:
                pref = locale.getpreferredencoding()
                # Verify the reported codec actually works; a broken locale
                # can name an encoding the codecs machinery cannot find.
                u'TEST'.encode(pref)
        except (ValueError, LookupError, locale.Error):
                # Narrowed from a bare "except:" so unrelated errors
                # (KeyboardInterrupt, typos, ...) are no longer swallowed.
                pref = 'UTF-8'
        # The original wrapped this in a one-shot generator as a Python 2
        # scoping workaround; returning the value directly is equivalent.
        return pref
53
54 def htmlentity_transform(matchobj):
55         """Transforms an HTML entity to a Unicode character.
56         
57         This function receives a match object and is intended to be used with
58         the re.sub() function.
59         """
60         entity = matchobj.group(1)
61
62         # Known non-numeric HTML entity
63         if entity in htmlentitydefs.name2codepoint:
64                 return unichr(htmlentitydefs.name2codepoint[entity])
65
66         # Unicode character
67         mobj = re.match(ur'(?u)#(x?\d+)', entity)
68         if mobj is not None:
69                 numstr = mobj.group(1)
70                 if numstr.startswith(u'x'):
71                         base = 16
72                         numstr = u'0%s' % numstr
73                 else:
74                         base = 10
75                 return unichr(long(numstr, base))
76
77         # Unknown entity in name, return its literal representation
78         return (u'&%s;' % entity)
79
80 def sanitize_title(utitle):
81         """Sanitizes a video title so it could be used as part of a filename."""
82         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
83         return utitle.replace(unicode(os.sep), u'%')
84
85 def sanitize_open(filename, open_mode):
86         """Try to open the given filename, and slightly tweak it if this fails.
87
88         Attempts to open the given filename. If this fails, it tries to change
89         the filename slightly, step by step, until it's either able to open it
90         or it fails and raises a final exception, like the standard open()
91         function.
92
93         It returns the tuple (stream, definitive_file_name).
94         """
95         try:
96                 if filename == u'-':
97                         return (sys.stdout, filename)
98                 stream = open(filename, open_mode)
99                 return (stream, filename)
100         except (IOError, OSError), err:
101                 # In case of error, try to remove win32 forbidden chars
102                 filename = re.sub(ur'[<>:"\|\?\*]', u'#', filename)
103
104                 # An exception here should be caught in the caller
105                 stream = open(filename, open_mode)
106                 return (stream, filename)
107
108
class DownloadError(Exception):
        """Download Error exception.

        Thrown by FileDownloader objects that are not configured to continue
        on errors; carries the appropriate error message.
        """
117
class SameFileError(Exception):
        """Same File exception.

        Thrown by FileDownloader objects when they detect that multiple files
        would have to be downloaded to the same file on disk.
        """
125
class PostProcessingError(Exception):
        """Post Processing exception.

        Raised by a PostProcessor's .run() method to signal an error in the
        postprocessing task.
        """
133
class UnavailableFormatError(Exception):
        """Unavailable Format exception.

        Thrown when a video is requested in a format that is not available
        for that video.
        """
141
class ContentTooShortError(Exception):
        """Content Too Short exception.

        Raised by FileDownloader objects when a file they download is smaller
        than the size the server announced first, which indicates the
        connection was probably interrupted.
        """
        # Both counters are measured in bytes.
        downloaded = None
        expected = None

        def __init__(self, downloaded, expected):
                self.downloaded = downloaded
                self.expected = expected
156
157 class FileDownloader(object):
158         """File Downloader class.
159
160         File downloader objects are the ones responsible of downloading the
161         actual video file and writing it to disk if the user has requested
162         it, among some other tasks. In most cases there should be one per
163         program. As, given a video URL, the downloader doesn't know how to
164         extract all the needed information, task that InfoExtractors do, it
165         has to pass the URL to one of them.
166
167         For this, file downloader objects have a method that allows
168         InfoExtractors to be registered in a given order. When it is passed
169         a URL, the file downloader handles it to the first InfoExtractor it
170         finds that reports being able to handle it. The InfoExtractor extracts
171         all the information about the video or videos the URL refers to, and
172         asks the FileDownloader to process the video information, possibly
173         downloading the video.
174
175         File downloaders accept a lot of parameters. In order not to saturate
176         the object constructor with arguments, it receives a dictionary of
177         options instead. These options are available through the params
178         attribute for the InfoExtractors to use. The FileDownloader also
179         registers itself as the downloader in charge for the InfoExtractors
180         that are added to it, so this is a "mutual registration".
181
182         Available options:
183
184         username:       Username for authentication purposes.
185         password:       Password for authentication purposes.
186         usenetrc:       Use netrc for authentication instead.
187         quiet:          Do not print messages to stdout.
188         forceurl:       Force printing final URL.
189         forcetitle:     Force printing title.
190         simulate:       Do not download the video files.
191         format:         Video format code.
192         outtmpl:        Template for output names.
193         ignoreerrors:   Do not stop on download errors.
194         ratelimit:      Download speed limit, in bytes/sec.
195         nooverwrites:   Prevent overwriting files.
196         retries:        Number of times to retry for HTTP error 503
197         continuedl:     Try to continue downloads if possible.
198         noprogress:     Do not print the progress bar.
199         """
200
201         params = None
202         _ies = []
203         _pps = []
204         _download_retcode = None
205         _num_downloads = None
206
207         def __init__(self, params):
208                 """Create a FileDownloader object with the given options."""
209                 self._ies = []
210                 self._pps = []
211                 self._download_retcode = 0
212                 self._num_downloads = 0
213                 self.params = params
214         
215         @staticmethod
216         def pmkdir(filename):
217                 """Create directory components in filename. Similar to Unix "mkdir -p"."""
218                 components = filename.split(os.sep)
219                 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
220                 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
221                 for dir in aggregate:
222                         if not os.path.exists(dir):
223                                 os.mkdir(dir)
224         
225         @staticmethod
226         def format_bytes(bytes):
227                 if bytes is None:
228                         return 'N/A'
229                 if type(bytes) is str:
230                         bytes = float(bytes)
231                 if bytes == 0.0:
232                         exponent = 0
233                 else:
234                         exponent = long(math.log(bytes, 1024.0))
235                 suffix = 'bkMGTPEZY'[exponent]
236                 converted = float(bytes) / float(1024**exponent)
237                 return '%.2f%s' % (converted, suffix)
238
239         @staticmethod
240         def calc_percent(byte_counter, data_len):
241                 if data_len is None:
242                         return '---.-%'
243                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
244
245         @staticmethod
246         def calc_eta(start, now, total, current):
247                 if total is None:
248                         return '--:--'
249                 dif = now - start
250                 if current == 0 or dif < 0.001: # One millisecond
251                         return '--:--'
252                 rate = float(current) / dif
253                 eta = long((float(total) - float(current)) / rate)
254                 (eta_mins, eta_secs) = divmod(eta, 60)
255                 if eta_mins > 99:
256                         return '--:--'
257                 return '%02d:%02d' % (eta_mins, eta_secs)
258
259         @staticmethod
260         def calc_speed(start, now, bytes):
261                 dif = now - start
262                 if bytes == 0 or dif < 0.001: # One millisecond
263                         return '%10s' % '---b/s'
264                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
265
266         @staticmethod
267         def best_block_size(elapsed_time, bytes):
268                 new_min = max(bytes / 2.0, 1.0)
269                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
270                 if elapsed_time < 0.001:
271                         return long(new_max)
272                 rate = bytes / elapsed_time
273                 if rate > new_max:
274                         return long(new_max)
275                 if rate < new_min:
276                         return long(new_min)
277                 return long(rate)
278
279         @staticmethod
280         def parse_bytes(bytestr):
281                 """Parse a string indicating a byte quantity into a long integer."""
282                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
283                 if matchobj is None:
284                         return None
285                 number = float(matchobj.group(1))
286                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
287                 return long(round(number * multiplier))
288
289         @staticmethod
290         def verify_url(url):
291                 """Verify a URL is valid and data could be downloaded. Return real data URL."""
292                 request = urllib2.Request(url, None, std_headers)
293                 data = urllib2.urlopen(request)
294                 data.read(1)
295                 url = data.geturl()
296                 data.close()
297                 return url
298
299         def add_info_extractor(self, ie):
300                 """Add an InfoExtractor object to the end of the list."""
301                 self._ies.append(ie)
302                 ie.set_downloader(self)
303         
304         def add_post_processor(self, pp):
305                 """Add a PostProcessor object to the end of the chain."""
306                 self._pps.append(pp)
307                 pp.set_downloader(self)
308         
309         def to_stdout(self, message, skip_eol=False, ignore_encoding_errors=False):
310                 """Print message to stdout if not in quiet mode."""
311                 try:
312                         if not self.params.get('quiet', False):
313                                 print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(preferredencoding()),
314                         sys.stdout.flush()
315                 except (UnicodeEncodeError), err:
316                         if not ignore_encoding_errors:
317                                 raise
318         
319         def to_stderr(self, message):
320                 """Print message to stderr."""
321                 print >>sys.stderr, message.encode(preferredencoding())
322         
323         def fixed_template(self):
324                 """Checks if the output template is fixed."""
325                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
326
327         def trouble(self, message=None):
328                 """Determine action to take when a download problem appears.
329
330                 Depending on if the downloader has been configured to ignore
331                 download errors or not, this method may throw an exception or
332                 not when errors are found, after printing the message.
333                 """
334                 if message is not None:
335                         self.to_stderr(message)
336                 if not self.params.get('ignoreerrors', False):
337                         raise DownloadError(message)
338                 self._download_retcode = 1
339
340         def slow_down(self, start_time, byte_counter):
341                 """Sleep if the download speed is over the rate limit."""
342                 rate_limit = self.params.get('ratelimit', None)
343                 if rate_limit is None or byte_counter == 0:
344                         return
345                 now = time.time()
346                 elapsed = now - start_time
347                 if elapsed <= 0.0:
348                         return
349                 speed = float(byte_counter) / elapsed
350                 if speed > rate_limit:
351                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
352
353         def report_destination(self, filename):
354                 """Report destination filename."""
355                 self.to_stdout(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
356         
357         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
358                 """Report download progress."""
359                 if self.params.get('noprogress', False):
360                         return
361                 self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
362                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
363
364         def report_resuming_byte(self, resume_len):
365                 """Report attemtp to resume at given byte."""
366                 self.to_stdout(u'[download] Resuming download at byte %s' % resume_len)
367         
368         def report_retry(self, count, retries):
369                 """Report retry in case of HTTP error 503"""
370                 self.to_stdout(u'[download] Got HTTP error 503. Retrying (attempt %d of %d)...' % (count, retries))
371         
372         def report_file_already_downloaded(self, file_name):
373                 """Report file has already been fully downloaded."""
374                 try:
375                         self.to_stdout(u'[download] %s has already been downloaded' % file_name)
376                 except (UnicodeEncodeError), err:
377                         self.to_stdout(u'[download] The file has already been downloaded')
378         
379         def report_unable_to_resume(self):
380                 """Report it was impossible to resume download."""
381                 self.to_stdout(u'[download] Unable to resume')
382         
383         def report_finish(self):
384                 """Report download finished."""
385                 if self.params.get('noprogress', False):
386                         self.to_stdout(u'[download] Download completed')
387                 else:
388                         self.to_stdout(u'')
389
390         def process_info(self, info_dict):
391                 """Process a single dictionary returned by an InfoExtractor."""
392                 # Do nothing else if in simulate mode
393                 if self.params.get('simulate', False):
394                         # Verify URL if it's an HTTP one
395                         if info_dict['url'].startswith('http'):
396                                 try:
397                                         self.verify_url(info_dict['url'].encode('utf-8')).decode('utf-8')
398                                 except (OSError, IOError, urllib2.URLError, httplib.HTTPException, socket.error), err:
399                                         raise UnavailableFormatError
400
401                         # Forced printings
402                         if self.params.get('forcetitle', False):
403                                 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
404                         if self.params.get('forceurl', False):
405                                 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
406                         if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
407                                 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
408                         if self.params.get('forcedescription', False) and 'description' in info_dict:
409                                 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
410
411                         return
412                         
413                 try:
414                         template_dict = dict(info_dict)
415                         template_dict['epoch'] = unicode(long(time.time()))
416                         template_dict['ord'] = unicode('%05d' % self._num_downloads)
417                         filename = self.params['outtmpl'] % template_dict
418                 except (ValueError, KeyError), err:
419                         self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
420                 if self.params.get('nooverwrites', False) and os.path.exists(filename):
421                         self.to_stderr(u'WARNING: file exists: %s; skipping' % filename)
422                         return
423
424                 try:
425                         self.pmkdir(filename)
426                 except (OSError, IOError), err:
427                         self.trouble('ERROR: unable to create directories: %s' % str(err))
428                         return
429
430                 try:
431                         success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
432                 except (OSError, IOError), err:
433                         raise UnavailableFormatError
434                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
435                         self.trouble('ERROR: unable to download video data: %s' % str(err))
436                         return
437                 except (ContentTooShortError, ), err:
438                         self.trouble('ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
439                         return
440
441                 if success:
442                         try:
443                                 self.post_process(filename, info_dict)
444                         except (PostProcessingError), err:
445                                 self.trouble('ERROR: postprocessing: %s' % str(err))
446                                 return
447
448         def download(self, url_list):
449                 """Download a given list of URLs."""
450                 if len(url_list) > 1 and self.fixed_template():
451                         raise SameFileError(self.params['outtmpl'])
452
453                 for url in url_list:
454                         suitable_found = False
455                         for ie in self._ies:
456                                 # Go to next InfoExtractor if not suitable
457                                 if not ie.suitable(url):
458                                         continue
459
460                                 # Suitable InfoExtractor found
461                                 suitable_found = True
462
463                                 # Extract information from URL and process it
464                                 ie.extract(url)
465
466                                 # Suitable InfoExtractor had been found; go to next URL
467                                 break
468
469                         if not suitable_found:
470                                 self.trouble('ERROR: no suitable InfoExtractor: %s' % url)
471
472                 return self._download_retcode
473
474         def post_process(self, filename, ie_info):
475                 """Run the postprocessing chain on the given file."""
476                 info = dict(ie_info)
477                 info['filepath'] = filename
478                 for pp in self._pps:
479                         info = pp.run(info)
480                         if info is None:
481                                 break
482         
483         def _download_with_rtmpdump(self, filename, url, player_url):
484                 self.report_destination(filename)
485
486                 # Check for rtmpdump first
487                 try:
488                         subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
489                 except (OSError, IOError):
490                         self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
491                         return False
492
493                 # Download using rtmpdump. rtmpdump returns exit code 2 when
494                 # the connection was interrumpted and resuming appears to be
495                 # possible. This is part of rtmpdump's normal usage, AFAIK.
496                 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', filename]
497                 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
498                 while retval == 2 or retval == 1:
499                         prevsize = os.path.getsize(filename)
500                         self.to_stdout(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
501                         time.sleep(5.0) # This seems to be needed
502                         retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
503                         cursize = os.path.getsize(filename)
504                         if prevsize == cursize and retval == 1:
505                                 break
506                 if retval == 0:
507                         self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename))
508                         return True
509                 else:
510                         self.trouble('\nERROR: rtmpdump exited with code %d' % retval)
511                         return False
512
513         def _do_download(self, filename, url, player_url):
514                 # Attempt to download using rtmpdump
515                 if url.startswith('rtmp'):
516                         return self._download_with_rtmpdump(filename, url, player_url)
517
518                 stream = None
519                 open_mode = 'wb'
520                 basic_request = urllib2.Request(url, None, std_headers)
521                 request = urllib2.Request(url, None, std_headers)
522
523                 # Establish possible resume length
524                 if os.path.isfile(filename):
525                         resume_len = os.path.getsize(filename)
526                 else:
527                         resume_len = 0
528
529                 # Request parameters in case of being able to resume
530                 if self.params.get('continuedl', False) and resume_len != 0:
531                         self.report_resuming_byte(resume_len)
532                         request.add_header('Range','bytes=%d-' % resume_len)
533                         open_mode = 'ab'
534
535                 count = 0
536                 retries = self.params.get('retries', 0)
537                 while True:
538                         # Establish connection
539                         try:
540                                 data = urllib2.urlopen(request)
541                                 break
542                         except (urllib2.HTTPError, ), err:
543                                 if err.code == 503:
544                                         # Retry in case of HTTP error 503
545                                         count += 1
546                                         if count <= retries:
547                                                 self.report_retry(count, retries)
548                                                 continue
549                                 if err.code != 416: #  416 is 'Requested range not satisfiable'
550                                         raise
551                                 # Unable to resume
552                                 data = urllib2.urlopen(basic_request)
553                                 content_length = data.info()['Content-Length']
554
555                                 if content_length is not None and long(content_length) == resume_len:
556                                         # Because the file had already been fully downloaded
557                                         self.report_file_already_downloaded(filename)
558                                         return True
559                                 else:
560                                         # Because the server didn't let us
561                                         self.report_unable_to_resume()
562                                         open_mode = 'wb'
563
564                 data_len = data.info().get('Content-length', None)
565                 data_len_str = self.format_bytes(data_len)
566                 byte_counter = 0
567                 block_size = 1024
568                 start = time.time()
569                 while True:
570                         # Download and write
571                         before = time.time()
572                         data_block = data.read(block_size)
573                         after = time.time()
574                         data_block_len = len(data_block)
575                         if data_block_len == 0:
576                                 break
577                         byte_counter += data_block_len
578
579                         # Open file just in time
580                         if stream is None:
581                                 try:
582                                         (stream, filename) = sanitize_open(filename, open_mode)
583                                         self.report_destination(filename)
584                                         self._num_downloads += 1
585                                 except (OSError, IOError), err:
586                                         self.trouble('ERROR: unable to open for writing: %s' % str(err))
587                                         return False
588                         try:
589                                 stream.write(data_block)
590                         except (IOError, OSError), err:
591                                 self.trouble('\nERROR: unable to write data: %s' % str(err))
592                         block_size = self.best_block_size(after - before, data_block_len)
593
594                         # Progress message
595                         percent_str = self.calc_percent(byte_counter, data_len)
596                         eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
597                         speed_str = self.calc_speed(start, time.time(), byte_counter)
598                         self.report_progress(percent_str, data_len_str, speed_str, eta_str)
599
600                         # Apply rate limit
601                         self.slow_down(start, byte_counter)
602
603                 self.report_finish()
604                 if data_len is not None and str(byte_counter) != data_len:
605                         raise ContentTooShortError(byte_counter, long(data_len))
606                 return True
607
class InfoExtractor(object):
        """Information Extractor class.

        An information extractor takes a URL and extracts information about
        the video (or videos) it refers to: the real video URL, the literal
        and simplified titles, the uploader and so on. The result is a
        dictionary handed to the FileDownloader, which then processes it,
        possibly downloading the video to disk. Each dictionary must include
        the following fields:

        id:             Video identifier.
        url:            Final video URL.
        uploader:       Nickname of the video uploader.
        title:          Literal title.
        stitle:         Simplified title.
        ext:            Video filename extension.
        format:         Video format.
        player_url:     SWF Player URL (may be None).

        The following fields are optional. Their primary purpose is to allow
        youtube-dl to serve as the backend for a video search function, such
        as the one in youtube2mp3.  They are only used when their respective
        forced printing functions are called:

        thumbnail:      Full URL to a video thumbnail image.
        description:    One-line video description.

        Subclasses should re-define the _real_initialize() and
        _real_extract() methods, as well as the suitable() static method,
        and will usually be instantiated and registered with the main
        downloader.
        """

        # Set once _real_initialize() has run successfully.
        _ready = False
        # The FileDownloader this extractor reports to (may be None).
        _downloader = None

        def __init__(self, downloader=None):
                """Constructor. Receives an optional downloader."""
                self._ready = False
                self.set_downloader(downloader)

        @staticmethod
        def suitable(url):
                """Receives a URL and returns True if suitable for this IE."""
                return False

        def set_downloader(self, downloader):
                """Sets the downloader for this IE."""
                self._downloader = downloader

        def initialize(self):
                """Initializes an instance (authentication, etc)."""
                if self._ready:
                        return
                self._real_initialize()
                self._ready = True

        def extract(self, url):
                """Extracts URL information and returns it in list of dicts."""
                self.initialize()
                return self._real_extract(url)

        def _real_initialize(self):
                """Real initialization process. Redefine in subclasses."""
                pass

        def _real_extract(self, url):
                """Real extraction process. Redefine in subclasses."""
                pass
678
class YoutubeIE(InfoExtractor):
	"""Information extractor for youtube.com."""

	# Group 1: optional watch/embed URL prefix; group 2: the video id.
	# The conditional (?(1).+)? allows trailing text only when a URL prefix
	# matched, so a bare id must match exactly.
	_VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?[\?#](?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
	_LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
	_LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
	_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
	_NETRC_MACHINE = 'youtube'
	# Known "fmt" codes; the trailing None means "no fmt parameter"
	# (YouTube's default stream).
	_available_formats = ['37', '22', '35', '18', '34', '5', '17', '13', None] # listed in order of priority for -b flag
	# fmt code -> file extension; anything unlisted falls back to 'flv'.
	_video_extensions = {
		'13': '3gp',
		'17': 'mp4',
		'18': 'mp4',
		'22': 'mp4',
		'37': 'mp4',
	}

	@staticmethod
	def suitable(url):
		return (re.match(YoutubeIE._VALID_URL, url) is not None)

	def report_lang(self):
		"""Report attempt to set language."""
		self._downloader.to_stdout(u'[youtube] Setting language')

	def report_login(self):
		"""Report attempt to log in."""
		self._downloader.to_stdout(u'[youtube] Logging in')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_stdout(u'[youtube] Confirming age')

	def report_video_webpage_download(self, video_id):
		"""Report attempt to download video webpage."""
		self._downloader.to_stdout(u'[youtube] %s: Downloading video webpage' % video_id)

	def report_video_info_webpage_download(self, video_id):
		"""Report attempt to download video info webpage."""
		self._downloader.to_stdout(u'[youtube] %s: Downloading video info webpage' % video_id)

	def report_information_extraction(self, video_id):
		"""Report attempt to extract video information."""
		self._downloader.to_stdout(u'[youtube] %s: Extracting video information' % video_id)

	def report_unavailable_format(self, video_id, format):
		"""Report that the requested format is not available."""
		self._downloader.to_stdout(u'[youtube] %s: Format %s not available' % (video_id, format))

	def report_rtmp_download(self):
		"""Indicate the download will use the RTMP protocol."""
		self._downloader.to_stdout(u'[youtube] RTMP download detected')

	def _real_initialize(self):
		"""Set language and, if credentials are available, log in and confirm age."""
		# No downloader means no params to read and nowhere to report;
		# skip all network setup.
		if self._downloader is None:
			return

		username = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					username = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				# A broken .netrc aborts the whole initialization
				# (no language setup, no login) with a warning.
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		# Set language (forces English pages so the regexes below match)
		request = urllib2.Request(self._LANG_URL, None, std_headers)
		try:
			self.report_lang()
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
			return

		# No authentication to be performed
		if username is None:
			return

		# Log in
		login_form = {
				'current_form': 'loginForm',
				'next':		'/',
				'action_login':	'Log In',
				'username':	username,
				'password':	password,
				}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# If the login form is still present in the response, the
			# credentials were rejected.
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return

		# Confirm age
		age_form = {
				'next_url':		'/',
				'action_confirm':	'Confirm',
				}
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
		try:
			self.report_age_confirmation()
			age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return

	def _real_extract(self, url):
		"""Extract video info and hand it to the downloader.

		Honors the 'format' downloader param: '0' walks the priority
		list until one format works (best quality), '-1' downloads
		every available format in turn, anything else requests that
		fmt code directly.
		"""
		# Extract video id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = mobj.group(2)

		# Downloader parameters
		best_quality = False
		all_formats = False
		format_param = None
		# quality_index tracks our position in _available_formats across
		# retries of the loop below.
		quality_index = 0
		if self._downloader is not None:
			params = self._downloader.params
			format_param = params.get('format', None)
			if format_param == '0':
				format_param = self._available_formats[quality_index]
				best_quality = True
			elif format_param == '-1':
				format_param = self._available_formats[quality_index]
				all_formats = True

		# Each pass attempts one format; the loop repeats to fall back to
		# the next format (best_quality) or to fetch every format
		# (all_formats).
		while True:
			# Extension
			video_extension = self._video_extensions.get(format_param, 'flv')

			# Get video webpage
			self.report_video_webpage_download(video_id)
			request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en' % video_id, None, std_headers)
			try:
				video_webpage = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
				return

			# Attempt to extract SWF player URL (needed for RTMP downloads;
			# may legitimately be absent)
			mobj = re.search(r'swfConfig.*"(http://.*?watch-.*?\.swf)"', video_webpage)
			if mobj is not None:
				player_url = mobj.group(1)
			else:
				player_url = None

			# Get video info: try several 'el' page variants and stop at
			# the first one that yields a session token.
			self.report_video_info_webpage_download(video_id)
			for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
				video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
						   % (video_id, el_type))
				request = urllib2.Request(video_info_url, None, std_headers)
				try:
					video_info_webpage = urllib2.urlopen(request).read()
					video_info = parse_qs(video_info_webpage)
					if 'token' in video_info:
						break
				except (urllib2.URLError, httplib.HTTPException, socket.error), err:
					self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
					return
			self.report_information_extraction(video_id)

			# "t" param
			if 'token' not in video_info:
				# Attempt to see if YouTube has issued an error message
				if 'reason' not in video_info:
					self._downloader.trouble(u'ERROR: unable to extract "t" parameter for unknown reason')
					# Dump the raw info page so the user can attach
					# it to a bug report.
					stream = open('reportme-ydl-%s.dat' % time.time(), 'wb')
					stream.write(video_info_webpage)
					stream.close()
				else:
					reason = urllib.unquote_plus(video_info['reason'][0])
					self._downloader.trouble(u'ERROR: YouTube said: %s' % reason.decode('utf-8'))
				return
			token = urllib.unquote_plus(video_info['token'][0])
			video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=detailpage&ps=default&gl=US&hl=en' % (video_id, token)
			if format_param is not None:
				video_real_url = '%s&fmt=%s' % (video_real_url, format_param)

			# Check possible RTMP download
			if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
				self.report_rtmp_download()
				video_real_url = video_info['conn'][0]

			# uploader
			if 'author' not in video_info:
				self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
				return
			video_uploader = urllib.unquote_plus(video_info['author'][0])

			# title
			if 'title' not in video_info:
				self._downloader.trouble(u'ERROR: unable to extract video title')
				return
			video_title = urllib.unquote_plus(video_info['title'][0])
			video_title = video_title.decode('utf-8')
			video_title = sanitize_title(video_title)

			# simplified title: collapse every run of non-alphanumeric
			# characters to a single underscore
			simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
			simple_title = simple_title.strip(ur'_')

			# thumbnail image
			if 'thumbnail_url' not in video_info:
				self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
				video_thumbnail = ''
			else:	# don't panic if we can't find it
				video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

			# description (only extracted when explicitly requested)
			video_description = 'No description available.'
			if self._downloader.params.get('forcedescription', False):
				mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
				if mobj is not None:
					video_description = mobj.group(1)

			try:
				# Process video information
				self._downloader.process_info({
					'id':		video_id.decode('utf-8'),
					'url':		video_real_url.decode('utf-8'),
					'uploader':	video_uploader.decode('utf-8'),
					'title':	video_title,
					'stitle':	simple_title,
					'ext':		video_extension.decode('utf-8'),
					'format':	(format_param is None and u'NA' or format_param.decode('utf-8')),
					'thumbnail':	video_thumbnail.decode('utf-8'),
					'description':	video_description.decode('utf-8'),
					'player_url':	player_url,
				})

				# Success: in all_formats mode advance to the next
				# format, otherwise we are done.
				if all_formats:
					quality_index += 1
					if quality_index == len(self._available_formats):
						# None left to get
						return
					else:
						format_param = self._available_formats[quality_index]
						continue
				return

			except UnavailableFormatError, err:
				# This format failed: in best_quality/all_formats
				# mode fall through to the next one, otherwise
				# give up.
				if best_quality or all_formats:
					quality_index += 1
					if quality_index == len(self._available_formats):
						# I don't ever expect this to happen
						if not all_formats:
							self._downloader.trouble(u'ERROR: no known formats available for video')
						return
					else:
						self.report_unavailable_format(video_id, format_param)
						format_param = self._available_formats[quality_index]
						continue
				else:
					self._downloader.trouble('ERROR: format not available for video')
					return
954
955
956 class MetacafeIE(InfoExtractor):
957         """Information Extractor for metacafe.com."""
958
959         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
960         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
961         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
962         _youtube_ie = None
963
964         def __init__(self, youtube_ie, downloader=None):
965                 InfoExtractor.__init__(self, downloader)
966                 self._youtube_ie = youtube_ie
967
968         @staticmethod
969         def suitable(url):
970                 return (re.match(MetacafeIE._VALID_URL, url) is not None)
971
972         def report_disclaimer(self):
973                 """Report disclaimer retrieval."""
974                 self._downloader.to_stdout(u'[metacafe] Retrieving disclaimer')
975
976         def report_age_confirmation(self):
977                 """Report attempt to confirm age."""
978                 self._downloader.to_stdout(u'[metacafe] Confirming age')
979         
980         def report_download_webpage(self, video_id):
981                 """Report webpage download."""
982                 self._downloader.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)
983         
984         def report_extraction(self, video_id):
985                 """Report information extraction."""
986                 self._downloader.to_stdout(u'[metacafe] %s: Extracting information' % video_id)
987
988         def _real_initialize(self):
989                 # Retrieve disclaimer
990                 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
991                 try:
992                         self.report_disclaimer()
993                         disclaimer = urllib2.urlopen(request).read()
994                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
995                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
996                         return
997
998                 # Confirm age
999                 disclaimer_form = {
1000                         'filters': '0',
1001                         'submit': "Continue - I'm over 18",
1002                         }
1003                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
1004                 try:
1005                         self.report_age_confirmation()
1006                         disclaimer = urllib2.urlopen(request).read()
1007                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1008                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1009                         return
1010         
1011         def _real_extract(self, url):
1012                 # Extract id and simplified title from URL
1013                 mobj = re.match(self._VALID_URL, url)
1014                 if mobj is None:
1015                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1016                         return
1017
1018                 video_id = mobj.group(1)
1019
1020                 # Check if video comes from YouTube
1021                 mobj2 = re.match(r'^yt-(.*)$', video_id)
1022                 if mobj2 is not None:
1023                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1024                         return
1025
1026                 simple_title = mobj.group(2).decode('utf-8')
1027                 video_extension = 'flv'
1028
1029                 # Retrieve video webpage to extract further information
1030                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1031                 try:
1032                         self.report_download_webpage(video_id)
1033                         webpage = urllib2.urlopen(request).read()
1034                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1035                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1036                         return
1037
1038                 # Extract URL, uploader and title from webpage
1039                 self.report_extraction(video_id)
1040                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1041                 if mobj is None:
1042                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1043                         return
1044                 mediaURL = urllib.unquote(mobj.group(1))
1045
1046                 #mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1047                 #if mobj is None:
1048                 #       self._downloader.trouble(u'ERROR: unable to extract gdaKey')
1049                 #       return
1050                 #gdaKey = mobj.group(1)
1051                 #
1052                 #video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1053
1054                 video_url = mediaURL
1055
1056                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1057                 if mobj is None:
1058                         self._downloader.trouble(u'ERROR: unable to extract title')
1059                         return
1060                 video_title = mobj.group(1).decode('utf-8')
1061                 video_title = sanitize_title(video_title)
1062
1063                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1064                 if mobj is None:
1065                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1066                         return
1067                 video_uploader = mobj.group(1)
1068
1069                 try:
1070                         # Process video information
1071                         self._downloader.process_info({
1072                                 'id':           video_id.decode('utf-8'),
1073                                 'url':          video_url.decode('utf-8'),
1074                                 'uploader':     video_uploader.decode('utf-8'),
1075                                 'title':        video_title,
1076                                 'stitle':       simple_title,
1077                                 'ext':          video_extension.decode('utf-8'),
1078                                 'format':       u'NA',
1079                                 'player_url':   None,
1080                         })
1081                 except UnavailableFormatError:
1082                         self._downloader.trouble(u'ERROR: format not available for video')
1083
1084
class GoogleIE(InfoExtractor):
	"""Information extractor for video.google.com."""

	# Group 1: the numeric docid (may carry a sign).
	_VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	@staticmethod
	def suitable(url):
		return (re.match(GoogleIE._VALID_URL, url) is not None)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_stdout(u'[video.google] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_stdout(u'[video.google] %s: Extracting information' % video_id)

	def _real_initialize(self):
		# No setup needed for Google Video.
		return

	def _real_extract(self, url):
		"""Extract video info from a Google Video page and hand it to the downloader."""
		# Extract id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
			return

		video_id = mobj.group(1)

		video_extension = 'mp4'

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader, and title from webpage.
		# Prefer the mp4 download_url; fall back to the embedded flv
		# URL, which arrives with literal backslash escapes.
		self.report_extraction(video_id)
		mobj = re.search(r"download_url:'([^']+)'", webpage)
		if mobj is None:
			video_extension = 'flv'
			mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		mediaURL = urllib.unquote(mobj.group(1))
		# Decode the page's literal '\x3d'/'\x26' escapes into '=' and '&'.
		mediaURL = mediaURL.replace('\\x3d', '\x3d')
		mediaURL = mediaURL.replace('\\x26', '\x26')

		video_url = mediaURL

		mobj = re.search(r'<title>(.*)</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)
		simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

		# Extract video description
		mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract video description')
			return
		video_description = mobj.group(1).decode('utf-8')
		if not video_description:
			video_description = 'No description available.'

		# Extract video thumbnail (requires a second request, so only
		# done when explicitly asked for)
		if self._downloader.params.get('forcethumbnail', False):
			# NOTE(review): the docid can apparently be negative; the
			# search page seems to expect its absolute value — confirm.
			request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
			try:
				webpage = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
				return
			mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
				return
			video_thumbnail = mobj.group(1)
		else:	# we need something to pass to process_info
			video_thumbnail = ''


		try:
			# Process video information
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	u'NA',
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	u'NA',
				'player_url':	None,
			})
		except UnavailableFormatError:
			self._downloader.trouble(u'ERROR: format not available for video')
1191
1192
class PhotobucketIE(InfoExtractor):
	"""Information extractor for photobucket.com."""

	# Group 1: the .flv file name from the 'current' query parameter.
	_VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	@staticmethod
	def suitable(url):
		return (re.match(PhotobucketIE._VALID_URL, url) is not None)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_stdout(u'[photobucket] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_stdout(u'[photobucket] %s: Extracting information' % video_id)

	def _real_initialize(self):
		# No setup needed for Photobucket.
		return

	def _real_extract(self, url):
		"""Extract video info from a Photobucket page and hand it to the downloader."""
		# Extract id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
			return

		video_id = mobj.group(1)

		video_extension = 'flv'

		# Retrieve video webpage to extract further information
		request = urllib2.Request(url)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader, and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		mediaURL = urllib.unquote(mobj.group(1))

		video_url = mediaURL

		# Title and uploader come from the same <title> pattern:
		# group 1 is the title, group 2 the uploader.
		mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)
		simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

		video_uploader = mobj.group(2).decode('utf-8')

		try:
			# Process video information
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	video_uploader,
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	u'NA',
				'player_url':	None,
			})
		except UnavailableFormatError:
			self._downloader.trouble(u'ERROR: format not available for video')
1270
1271
1272 class YahooIE(InfoExtractor):
1273         """Information extractor for video.yahoo.com."""
1274
1275         # _VALID_URL matches all Yahoo! Video URLs
1276         # _VPAGE_URL matches only the extractable '/watch/' URLs
1277         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1278         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1279
1280         def __init__(self, downloader=None):
1281                 InfoExtractor.__init__(self, downloader)
1282
1283         @staticmethod
1284         def suitable(url):
1285                 return (re.match(YahooIE._VALID_URL, url) is not None)
1286
1287         def report_download_webpage(self, video_id):
1288                 """Report webpage download."""
1289                 self._downloader.to_stdout(u'[video.yahoo] %s: Downloading webpage' % video_id)
1290
1291         def report_extraction(self, video_id):
1292                 """Report information extraction."""
1293                 self._downloader.to_stdout(u'[video.yahoo] %s: Extracting information' % video_id)
1294
1295         def _real_initialize(self):
1296                 return
1297
1298         def _real_extract(self, url):
1299                 # Extract ID from URL
1300                 mobj = re.match(self._VALID_URL, url)
1301                 if mobj is None:
1302                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1303                         return
1304
1305                 video_id = mobj.group(2)
1306                 video_extension = 'flv'
1307
1308                 # Rewrite valid but non-extractable URLs as
1309                 # extractable English language /watch/ URLs
1310                 if re.match(self._VPAGE_URL, url) is None:
1311                         request = urllib2.Request(url)
1312                         try:
1313                                 webpage = urllib2.urlopen(request).read()
1314                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1315                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1316                                 return
1317
1318                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1319                         if mobj is None:
1320                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1321                                 return
1322                         yahoo_id = mobj.group(1)
1323
1324                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1325                         if mobj is None:
1326                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1327                                 return
1328                         yahoo_vid = mobj.group(1)
1329
1330                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1331                         return self._real_extract(url)
1332
1333                 # Retrieve video webpage to extract further information
1334                 request = urllib2.Request(url)
1335                 try:
1336                         self.report_download_webpage(video_id)
1337                         webpage = urllib2.urlopen(request).read()
1338                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1339                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1340                         return
1341
1342                 # Extract uploader and title from webpage
1343                 self.report_extraction(video_id)
1344                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1345                 if mobj is None:
1346                         self._downloader.trouble(u'ERROR: unable to extract video title')
1347                         return
1348                 video_title = mobj.group(1).decode('utf-8')
1349                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1350
1351                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1352                 if mobj is None:
1353                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1354                         return
1355                 video_uploader = mobj.group(1).decode('utf-8')
1356
1357                 # Extract video thumbnail
1358                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1359                 if mobj is None:
1360                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1361                         return
1362                 video_thumbnail = mobj.group(1).decode('utf-8')
1363
1364                 # Extract video description
1365                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1366                 if mobj is None:
1367                         self._downloader.trouble(u'ERROR: unable to extract video description')
1368                         return
1369                 video_description = mobj.group(1).decode('utf-8')
1370                 if not video_description: video_description = 'No description available.'
1371
1372                 # Extract video height and width
1373                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1374                 if mobj is None:
1375                         self._downloader.trouble(u'ERROR: unable to extract video height')
1376                         return
1377                 yv_video_height = mobj.group(1)
1378
1379                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1380                 if mobj is None:
1381                         self._downloader.trouble(u'ERROR: unable to extract video width')
1382                         return
1383                 yv_video_width = mobj.group(1)
1384
1385                 # Retrieve video playlist to extract media URL
1386                 # I'm not completely sure what all these options are, but we
1387                 # seem to need most of them, otherwise the server sends a 401.
1388                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1389                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1390                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1391                                           '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1392                                           '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1393                 try:
1394                         self.report_download_webpage(video_id)
1395                         webpage = urllib2.urlopen(request).read()
1396                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1397                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1398                         return
1399
1400                 # Extract media URL from playlist XML
1401                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1402                 if mobj is None:
1403                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1404                         return
1405                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1406                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1407
1408                 try:
1409                         # Process video information
1410                         self._downloader.process_info({
1411                                 'id':           video_id.decode('utf-8'),
1412                                 'url':          video_url,
1413                                 'uploader':     video_uploader,
1414                                 'title':        video_title,
1415                                 'stitle':       simple_title,
1416                                 'ext':          video_extension.decode('utf-8'),
1417                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1418                                 'description':  video_description,
1419                                 'thumbnail':    video_thumbnail,
1420                                 'description':  video_description,
1421                                 'player_url':   None,
1422                         })
1423                 except UnavailableFormatError:
1424                         self._downloader.trouble(u'ERROR: format not available for video')
1425
1426
1427 class GenericIE(InfoExtractor):
1428         """Generic last-resort information extractor."""
1429
1430         def __init__(self, downloader=None):
1431                 InfoExtractor.__init__(self, downloader)
1432
1433         @staticmethod
1434         def suitable(url):
1435                 return True
1436
1437         def report_download_webpage(self, video_id):
1438                 """Report webpage download."""
1439                 self._downloader.to_stdout(u'WARNING: Falling back on generic information extractor.')
1440                 self._downloader.to_stdout(u'[generic] %s: Downloading webpage' % video_id)
1441
1442         def report_extraction(self, video_id):
1443                 """Report information extraction."""
1444                 self._downloader.to_stdout(u'[generic] %s: Extracting information' % video_id)
1445
1446         def _real_initialize(self):
1447                 return
1448
1449         def _real_extract(self, url):
1450                 video_id = url.split('/')[-1]
1451                 request = urllib2.Request(url)
1452                 try:
1453                         self.report_download_webpage(video_id)
1454                         webpage = urllib2.urlopen(request).read()
1455                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1456                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1457                         return
1458                 except ValueError, err:
1459                         # since this is the last-resort InfoExtractor, if
1460                         # this error is thrown, it'll be thrown here
1461                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1462                         return
1463
1464                 # Start with something easy: JW Player in SWFObject
1465                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1466                 if mobj is None:
1467                         # Broaden the search a little bit
1468                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1469                 if mobj is None:
1470                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1471                         return
1472
1473                 # It's possible that one of the regexes
1474                 # matched, but returned an empty group:
1475                 if mobj.group(1) is None:
1476                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1477                         return
1478
1479                 video_url = urllib.unquote(mobj.group(1))
1480                 video_id  = os.path.basename(video_url)
1481
1482                 # here's a fun little line of code for you:
1483                 video_extension = os.path.splitext(video_id)[1][1:]
1484                 video_id        = os.path.splitext(video_id)[0]
1485
1486                 # it's tempting to parse this further, but you would
1487                 # have to take into account all the variations like
1488                 #   Video Title - Site Name
1489                 #   Site Name | Video Title
1490                 #   Video Title - Tagline | Site Name
1491                 # and so on and so forth; it's just not practical
1492                 mobj = re.search(r'<title>(.*)</title>', webpage)
1493                 if mobj is None:
1494                         self._downloader.trouble(u'ERROR: unable to extract title')
1495                         return
1496                 video_title = mobj.group(1).decode('utf-8')
1497                 video_title = sanitize_title(video_title)
1498                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1499
1500                 # video uploader is domain name
1501                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1502                 if mobj is None:
1503                         self._downloader.trouble(u'ERROR: unable to extract title')
1504                         return
1505                 video_uploader = mobj.group(1).decode('utf-8')
1506
1507                 try:
1508                         # Process video information
1509                         self._downloader.process_info({
1510                                 'id':           video_id.decode('utf-8'),
1511                                 'url':          video_url.decode('utf-8'),
1512                                 'uploader':     video_uploader,
1513                                 'title':        video_title,
1514                                 'stitle':       simple_title,
1515                                 'ext':          video_extension.decode('utf-8'),
1516                                 'format':       u'NA',
1517                                 'player_url':   None,
1518                         })
1519                 except UnavailableFormatError:
1520                         self._downloader.trouble(u'ERROR: format not available for video')
1521
1522
1523 class YoutubeSearchIE(InfoExtractor):
1524         """Information Extractor for YouTube search queries."""
1525         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1526         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1527         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1528         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1529         _youtube_ie = None
1530         _max_youtube_results = 1000
1531
1532         def __init__(self, youtube_ie, downloader=None):
1533                 InfoExtractor.__init__(self, downloader)
1534                 self._youtube_ie = youtube_ie
1535         
1536         @staticmethod
1537         def suitable(url):
1538                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1539
1540         def report_download_page(self, query, pagenum):
1541                 """Report attempt to download playlist page with given number."""
1542                 query = query.decode(preferredencoding())
1543                 self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1544
1545         def _real_initialize(self):
1546                 self._youtube_ie.initialize()
1547         
1548         def _real_extract(self, query):
1549                 mobj = re.match(self._VALID_QUERY, query)
1550                 if mobj is None:
1551                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1552                         return
1553
1554                 prefix, query = query.split(':')
1555                 prefix = prefix[8:]
1556                 query  = query.encode('utf-8')
1557                 if prefix == '':
1558                         self._download_n_results(query, 1)
1559                         return
1560                 elif prefix == 'all':
1561                         self._download_n_results(query, self._max_youtube_results)
1562                         return
1563                 else:
1564                         try:
1565                                 n = long(prefix)
1566                                 if n <= 0:
1567                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1568                                         return
1569                                 elif n > self._max_youtube_results:
1570                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
1571                                         n = self._max_youtube_results
1572                                 self._download_n_results(query, n)
1573                                 return
1574                         except ValueError: # parsing prefix as integer fails
1575                                 self._download_n_results(query, 1)
1576                                 return
1577
1578         def _download_n_results(self, query, n):
1579                 """Downloads a specified number of results for a query"""
1580
1581                 video_ids = []
1582                 already_seen = set()
1583                 pagenum = 1
1584
1585                 while True:
1586                         self.report_download_page(query, pagenum)
1587                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1588                         request = urllib2.Request(result_url, None, std_headers)
1589                         try:
1590                                 page = urllib2.urlopen(request).read()
1591                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1592                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1593                                 return
1594
1595                         # Extract video identifiers
1596                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1597                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1598                                 if video_id not in already_seen:
1599                                         video_ids.append(video_id)
1600                                         already_seen.add(video_id)
1601                                         if len(video_ids) == n:
1602                                                 # Specified n videos reached
1603                                                 for id in video_ids:
1604                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1605                                                 return
1606
1607                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1608                                 for id in video_ids:
1609                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1610                                 return
1611
1612                         pagenum = pagenum + 1
1613
1614 class GoogleSearchIE(InfoExtractor):
1615         """Information Extractor for Google Video search queries."""
1616         _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
1617         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1618         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
1619         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
1620         _google_ie = None
1621         _max_google_results = 1000
1622
1623         def __init__(self, google_ie, downloader=None):
1624                 InfoExtractor.__init__(self, downloader)
1625                 self._google_ie = google_ie
1626         
1627         @staticmethod
1628         def suitable(url):
1629                 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
1630
1631         def report_download_page(self, query, pagenum):
1632                 """Report attempt to download playlist page with given number."""
1633                 query = query.decode(preferredencoding())
1634                 self._downloader.to_stdout(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1635
1636         def _real_initialize(self):
1637                 self._google_ie.initialize()
1638         
1639         def _real_extract(self, query):
1640                 mobj = re.match(self._VALID_QUERY, query)
1641                 if mobj is None:
1642                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1643                         return
1644
1645                 prefix, query = query.split(':')
1646                 prefix = prefix[8:]
1647                 query  = query.encode('utf-8')
1648                 if prefix == '':
1649                         self._download_n_results(query, 1)
1650                         return
1651                 elif prefix == 'all':
1652                         self._download_n_results(query, self._max_google_results)
1653                         return
1654                 else:
1655                         try:
1656                                 n = long(prefix)
1657                                 if n <= 0:
1658                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1659                                         return
1660                                 elif n > self._max_google_results:
1661                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)'  % (self._max_google_results, n))
1662                                         n = self._max_google_results
1663                                 self._download_n_results(query, n)
1664                                 return
1665                         except ValueError: # parsing prefix as integer fails
1666                                 self._download_n_results(query, 1)
1667                                 return
1668
1669         def _download_n_results(self, query, n):
1670                 """Downloads a specified number of results for a query"""
1671
1672                 video_ids = []
1673                 already_seen = set()
1674                 pagenum = 1
1675
1676                 while True:
1677                         self.report_download_page(query, pagenum)
1678                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1679                         request = urllib2.Request(result_url, None, std_headers)
1680                         try:
1681                                 page = urllib2.urlopen(request).read()
1682                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1683                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1684                                 return
1685
1686                         # Extract video identifiers
1687                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1688                                 video_id = mobj.group(1)
1689                                 if video_id not in already_seen:
1690                                         video_ids.append(video_id)
1691                                         already_seen.add(video_id)
1692                                         if len(video_ids) == n:
1693                                                 # Specified n videos reached
1694                                                 for id in video_ids:
1695                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1696                                                 return
1697
1698                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1699                                 for id in video_ids:
1700                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1701                                 return
1702
1703                         pagenum = pagenum + 1
1704
1705 class YahooSearchIE(InfoExtractor):
1706         """Information Extractor for Yahoo! Video search queries."""
1707         _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
1708         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1709         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1710         _MORE_PAGES_INDICATOR = r'\s*Next'
1711         _yahoo_ie = None
1712         _max_yahoo_results = 1000
1713
1714         def __init__(self, yahoo_ie, downloader=None):
1715                 InfoExtractor.__init__(self, downloader)
1716                 self._yahoo_ie = yahoo_ie
1717         
1718         @staticmethod
1719         def suitable(url):
1720                 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
1721
1722         def report_download_page(self, query, pagenum):
1723                 """Report attempt to download playlist page with given number."""
1724                 query = query.decode(preferredencoding())
1725                 self._downloader.to_stdout(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1726
1727         def _real_initialize(self):
1728                 self._yahoo_ie.initialize()
1729         
1730         def _real_extract(self, query):
1731                 mobj = re.match(self._VALID_QUERY, query)
1732                 if mobj is None:
1733                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1734                         return
1735
1736                 prefix, query = query.split(':')
1737                 prefix = prefix[8:]
1738                 query  = query.encode('utf-8')
1739                 if prefix == '':
1740                         self._download_n_results(query, 1)
1741                         return
1742                 elif prefix == 'all':
1743                         self._download_n_results(query, self._max_yahoo_results)
1744                         return
1745                 else:
1746                         try:
1747                                 n = long(prefix)
1748                                 if n <= 0:
1749                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1750                                         return
1751                                 elif n > self._max_yahoo_results:
1752                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)'  % (self._max_yahoo_results, n))
1753                                         n = self._max_yahoo_results
1754                                 self._download_n_results(query, n)
1755                                 return
1756                         except ValueError: # parsing prefix as integer fails
1757                                 self._download_n_results(query, 1)
1758                                 return
1759
1760         def _download_n_results(self, query, n):
1761                 """Downloads a specified number of results for a query"""
1762
1763                 video_ids = []
1764                 already_seen = set()
1765                 pagenum = 1
1766
1767                 while True:
1768                         self.report_download_page(query, pagenum)
1769                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1770                         request = urllib2.Request(result_url, None, std_headers)
1771                         try:
1772                                 page = urllib2.urlopen(request).read()
1773                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1774                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1775                                 return
1776
1777                         # Extract video identifiers
1778                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1779                                 video_id = mobj.group(1)
1780                                 if video_id not in already_seen:
1781                                         video_ids.append(video_id)
1782                                         already_seen.add(video_id)
1783                                         if len(video_ids) == n:
1784                                                 # Specified n videos reached
1785                                                 for id in video_ids:
1786                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1787                                                 return
1788
1789                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1790                                 for id in video_ids:
1791                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1792                                 return
1793
1794                         pagenum = pagenum + 1
1795
1796 class YoutubePlaylistIE(InfoExtractor):
1797         """Information Extractor for YouTube playlists."""
1798
1799         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/)([^&]+).*'
1800         _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
1801         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
1802         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1803         _youtube_ie = None
1804
1805         def __init__(self, youtube_ie, downloader=None):
1806                 InfoExtractor.__init__(self, downloader)
1807                 self._youtube_ie = youtube_ie
1808         
1809         @staticmethod
1810         def suitable(url):
1811                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
1812
1813         def report_download_page(self, playlist_id, pagenum):
1814                 """Report attempt to download playlist page with given number."""
1815                 self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1816
1817         def _real_initialize(self):
1818                 self._youtube_ie.initialize()
1819         
1820         def _real_extract(self, url):
1821                 # Extract playlist id
1822                 mobj = re.match(self._VALID_URL, url)
1823                 if mobj is None:
1824                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1825                         return
1826
1827                 # Download playlist pages
1828                 playlist_id = mobj.group(1)
1829                 video_ids = []
1830                 pagenum = 1
1831
1832                 while True:
1833                         self.report_download_page(playlist_id, pagenum)
1834                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
1835                         try:
1836                                 page = urllib2.urlopen(request).read()
1837                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1838                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1839                                 return
1840
1841                         # Extract video identifiers
1842                         ids_in_page = []
1843                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1844                                 if mobj.group(1) not in ids_in_page:
1845                                         ids_in_page.append(mobj.group(1))
1846                         video_ids.extend(ids_in_page)
1847
1848                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1849                                 break
1850                         pagenum = pagenum + 1
1851
1852                 for id in video_ids:
1853                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1854                 return
1855
1856 class YoutubeUserIE(InfoExtractor):
1857         """Information Extractor for YouTube users."""
1858
1859         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
1860         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1861         _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
1862         _youtube_ie = None
1863
1864         def __init__(self, youtube_ie, downloader=None):
1865                 InfoExtractor.__init__(self, downloader)
1866                 self._youtube_ie = youtube_ie
1867         
1868         @staticmethod
1869         def suitable(url):
1870                 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
1871
1872         def report_download_page(self, username):
1873                 """Report attempt to download user page."""
1874                 self._downloader.to_stdout(u'[youtube] user %s: Downloading page ' % (username))
1875
1876         def _real_initialize(self):
1877                 self._youtube_ie.initialize()
1878         
1879         def _real_extract(self, url):
1880                 # Extract username
1881                 mobj = re.match(self._VALID_URL, url)
1882                 if mobj is None:
1883                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1884                         return
1885
1886                 # Download user page
1887                 username = mobj.group(1)
1888                 video_ids = []
1889                 pagenum = 1
1890
1891                 self.report_download_page(username)
1892                 request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
1893                 try:
1894                         page = urllib2.urlopen(request).read()
1895                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1896                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1897                         return
1898
1899                 # Extract video identifiers
1900                 ids_in_page = []
1901
1902                 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1903                         if mobj.group(1) not in ids_in_page:
1904                                 ids_in_page.append(mobj.group(1))
1905                 video_ids.extend(ids_in_page)
1906
1907                 for id in video_ids:
1908                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1909                 return
1910
class PostProcessor(object):
	"""Base class for all post processors.

	Instances are registered on a downloader through its
	add_post_processor() method.  After each successful download the
	downloader walks its chain of PostProcessors, calling run() on every
	one: the first receives an initial information dictionary, and each
	subsequent one receives whatever the previous run() returned.

	A run() that returns None stops the chain; otherwise the chain ends
	when its last processor has run.  Like InfoExtractor objects,
	PostProcessors and downloaders register with each other mutually.
	"""

	# Downloader this processor is attached to; set at construction time
	# or later via set_downloader().
	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Sets the downloader for this PP."""
		self._downloader = downloader

	def run(self, information):
		"""Run the PostProcessor.

		"information" is a dictionary shaped like the ones composed by
		InfoExtractors, extended with one extra key, "filepath", naming
		the downloaded file.

		Returning None halts the postprocessing chain; returning a
		dictionary (possibly the received one with some fields changed)
		passes it along to the next processor in the chain.

		This method may also raise a PostProcessingError, which is
		handled by the downloader it was called from.
		"""
		# Default implementation: pass the information through untouched.
		return information
1956         
### MAIN PROGRAM ###
if __name__ == '__main__':
	try:
		# Modules needed only when running the main program
		import getpass
		import optparse

		# Function to update the program file with the latest version from bitbucket.org
		def update_self(downloader, filename):
			"""Overwrite this script in place with the latest released version."""
			# Note: downloader only used for options
			if not os.access (filename, os.W_OK):
				sys.exit('ERROR: no write permissions on %s' % filename)

			downloader.to_stdout('Updating to latest stable version...')
			latest_url = 'http://bitbucket.org/rg3/youtube-dl/raw/tip/LATEST_VERSION'
			latest_version = urllib.urlopen(latest_url).read().strip()
			prog_url = 'http://bitbucket.org/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
			newcontent = urllib.urlopen(prog_url).read()
			# NOTE(review): file is opened in text mode ('w'); confirm that 'wb'
			# is not needed to avoid newline translation on Windows.
			stream = open(filename, 'w')
			stream.write(newcontent)
			stream.close()
			downloader.to_stdout('Updated to version %s' % latest_version)

		# General configuration
		urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
		# NOTE(review): this second install_opener replaces the proxy-enabled
		# opener installed on the previous line with a cookie-only one —
		# confirm both handlers should not go into a single opener instead.
		urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
		socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

		# Parse command line
		parser = optparse.OptionParser(
			usage='Usage: %prog [options] url...',
			version='2010.04.04',
			conflict_handler='resolve',
		)

		parser.add_option('-h', '--help',
				action='help', help='print this help text and exit')
		parser.add_option('-v', '--version',
				action='version', help='print program version and exit')
		parser.add_option('-U', '--update',
				action='store_true', dest='update_self', help='update this program to latest stable version')
		parser.add_option('-i', '--ignore-errors',
				action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
		parser.add_option('-r', '--rate-limit',
				dest='ratelimit', metavar='L', help='download rate limit (e.g. 50k or 44.6m)')
		parser.add_option('-R', '--retries',
				dest='retries', metavar='T', help='number of retries (default is 10)', default=10)

		authentication = optparse.OptionGroup(parser, 'Authentication Options')
		authentication.add_option('-u', '--username',
				dest='username', metavar='UN', help='account username')
		authentication.add_option('-p', '--password',
				dest='password', metavar='PW', help='account password')
		authentication.add_option('-n', '--netrc',
				action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
		parser.add_option_group(authentication)

		# All the aliases below funnel into the single 'format' dest; the
		# special value '-1' means "download every available format".
		video_format = optparse.OptionGroup(parser, 'Video Format Options')
		video_format.add_option('-f', '--format',
				action='store', dest='format', metavar='FMT', help='video format code')
		video_format.add_option('-b', '--best-quality',
				action='store_const', dest='format', help='download the best quality video possible', const='0')
		video_format.add_option('-m', '--mobile-version',
				action='store_const', dest='format', help='alias for -f 17', const='17')
		video_format.add_option('-d', '--high-def',
				action='store_const', dest='format', help='alias for -f 22', const='22')
		video_format.add_option('--all-formats',
				action='store_const', dest='format', help='download all available video formats', const='-1')
		parser.add_option_group(video_format)

		verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
		verbosity.add_option('-q', '--quiet',
				action='store_true', dest='quiet', help='activates quiet mode', default=False)
		verbosity.add_option('-s', '--simulate',
				action='store_true', dest='simulate', help='do not download video', default=False)
		verbosity.add_option('-g', '--get-url',
				action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
		verbosity.add_option('-e', '--get-title',
				action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
		verbosity.add_option('--get-thumbnail',
				action='store_true', dest='getthumbnail', help='simulate, quiet but print thumbnail URL', default=False)
		verbosity.add_option('--get-description',
				action='store_true', dest='getdescription', help='simulate, quiet but print video description', default=False)
		verbosity.add_option('--no-progress',
				action='store_true', dest='noprogress', help='do not print progress bar', default=False)
		parser.add_option_group(verbosity)

		filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
		filesystem.add_option('-t', '--title',
				action='store_true', dest='usetitle', help='use title in file name', default=False)
		filesystem.add_option('-l', '--literal',
				action='store_true', dest='useliteral', help='use literal title in file name', default=False)
		filesystem.add_option('-o', '--output',
				dest='outtmpl', metavar='TPL', help='output filename template')
		filesystem.add_option('-a', '--batch-file',
				dest='batchfile', metavar='F', help='file containing URLs to download (\'-\' for stdin)')
		filesystem.add_option('-w', '--no-overwrites',
				action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
		filesystem.add_option('-c', '--continue',
				action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
		parser.add_option_group(filesystem)

		(opts, args) = parser.parse_args()

		# Batch file verification
		batchurls = []
		if opts.batchfile is not None:
			try:
				if opts.batchfile == '-':
					batchfd = sys.stdin
				else:
					batchfd = open(opts.batchfile, 'r')
				batchurls = batchfd.readlines()
				batchurls = [x.strip() for x in batchurls]
				# Drop blank lines from the batch file
				batchurls = [x for x in batchurls if len(x) > 0]
			except IOError:
				sys.exit(u'ERROR: batch file could not be read')
		all_urls = batchurls + args

		# Conflicting, missing and erroneous options
		if opts.usenetrc and (opts.username is not None or opts.password is not None):
			parser.error(u'using .netrc conflicts with giving username/password')
		if opts.password is not None and opts.username is None:
			parser.error(u'account username missing')
		if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
			parser.error(u'using output template conflicts with using title or literal title')
		if opts.usetitle and opts.useliteral:
			parser.error(u'using title conflicts with using literal title')
		if opts.username is not None and opts.password is None:
			opts.password = getpass.getpass(u'Type account password and press return:')
		if opts.ratelimit is not None:
			# Convert human-readable limits like "50k"/"44.6m" to bytes/sec
			numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
			if numeric_limit is None:
				parser.error(u'invalid rate limit specified')
			opts.ratelimit = numeric_limit
		if opts.retries is not None:
			try:
				opts.retries = long(opts.retries)
			except (TypeError, ValueError), err:
				parser.error(u'invalid retry count specified')

		# Information extractors
		youtube_ie = YoutubeIE()
		metacafe_ie = MetacafeIE(youtube_ie)
		youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
		youtube_user_ie = YoutubeUserIE(youtube_ie)
		youtube_search_ie = YoutubeSearchIE(youtube_ie)
		google_ie = GoogleIE()
		google_search_ie = GoogleSearchIE(google_ie)
		photobucket_ie = PhotobucketIE()
		yahoo_ie = YahooIE()
		yahoo_search_ie = YahooSearchIE(yahoo_ie)
		generic_ie = GenericIE()

		# File downloader
		fd = FileDownloader({
			'usenetrc': opts.usenetrc,
			'username': opts.username,
			'password': opts.password,
			'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
			'forceurl': opts.geturl,
			'forcetitle': opts.gettitle,
			'forcethumbnail': opts.getthumbnail,
			'forcedescription': opts.getdescription,
			'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
			'format': opts.format,
			# Output template: an explicit -o always wins.  When --all-formats
			# is active (format == '-1') the fallback templates include
			# %(format)s so each downloaded format gets its own file instead
			# of overwriting the others.
			'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
				or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
				or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
				or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
				or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
				or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
				or u'%(id)s.%(ext)s'),
			'ignoreerrors': opts.ignoreerrors,
			'ratelimit': opts.ratelimit,
			'nooverwrites': opts.nooverwrites,
			'retries': opts.retries,
			'continuedl': opts.continue_dl,
			'noprogress': opts.noprogress,
			})
		fd.add_info_extractor(youtube_search_ie)
		fd.add_info_extractor(youtube_pl_ie)
		fd.add_info_extractor(youtube_user_ie)
		fd.add_info_extractor(metacafe_ie)
		fd.add_info_extractor(youtube_ie)
		fd.add_info_extractor(google_ie)
		fd.add_info_extractor(google_search_ie)
		fd.add_info_extractor(photobucket_ie)
		fd.add_info_extractor(yahoo_ie)
		fd.add_info_extractor(yahoo_search_ie)

		# This must come last since it's the
		# fallback if none of the others work
		fd.add_info_extractor(generic_ie)

		# Update version
		if opts.update_self:
			update_self(fd, sys.argv[0])

		# Maybe do nothing
		if len(all_urls) < 1:
			if not opts.update_self:
				parser.error(u'you must provide at least one URL')
			else:
				sys.exit()
		# Exit status is the number of URLs that failed to download
		retcode = fd.download(all_urls)
		sys.exit(retcode)

	except DownloadError:
		sys.exit(1)
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')