Bump version number
[youtube-dl] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # License: Public domain code
7 import htmlentitydefs
8 import httplib
9 import locale
10 import math
11 import netrc
12 import os
13 import os.path
14 import re
15 import socket
16 import string
17 import subprocess
18 import sys
19 import time
20 import urllib
21 import urllib2
22
# parse_qs was moved from the cgi module to the urlparse module recently.
try:
	from urlparse import parse_qs
except ImportError:
	# Older Python: fall back to the original location in the cgi module.
	from cgi import parse_qs
28
# Default HTTP headers sent with every request; they mimic a desktop
# Firefox so sites serve the same pages they would to a regular browser.
std_headers = {
	'User-Agent': 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.7) Gecko/20100720 Firefox/3.6.7',
	'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
	'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
	'Accept-Language': 'en-us,en;q=0.5',
}
35
# Characters considered safe for "simplified" titles: ASCII letters and
# digits, as unicode strings (.decode('ascii') converts the str constants).
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
37
def preferredencoding():
	"""Get preferred encoding.

	Returns the best encoding scheme for the system, based on
	locale.getpreferredencoding() and some further tweaks.
	"""
	try:
		pref = locale.getpreferredencoding()
		# Probe the returned name: some platforms report codecs that the
		# codec machinery cannot actually look up.
		u'TEST'.encode(pref)
	except Exception:
		# Unknown or broken locale; fall back to a sane default.
		pref = 'UTF-8'
	return pref
53
def htmlentity_transform(matchobj):
	"""Transforms an HTML entity to a Unicode character.

	This function receives a match object and is intended to be used with
	the re.sub() function.
	"""
	entity = matchobj.group(1)

	# Known non-numeric HTML entity
	if entity in htmlentitydefs.name2codepoint:
		return unichr(htmlentitydefs.name2codepoint[entity])

	# Numeric character reference: decimal (&#65;) or hexadecimal (&#x41;).
	# The previous pattern used "x?\d+", which rejected hexadecimal
	# references containing the letters a-f (e.g. &#x2b;).
	mobj = re.match(u'(?u)#(x[0-9a-fA-F]+|[0-9]+)', entity)
	if mobj is not None:
		numstr = mobj.group(1)
		if numstr.startswith(u'x'):
			base = 16
			numstr = u'0%s' % numstr # "0x41" is accepted by long()
		else:
			base = 10
		return unichr(long(numstr, base))

	# Unknown entity in name, return its literal representation
	return (u'&%s;' % entity)
79
def sanitize_title(utitle):
	"""Sanitizes a video title so it could be used as part of a filename."""
	# Decode HTML entities to their Unicode characters first.
	utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
	# Replace the OS path separator so the title stays one path component.
	return utitle.replace(unicode(os.sep), u'%')
84
def sanitize_open(filename, open_mode):
	"""Try to open the given filename, and slightly tweak it if this fails.

	Attempts to open the given filename. If this fails, it tries to change
	the filename slightly, step by step, until it's either able to open it
	or it fails and raises a final exception, like the standard open()
	function.

	It returns the tuple (stream, definitive_file_name).
	"""
	try:
		# "-" conventionally means standard output.
		if filename == u'-':
			return (sys.stdout, filename)
		stream = open(filename, open_mode)
		return (stream, filename)
	except (IOError, OSError), err:
		# In case of error, try to remove win32 forbidden chars
		filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

		# An exception here should be caught in the caller
		stream = open(filename, open_mode)
		return (stream, filename)
107
108
class DownloadError(Exception):
	"""Raised when downloading fails and errors are not being ignored.

	FileDownloader objects throw this exception when they are not
	configured to continue on errors; it carries the appropriate error
	message.
	"""
	pass
117
class SameFileError(Exception):
	"""Raised when several downloads would collide on one output file.

	FileDownloader objects throw this exception when they detect that
	multiple files would have to be written to the same path on disk.
	"""
	pass
125
class PostProcessingError(Exception):
	"""Raised when a postprocessing task fails.

	A PostProcessor's .run() method may raise this exception to signal
	an error in the postprocessing task.
	"""
	pass
133
class UnavailableVideoError(Exception):
	"""Raised when a video is requested in an unavailable format.

	Thrown whenever the format asked for cannot be served for the
	video in question.
	"""
	pass
141
class ContentTooShortError(Exception):
	"""Raised when a download delivers fewer bytes than announced.

	FileDownloader objects raise this when the data actually received
	is smaller than what the server's Content-Length header promised,
	which usually means the connection was interrupted.
	"""
	# Both sizes are expressed in bytes.
	downloaded = None
	expected = None

	def __init__(self, downloaded, expected):
		self.downloaded = downloaded
		self.expected = expected
156
class FileDownloader(object):
	"""File Downloader class.

	File downloader objects are the ones responsible of downloading the
	actual video file and writing it to disk if the user has requested
	it, among some other tasks. In most cases there should be one per
	program. As, given a video URL, the downloader doesn't know how to
	extract all the needed information, task that InfoExtractors do, it
	has to pass the URL to one of them.

	For this, file downloader objects have a method that allows
	InfoExtractors to be registered in a given order. When it is passed
	a URL, the file downloader handles it to the first InfoExtractor it
	finds that reports being able to handle it. The InfoExtractor extracts
	all the information about the video or videos the URL refers to, and
	asks the FileDownloader to process the video information, possibly
	downloading the video.

	File downloaders accept a lot of parameters. In order not to saturate
	the object constructor with arguments, it receives a dictionary of
	options instead. These options are available through the params
	attribute for the InfoExtractors to use. The FileDownloader also
	registers itself as the downloader in charge for the InfoExtractors
	that are added to it, so this is a "mutual registration".

	Available options:

	username:	Username for authentication purposes.
	password:	Password for authentication purposes.
	usenetrc:	Use netrc for authentication instead.
	quiet:		Do not print messages to stdout.
	forceurl:	Force printing final URL.
	forcetitle:	Force printing title.
	simulate:	Do not download the video files.
	format:		Video format code.
	format_limit:	Highest quality format to try.
	outtmpl:	Template for output names.
	ignoreerrors:	Do not stop on download errors.
	ratelimit:	Download speed limit, in bytes/sec.
	nooverwrites:	Prevent overwriting files.
	retries:	Number of times to retry for HTTP error 503
	continuedl:	Try to continue downloads if possible.
	noprogress:	Do not print the progress bar.
	"""

	params = None			# Options dictionary, see "Available options"
	_ies = []			# Registered InfoExtractors, tried in order
	_pps = []			# Registered PostProcessors, run as a chain
	_download_retcode = None	# Overall return code: 0 ok, 1 if a download failed
	_num_downloads = None		# Files downloaded so far; feeds the "ord" template key

	def __init__(self, params):
		"""Create a FileDownloader object with the given options."""
		self._ies = []
		self._pps = []
		self._download_retcode = 0
		self._num_downloads = 0
		self.params = params

	@staticmethod
	def pmkdir(filename):
		"""Create directory components in filename. Similar to Unix "mkdir -p"."""
		components = filename.split(os.sep)
		# Build the list of cumulative path prefixes (all but the last
		# component, which is the file itself).
		aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
		aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
		for dir in aggregate:
			if not os.path.exists(dir):
				os.mkdir(dir)

	@staticmethod
	def format_bytes(bytes):
		"""Format a byte count as a human-readable string, e.g. "1.21M"."""
		if bytes is None:
			return 'N/A'
		if type(bytes) is str:
			bytes = float(bytes)
		if bytes == 0.0:
			exponent = 0
		else:
			# Power of 1024 selects the suffix below.
			exponent = long(math.log(bytes, 1024.0))
		suffix = 'bkMGTPEZY'[exponent]
		converted = float(bytes) / float(1024**exponent)
		return '%.2f%s' % (converted, suffix)

	@staticmethod
	def calc_percent(byte_counter, data_len):
		"""Format download progress as a fixed-width percentage string."""
		if data_len is None:
			return '---.-%'
		return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))

	@staticmethod
	def calc_eta(start, now, total, current):
		"""Estimate remaining download time and format it as "MM:SS"."""
		if total is None:
			return '--:--'
		dif = now - start
		if current == 0 or dif < 0.001: # One millisecond
			return '--:--'
		rate = float(current) / dif
		eta = long((float(total) - float(current)) / rate)
		(eta_mins, eta_secs) = divmod(eta, 60)
		if eta_mins > 99:
			return '--:--'
		return '%02d:%02d' % (eta_mins, eta_secs)

	@staticmethod
	def calc_speed(start, now, bytes):
		"""Format the average download speed since `start`."""
		dif = now - start
		if bytes == 0 or dif < 0.001: # One millisecond
			return '%10s' % '---b/s'
		return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

	@staticmethod
	def best_block_size(elapsed_time, bytes):
		"""Choose the next read size, adapting towards the observed rate."""
		# Never shrink/grow by more than a factor of two per read.
		new_min = max(bytes / 2.0, 1.0)
		new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
		if elapsed_time < 0.001:
			return long(new_max)
		rate = bytes / elapsed_time
		if rate > new_max:
			return long(new_max)
		if rate < new_min:
			return long(new_min)
		return long(rate)

	@staticmethod
	def parse_bytes(bytestr):
		"""Parse a string indicating a byte quantity into a long integer."""
		matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
		if matchobj is None:
			return None
		number = float(matchobj.group(1))
		# Empty suffix matches 'b' at index 0, i.e. multiplier 1.
		multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
		return long(round(number * multiplier))

	@staticmethod
	def verify_url(url):
		"""Verify a URL is valid and data could be downloaded. Return real data URL."""
		request = urllib2.Request(url, None, std_headers)
		data = urllib2.urlopen(request)
		data.read(1)	# Reading one byte proves data is actually served
		url = data.geturl()	# May differ from the requested URL after redirects
		data.close()
		return url

	def add_info_extractor(self, ie):
		"""Add an InfoExtractor object to the end of the list."""
		self._ies.append(ie)
		ie.set_downloader(self)

	def add_post_processor(self, pp):
		"""Add a PostProcessor object to the end of the chain."""
		self._pps.append(pp)
		pp.set_downloader(self)

	def to_stdout(self, message, skip_eol=False, ignore_encoding_errors=False):
		"""Print message to stdout if not in quiet mode."""
		try:
			if not self.params.get('quiet', False):
				print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(preferredencoding()),
			sys.stdout.flush()
		except (UnicodeEncodeError), err:
			if not ignore_encoding_errors:
				raise

	def to_stderr(self, message):
		"""Print message to stderr."""
		print >>sys.stderr, message.encode(preferredencoding())

	def fixed_template(self):
		"""Checks if the output template is fixed.

		Fixed means the template contains no "%(field)s" placeholders,
		so every download would write to the very same file.
		"""
		return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)

	def trouble(self, message=None):
		"""Determine action to take when a download problem appears.

		Depending on if the downloader has been configured to ignore
		download errors or not, this method may throw an exception or
		not when errors are found, after printing the message.
		"""
		if message is not None:
			self.to_stderr(message)
		if not self.params.get('ignoreerrors', False):
			raise DownloadError(message)
		self._download_retcode = 1

	def slow_down(self, start_time, byte_counter):
		"""Sleep if the download speed is over the rate limit."""
		rate_limit = self.params.get('ratelimit', None)
		if rate_limit is None or byte_counter == 0:
			return
		now = time.time()
		elapsed = now - start_time
		if elapsed <= 0.0:
			return
		speed = float(byte_counter) / elapsed
		if speed > rate_limit:
			# Sleep just long enough for the average speed to drop back
			# to the configured limit.
			time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)

	def report_destination(self, filename):
		"""Report destination filename."""
		self.to_stdout(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)

	def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
		"""Report download progress."""
		if self.params.get('noprogress', False):
			return
		self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
				(percent_str, data_len_str, speed_str, eta_str), skip_eol=True)

	def report_resuming_byte(self, resume_len):
		"""Report attempt to resume at given byte."""
		self.to_stdout(u'[download] Resuming download at byte %s' % resume_len)

	def report_retry(self, count, retries):
		"""Report retry in case of HTTP error 503"""
		self.to_stdout(u'[download] Got HTTP error 503. Retrying (attempt %d of %d)...' % (count, retries))

	def report_file_already_downloaded(self, file_name):
		"""Report file has already been fully downloaded."""
		try:
			self.to_stdout(u'[download] %s has already been downloaded' % file_name)
		except (UnicodeEncodeError), err:
			# File name cannot be represented in the console encoding;
			# fall back to a generic message.
			self.to_stdout(u'[download] The file has already been downloaded')

	def report_unable_to_resume(self):
		"""Report it was impossible to resume download."""
		self.to_stdout(u'[download] Unable to resume')

	def report_finish(self):
		"""Report download finished."""
		if self.params.get('noprogress', False):
			self.to_stdout(u'[download] Download completed')
		else:
			# The progress line already shows the final state; just end it.
			self.to_stdout(u'')

	def increment_downloads(self):
		"""Increment the ordinal that assigns a number to each file."""
		self._num_downloads += 1

	def process_info(self, info_dict):
		"""Process a single dictionary returned by an InfoExtractor."""
		# Do nothing else if in simulate mode
		if self.params.get('simulate', False):
			# Verify URL if it's an HTTP one
			if info_dict['url'].startswith('http'):
				try:
					self.verify_url(info_dict['url'].encode('utf-8')).decode('utf-8')
				except (OSError, IOError, urllib2.URLError, httplib.HTTPException, socket.error), err:
					raise UnavailableVideoError

			# Forced printings
			if self.params.get('forcetitle', False):
				print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forceurl', False):
				print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
				print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forcedescription', False) and 'description' in info_dict:
				print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')

			return

		try:
			template_dict = dict(info_dict)
			# Extra template fields: current epoch and a per-run ordinal.
			template_dict['epoch'] = unicode(long(time.time()))
			template_dict['ord'] = unicode('%05d' % self._num_downloads)
			filename = self.params['outtmpl'] % template_dict
		except (ValueError, KeyError), err:
			self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
		if self.params.get('nooverwrites', False) and os.path.exists(filename):
			self.to_stderr(u'WARNING: file exists: %s; skipping' % filename)
			return

		try:
			self.pmkdir(filename)
		except (OSError, IOError), err:
			self.trouble('ERROR: unable to create directories: %s' % str(err))
			return

		try:
			success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
		except (OSError, IOError), err:
			raise UnavailableVideoError
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self.trouble('ERROR: unable to download video data: %s' % str(err))
			return
		except (ContentTooShortError, ), err:
			self.trouble('ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
			return

		if success:
			try:
				self.post_process(filename, info_dict)
			except (PostProcessingError), err:
				self.trouble('ERROR: postprocessing: %s' % str(err))
				return

	def download(self, url_list):
		"""Download a given list of URLs."""
		# With a fixed template every URL would overwrite the same file.
		if len(url_list) > 1 and self.fixed_template():
			raise SameFileError(self.params['outtmpl'])

		for url in url_list:
			suitable_found = False
			for ie in self._ies:
				# Go to next InfoExtractor if not suitable
				if not ie.suitable(url):
					continue

				# Suitable InfoExtractor found
				suitable_found = True

				# Extract information from URL and process it
				ie.extract(url)

				# Suitable InfoExtractor had been found; go to next URL
				break

			if not suitable_found:
				self.trouble('ERROR: no suitable InfoExtractor: %s' % url)

		return self._download_retcode

	def post_process(self, filename, ie_info):
		"""Run the postprocessing chain on the given file.

		Each PostProcessor receives the info dict (with "filepath" added)
		and may return a new dict for the next one, or None to stop.
		"""
		info = dict(ie_info)
		info['filepath'] = filename
		for pp in self._pps:
			info = pp.run(info)
			if info is None:
				break

	def _download_with_rtmpdump(self, filename, url, player_url):
		"""Download an rtmp:// URL by driving the external rtmpdump tool."""
		self.report_destination(filename)

		# Check for rtmpdump first
		try:
			subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
		except (OSError, IOError):
			self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
			return False

		# Download using rtmpdump. rtmpdump returns exit code 2 when
		# the connection was interrumpted and resuming appears to be
		# possible. This is part of rtmpdump's normal usage, AFAIK.
		basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', filename]
		retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
		while retval == 2 or retval == 1:
			prevsize = os.path.getsize(filename)
			self.to_stdout(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
			time.sleep(5.0) # This seems to be needed
			retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
			cursize = os.path.getsize(filename)
			# Give up if resuming made no progress.
			if prevsize == cursize and retval == 1:
				break
		if retval == 0:
			self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename))
			return True
		else:
			self.trouble('\nERROR: rtmpdump exited with code %d' % retval)
			return False

	def _do_download(self, filename, url, player_url):
		"""Download url to filename, resuming and retrying as configured.

		Returns True on success; may raise ContentTooShortError if the
		server served less data than its Content-Length announced.
		"""
		# Attempt to download using rtmpdump
		if url.startswith('rtmp'):
			return self._download_with_rtmpdump(filename, url, player_url)

		stream = None
		open_mode = 'wb'
		# basic_request has no Range header; used as fallback when the
		# server rejects the resume request below.
		basic_request = urllib2.Request(url, None, std_headers)
		request = urllib2.Request(url, None, std_headers)

		# Establish possible resume length
		if os.path.isfile(filename):
			resume_len = os.path.getsize(filename)
		else:
			resume_len = 0

		# Request parameters in case of being able to resume
		if self.params.get('continuedl', False) and resume_len != 0:
			self.report_resuming_byte(resume_len)
			request.add_header('Range','bytes=%d-' % resume_len)
			open_mode = 'ab'	# Append to the partial file

		count = 0
		retries = self.params.get('retries', 0)
		while True:
			# Establish connection
			try:
				data = urllib2.urlopen(request)
				break
			except (urllib2.HTTPError, ), err:
				if err.code == 503:
					# Retry in case of HTTP error 503
					count += 1
					if count <= retries:
						self.report_retry(count, retries)
						continue
				if err.code != 416: #  416 is 'Requested range not satisfiable'
					raise
				# Unable to resume
				data = urllib2.urlopen(basic_request)
				content_length = data.info()['Content-Length']

				if content_length is not None and long(content_length) == resume_len:
					# Because the file had already been fully downloaded
					self.report_file_already_downloaded(filename)
					return True
				else:
					# Because the server didn't let us
					self.report_unable_to_resume()
					open_mode = 'wb'

		data_len = data.info().get('Content-length', None)
		data_len_str = self.format_bytes(data_len)
		byte_counter = 0
		block_size = 1024
		start = time.time()
		while True:
			# Download and write
			before = time.time()
			data_block = data.read(block_size)
			after = time.time()
			data_block_len = len(data_block)
			if data_block_len == 0:
				break
			byte_counter += data_block_len

			# Open file just in time
			if stream is None:
				try:
					(stream, filename) = sanitize_open(filename, open_mode)
					self.report_destination(filename)
				except (OSError, IOError), err:
					self.trouble('ERROR: unable to open for writing: %s' % str(err))
					return False
			try:
				stream.write(data_block)
			except (IOError, OSError), err:
				self.trouble('\nERROR: unable to write data: %s' % str(err))
			# Adapt block size to the measured throughput.
			block_size = self.best_block_size(after - before, data_block_len)

			# Progress message
			percent_str = self.calc_percent(byte_counter, data_len)
			eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
			speed_str = self.calc_speed(start, time.time(), byte_counter)
			self.report_progress(percent_str, data_len_str, speed_str, eta_str)

			# Apply rate limit
			self.slow_down(start, byte_counter)

		self.report_finish()
		# data_len is a header string, so compare string representations.
		if data_len is not None and str(byte_counter) != data_len:
			raise ContentTooShortError(byte_counter, long(data_len))
		return True
611
class InfoExtractor(object):
	"""Information Extractor class.

	Given a URL, an information extractor pulls out everything the
	FileDownloader needs to know about the video (or videos) the URL
	refers to: the real video URL, the literal and simplified titles,
	the uploader and so on. The extracted data is handed over as a
	dictionary, which the FileDownloader then processes, possibly
	downloading the video to the file system. Each dictionary must
	carry the following fields:

	id:		Video identifier.
	url:		Final video URL.
	uploader:	Nickname of the video uploader.
	title:		Literal title.
	stitle:		Simplified title.
	ext:		Video filename extension.
	format:		Video format.
	player_url:	SWF Player URL (may be None).

	The following fields are optional. Their primary purpose is to allow
	youtube-dl to serve as the backend for a video search function, such
	as the one in youtube2mp3. They are only used when their respective
	forced printing functions are called:

	thumbnail:	Full URL to a video thumbnail image.
	description:	One-line video description.

	Subclasses should override the _real_initialize() and _real_extract()
	methods, as well as the suitable() static method, and will usually be
	instantiated and registered with the main downloader.
	"""

	_ready = False		# Becomes True once _real_initialize() has run
	_downloader = None	# FileDownloader in charge of this extractor

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		self._ready = False
		self.set_downloader(downloader)

	@staticmethod
	def suitable(url):
		"""Receives a URL and returns True if suitable for this IE."""
		return False

	def set_downloader(self, downloader):
		"""Sets the downloader for this IE."""
		self._downloader = downloader

	def initialize(self):
		"""Initializes an instance (authentication, etc). Runs at most once."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Extracts URL information and returns it in list of dicts."""
		self.initialize()
		return self._real_extract(url)

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass
682
class YoutubeIE(InfoExtractor):
	"""Information extractor for youtube.com."""

	# Group 1 captures the (optional) URL prefix, group 2 the video id; the
	# conditional (?(1).+)? only permits trailing characters when a prefix
	# matched, so a bare video id is also accepted.
	_VALID_URL = r'^((?:http://)?(?:youtu\.be/|(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:_popup)?(?:\.php)?)?[\?#](?:.+&)?v=))))?([0-9A-Za-z_-]+)(?(1).+)?$'
	_LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
	_LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
	_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
	_NETRC_MACHINE = 'youtube'
	# Listed in order of quality
	_available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
	# Maps a format code to its filename extension; formats not listed here
	# fall back to 'flv' in _real_extract().
	_video_extensions = {
		'13': '3gp',
		'17': 'mp4',
		'18': 'mp4',
		'22': 'mp4',
		'37': 'mp4',
		'38': 'video', # You actually don't know if this will be MOV, AVI or whatever
		'43': 'webm',
		'45': 'webm',
	}

	@staticmethod
	def suitable(url):
		"""Return True if this IE can handle the given URL."""
		return (re.match(YoutubeIE._VALID_URL, url) is not None)

	def report_lang(self):
		"""Report attempt to set language."""
		self._downloader.to_stdout(u'[youtube] Setting language')

	def report_login(self):
		"""Report attempt to log in."""
		self._downloader.to_stdout(u'[youtube] Logging in')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_stdout(u'[youtube] Confirming age')

	def report_video_webpage_download(self, video_id):
		"""Report attempt to download video webpage."""
		self._downloader.to_stdout(u'[youtube] %s: Downloading video webpage' % video_id)

	def report_video_info_webpage_download(self, video_id):
		"""Report attempt to download video info webpage."""
		self._downloader.to_stdout(u'[youtube] %s: Downloading video info webpage' % video_id)

	def report_information_extraction(self, video_id):
		"""Report attempt to extract video information."""
		self._downloader.to_stdout(u'[youtube] %s: Extracting video information' % video_id)

	def report_unavailable_format(self, video_id, format):
		"""Report that the given format is not available for the video."""
		self._downloader.to_stdout(u'[youtube] %s: Format %s not available' % (video_id, format))

	def report_rtmp_download(self):
		"""Indicate the download will use the RTMP protocol."""
		self._downloader.to_stdout(u'[youtube] RTMP download detected')

	def _real_initialize(self):
		# Without a downloader there is nowhere to report progress/errors,
		# and nothing to read options from, so skip initialization entirely.
		if self._downloader is None:
			return

		username = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					username = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		# Set language
		request = urllib2.Request(self._LANG_URL, None, std_headers)
		try:
			self.report_lang()
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
			return

		# No authentication to be performed
		if username is None:
			return

		# Log in
		login_form = {
				'current_form': 'loginForm',
				'next':		'/',
				'action_login':	'Log In',
				'username':	username,
				'password':	password,
				}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# If the response still contains the login form, the
			# credentials were rejected.
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return

		# Confirm age
		age_form = {
				'next_url':		'/',
				'action_confirm':	'Confirm',
				}
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
		try:
			self.report_age_confirmation()
			age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return

	def _real_extract(self, url):
		# Extract video id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		# group(2) holds the video id (group(1) is the optional URL prefix).
		video_id = mobj.group(2)

		# Get video webpage
		self.report_video_webpage_download(video_id)
		request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en' % video_id, None, std_headers)
		try:
			video_webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
			return

		# Attempt to extract SWF player URL
		mobj = re.search(r'swfConfig.*"(http://.*?watch.*?-.*?\.swf)"', video_webpage)
		if mobj is not None:
			player_url = mobj.group(1)
		else:
			player_url = None

		# Get video info
		# Try several 'el' parameters in turn, stopping at the first
		# response that carries a session token.
		# NOTE(review): if no response contains 'token', the last fetched
		# video_info is used unchecked — downstream lookups then rely on
		# the other keys being present.
		self.report_video_info_webpage_download(video_id)
		for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
			video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
					   % (video_id, el_type))
			request = urllib2.Request(video_info_url, None, std_headers)
			try:
				video_info_webpage = urllib2.urlopen(request).read()
				video_info = parse_qs(video_info_webpage)
				if 'token' in video_info:
					break
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
				return
		self.report_information_extraction(video_id)

		# uploader
		if 'author' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = urllib.unquote_plus(video_info['author'][0])

		# title
		if 'title' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract video title')
			return
		video_title = urllib.unquote_plus(video_info['title'][0])
		video_title = video_title.decode('utf-8')
		video_title = sanitize_title(video_title)

		# simplified title: collapse every run of non-alphanumeric
		# characters into a single underscore.
		simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
		simple_title = simple_title.strip(ur'_')

		# thumbnail image
		if 'thumbnail_url' not in video_info:
			self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
			video_thumbnail = ''
		else:	# don't panic if we can't find it
			video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

		# description (only fetched when the user forces its printing)
		video_description = 'No description available.'
		if self._downloader.params.get('forcedescription', False):
			mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
			if mobj is not None:
				video_description = mobj.group(1)

		# Decide which formats to download
		if 'fmt_url_map' in video_info:
			# fmt_url_map is a comma-separated list of "format|url" pairs.
			url_map = dict(tuple(pair.split('|')) for pair in video_info['fmt_url_map'][0].split(','))
			# format_limit caps quality: only formats at or below it are eligible.
			format_limit = self._downloader.params.get('format_limit', None)
			if format_limit is not None and format_limit in self._available_formats:
				format_list = self._available_formats[self._available_formats.index(format_limit):]
			else:
				format_list = self._available_formats
			existing_formats = [x for x in format_list if x in url_map]
			if len(existing_formats) == 0:
				self._downloader.trouble(u'ERROR: no known formats available for video')
				return
			requested_format = self._downloader.params.get('format', None)
			if requested_format is None:
				video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
			elif requested_format == '-1':
				video_url_list = url_map.items() # All formats
			else:
				if requested_format not in existing_formats:
					self._downloader.trouble(u'ERROR: format not available for video')
					return
				video_url_list = [(requested_format, url_map[requested_format])] # Specific format
		elif 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
			self.report_rtmp_download()
			video_url_list = [(None, video_info['conn'][0])]
		else:
			self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')
			return

		for format_param, video_real_url in video_url_list:
			# At this point we have a new video
			self._downloader.increment_downloads()

			# Extension
			video_extension = self._video_extensions.get(format_param, 'flv')

			# Find the video URL in fmt_url_map or conn paramters
			try:
				# Process video information
				self._downloader.process_info({
					'id':		video_id.decode('utf-8'),
					'url':		video_real_url.decode('utf-8'),
					'uploader':	video_uploader.decode('utf-8'),
					'title':	video_title,
					'stitle':	simple_title,
					'ext':		video_extension.decode('utf-8'),
					'format':	(format_param is None and u'NA' or format_param.decode('utf-8')),
					'thumbnail':	video_thumbnail.decode('utf-8'),
					'description':	video_description.decode('utf-8'),
					'player_url':	player_url,
				})
			except UnavailableVideoError, err:
				self._downloader.trouble(u'ERROR: unable to download video')
934
935
class MetacafeIE(InfoExtractor):
	"""Information Extractor for metacafe.com."""

	# Group 1 is the numeric video id, group 2 the title slug.
	_VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
	_DISCLAIMER = 'http://www.metacafe.com/family_filter/'
	_FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
	# YoutubeIE instance used to delegate "yt-" prefixed video ids.
	_youtube_ie = None

	def __init__(self, youtube_ie, downloader=None):
		"""Constructor. Requires a YoutubeIE for delegation, plus an optional downloader."""
		InfoExtractor.__init__(self, downloader)
		self._youtube_ie = youtube_ie

	@staticmethod
	def suitable(url):
		"""Return True if this IE can handle the given URL."""
		return (re.match(MetacafeIE._VALID_URL, url) is not None)

	def report_disclaimer(self):
		"""Report disclaimer retrieval."""
		self._downloader.to_stdout(u'[metacafe] Retrieving disclaimer')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_stdout(u'[metacafe] Confirming age')

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_stdout(u'[metacafe] %s: Extracting information' % video_id)

	def _real_initialize(self):
		# Retrieve disclaimer (establishes the session the filter POST needs)
		request = urllib2.Request(self._DISCLAIMER, None, std_headers)
		try:
			self.report_disclaimer()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
			return

		# Confirm age
		disclaimer_form = {
			'filters': '0',
			'submit': "Continue - I'm over 18",
			}
		request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
		try:
			self.report_age_confirmation()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return

	def _real_extract(self, url):
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		video_id = mobj.group(1)

		# Check if video comes from YouTube; "yt-<id>" ids are mirrors,
		# so delegate the whole extraction to the YouTube IE.
		mobj2 = re.match(r'^yt-(.*)$', video_id)
		if mobj2 is not None:
			self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
			return

		# At this point we have a new video
		self._downloader.increment_downloads()

		# group(2) is the URL's title slug, reused as the simplified title.
		simple_title = mobj.group(2).decode('utf-8')
		video_extension = 'flv'

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		mediaURL = urllib.unquote(mobj.group(1))

		# The gdaKey signing step below is currently disabled; the plain
		# media URL appears to be sufficient.
		#mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
		#if mobj is None:
		#	self._downloader.trouble(u'ERROR: unable to extract gdaKey')
		#	return
		#gdaKey = mobj.group(1)
		#
		#video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)

		video_url = mediaURL

		mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)

		mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = mobj.group(1)

		try:
			# Process video information
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	video_uploader.decode('utf-8'),
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	u'NA',
				'player_url':	None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'ERROR: unable to download video')
1066
1067
class DailymotionIE(InfoExtractor):
	"""Information Extractor for Dailymotion"""

	# Group 1 is the video id, group 2 the title slug after the underscore.
	_VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		InfoExtractor.__init__(self, downloader)

	@staticmethod
	def suitable(url):
		"""Return True if this IE can handle the given URL."""
		return (re.match(DailymotionIE._VALID_URL, url) is not None)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_stdout(u'[dailymotion] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_stdout(u'[dailymotion] %s: Extracting information' % video_id)

	def _real_initialize(self):
		# No session setup needed for Dailymotion.
		return

	def _real_extract(self, url):
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(1)

		# The URL's title slug doubles as the simplified title.
		simple_title = mobj.group(2).decode('utf-8')
		video_extension = 'flv'

		# Retrieve video webpage to extract further information
		request = urllib2.Request(url)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader and title from webpage.  The media URL is
		# the "video" variable handed to the flash player via addVariable.
		self.report_extraction(video_id)
		mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		mediaURL = urllib.unquote(mobj.group(1))

		# if needed add http://www.dailymotion.com/ if relative URL

		video_url = mediaURL

		# '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
		mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)

		mobj = re.search(r'(?im)<div class="dmco_html owner">.*?<a class="name" href="/.+?">(.+?)</a></div>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = mobj.group(1)

		try:
			# Process video information
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	video_uploader.decode('utf-8'),
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	u'NA',
				'player_url':	None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'ERROR: unable to download video')
1154
class GoogleIE(InfoExtractor):
	"""Information extractor for video.google.com."""

	# Group 1 is the docid query parameter (the video id).
	_VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		InfoExtractor.__init__(self, downloader)

	@staticmethod
	def suitable(url):
		"""Return True if this IE can handle the given URL."""
		return (re.match(GoogleIE._VALID_URL, url) is not None)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_stdout(u'[video.google] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_stdout(u'[video.google] %s: Extracting information' % video_id)

	def _real_initialize(self):
		# No session setup needed for Google Video.
		return

	def _real_extract(self, url):
		# Extract id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
			return

		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(1)

		video_extension = 'mp4'

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader, and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r"download_url:'([^']+)'", webpage)
		if mobj is None:
			# No direct mp4 download link; fall back to the escaped
			# flash URL embedded in the page.
			video_extension = 'flv'
			mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		mediaURL = urllib.unquote(mobj.group(1))
		# Undo the page's JS escaping: \x3d is '=' and \x26 is '&'.
		mediaURL = mediaURL.replace('\\x3d', '\x3d')
		mediaURL = mediaURL.replace('\\x26', '\x26')

		video_url = mediaURL

		mobj = re.search(r'<title>(.*)</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)
		simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

		# Extract video description
		mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract video description')
			return
		video_description = mobj.group(1).decode('utf-8')
		if not video_description:
			video_description = 'No description available.'

		# Extract video thumbnail (requires a second page fetch, so only
		# done when the user explicitly asks for the thumbnail)
		if self._downloader.params.get('forcethumbnail', False):
			request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
			try:
				webpage = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
				return
			mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
				return
			video_thumbnail = mobj.group(1)
		else:	# we need something to pass to process_info
			video_thumbnail = ''


		try:
			# Process video information
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	u'NA',
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	u'NA',
				'player_url':	None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'ERROR: unable to download video')
1263
1264
1265 class PhotobucketIE(InfoExtractor):
1266         """Information extractor for photobucket.com."""
1267
1268         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1269
1270         def __init__(self, downloader=None):
1271                 InfoExtractor.__init__(self, downloader)
1272
1273         @staticmethod
1274         def suitable(url):
1275                 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1276
1277         def report_download_webpage(self, video_id):
1278                 """Report webpage download."""
1279                 self._downloader.to_stdout(u'[photobucket] %s: Downloading webpage' % video_id)
1280
1281         def report_extraction(self, video_id):
1282                 """Report information extraction."""
1283                 self._downloader.to_stdout(u'[photobucket] %s: Extracting information' % video_id)
1284
1285         def _real_initialize(self):
1286                 return
1287
1288         def _real_extract(self, url):
1289                 # Extract id from URL
1290                 mobj = re.match(self._VALID_URL, url)
1291                 if mobj is None:
1292                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1293                         return
1294
1295                 # At this point we have a new video
1296                 self._downloader.increment_downloads()
1297                 video_id = mobj.group(1)
1298
1299                 video_extension = 'flv'
1300
1301                 # Retrieve video webpage to extract further information
1302                 request = urllib2.Request(url)
1303                 try:
1304                         self.report_download_webpage(video_id)
1305                         webpage = urllib2.urlopen(request).read()
1306                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1307                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1308                         return
1309
1310                 # Extract URL, uploader, and title from webpage
1311                 self.report_extraction(video_id)
1312                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1313                 if mobj is None:
1314                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1315                         return
1316                 mediaURL = urllib.unquote(mobj.group(1))
1317
1318                 video_url = mediaURL
1319
1320                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1321                 if mobj is None:
1322                         self._downloader.trouble(u'ERROR: unable to extract title')
1323                         return
1324                 video_title = mobj.group(1).decode('utf-8')
1325                 video_title = sanitize_title(video_title)
1326                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1327
1328                 video_uploader = mobj.group(2).decode('utf-8')
1329
1330                 try:
1331                         # Process video information
1332                         self._downloader.process_info({
1333                                 'id':           video_id.decode('utf-8'),
1334                                 'url':          video_url.decode('utf-8'),
1335                                 'uploader':     video_uploader,
1336                                 'title':        video_title,
1337                                 'stitle':       simple_title,
1338                                 'ext':          video_extension.decode('utf-8'),
1339                                 'format':       u'NA',
1340                                 'player_url':   None,
1341                         })
1342                 except UnavailableVideoError:
1343                         self._downloader.trouble(u'ERROR: unable to download video')
1344
1345
1346 class YahooIE(InfoExtractor):
1347         """Information extractor for video.yahoo.com."""
1348
1349         # _VALID_URL matches all Yahoo! Video URLs
1350         # _VPAGE_URL matches only the extractable '/watch/' URLs
1351         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1352         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1353
1354         def __init__(self, downloader=None):
1355                 InfoExtractor.__init__(self, downloader)
1356
1357         @staticmethod
1358         def suitable(url):
1359                 return (re.match(YahooIE._VALID_URL, url) is not None)
1360
1361         def report_download_webpage(self, video_id):
1362                 """Report webpage download."""
1363                 self._downloader.to_stdout(u'[video.yahoo] %s: Downloading webpage' % video_id)
1364
1365         def report_extraction(self, video_id):
1366                 """Report information extraction."""
1367                 self._downloader.to_stdout(u'[video.yahoo] %s: Extracting information' % video_id)
1368
1369         def _real_initialize(self):
1370                 return
1371
1372         def _real_extract(self, url, new_video=True):
1373                 # Extract ID from URL
1374                 mobj = re.match(self._VALID_URL, url)
1375                 if mobj is None:
1376                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1377                         return
1378
1379                 # At this point we have a new video
1380                 self._downloader.increment_downloads()
1381                 video_id = mobj.group(2)
1382                 video_extension = 'flv'
1383
1384                 # Rewrite valid but non-extractable URLs as
1385                 # extractable English language /watch/ URLs
1386                 if re.match(self._VPAGE_URL, url) is None:
1387                         request = urllib2.Request(url)
1388                         try:
1389                                 webpage = urllib2.urlopen(request).read()
1390                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1391                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1392                                 return
1393
1394                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1395                         if mobj is None:
1396                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1397                                 return
1398                         yahoo_id = mobj.group(1)
1399
1400                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1401                         if mobj is None:
1402                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1403                                 return
1404                         yahoo_vid = mobj.group(1)
1405
1406                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1407                         return self._real_extract(url, new_video=False)
1408
1409                 # Retrieve video webpage to extract further information
1410                 request = urllib2.Request(url)
1411                 try:
1412                         self.report_download_webpage(video_id)
1413                         webpage = urllib2.urlopen(request).read()
1414                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1415                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1416                         return
1417
1418                 # Extract uploader and title from webpage
1419                 self.report_extraction(video_id)
1420                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1421                 if mobj is None:
1422                         self._downloader.trouble(u'ERROR: unable to extract video title')
1423                         return
1424                 video_title = mobj.group(1).decode('utf-8')
1425                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1426
1427                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1428                 if mobj is None:
1429                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1430                         return
1431                 video_uploader = mobj.group(1).decode('utf-8')
1432
1433                 # Extract video thumbnail
1434                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1435                 if mobj is None:
1436                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1437                         return
1438                 video_thumbnail = mobj.group(1).decode('utf-8')
1439
1440                 # Extract video description
1441                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1442                 if mobj is None:
1443                         self._downloader.trouble(u'ERROR: unable to extract video description')
1444                         return
1445                 video_description = mobj.group(1).decode('utf-8')
1446                 if not video_description: video_description = 'No description available.'
1447
1448                 # Extract video height and width
1449                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1450                 if mobj is None:
1451                         self._downloader.trouble(u'ERROR: unable to extract video height')
1452                         return
1453                 yv_video_height = mobj.group(1)
1454
1455                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1456                 if mobj is None:
1457                         self._downloader.trouble(u'ERROR: unable to extract video width')
1458                         return
1459                 yv_video_width = mobj.group(1)
1460
1461                 # Retrieve video playlist to extract media URL
1462                 # I'm not completely sure what all these options are, but we
1463                 # seem to need most of them, otherwise the server sends a 401.
1464                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1465                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1466                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1467                                           '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1468                                           '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1469                 try:
1470                         self.report_download_webpage(video_id)
1471                         webpage = urllib2.urlopen(request).read()
1472                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1473                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1474                         return
1475
1476                 # Extract media URL from playlist XML
1477                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1478                 if mobj is None:
1479                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1480                         return
1481                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1482                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1483
1484                 try:
1485                         # Process video information
1486                         self._downloader.process_info({
1487                                 'id':           video_id.decode('utf-8'),
1488                                 'url':          video_url,
1489                                 'uploader':     video_uploader,
1490                                 'title':        video_title,
1491                                 'stitle':       simple_title,
1492                                 'ext':          video_extension.decode('utf-8'),
1493                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1494                                 'description':  video_description,
1495                                 'thumbnail':    video_thumbnail,
1496                                 'description':  video_description,
1497                                 'player_url':   None,
1498                         })
1499                 except UnavailableVideoError:
1500                         self._downloader.trouble(u'ERROR: unable to download video')
1501
1502
1503 class GenericIE(InfoExtractor):
1504         """Generic last-resort information extractor."""
1505
1506         def __init__(self, downloader=None):
1507                 InfoExtractor.__init__(self, downloader)
1508
1509         @staticmethod
1510         def suitable(url):
1511                 return True
1512
1513         def report_download_webpage(self, video_id):
1514                 """Report webpage download."""
1515                 self._downloader.to_stdout(u'WARNING: Falling back on generic information extractor.')
1516                 self._downloader.to_stdout(u'[generic] %s: Downloading webpage' % video_id)
1517
1518         def report_extraction(self, video_id):
1519                 """Report information extraction."""
1520                 self._downloader.to_stdout(u'[generic] %s: Extracting information' % video_id)
1521
1522         def _real_initialize(self):
1523                 return
1524
1525         def _real_extract(self, url):
1526                 # At this point we have a new video
1527                 self._downloader.increment_downloads()
1528
1529                 video_id = url.split('/')[-1]
1530                 request = urllib2.Request(url)
1531                 try:
1532                         self.report_download_webpage(video_id)
1533                         webpage = urllib2.urlopen(request).read()
1534                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1535                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1536                         return
1537                 except ValueError, err:
1538                         # since this is the last-resort InfoExtractor, if
1539                         # this error is thrown, it'll be thrown here
1540                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1541                         return
1542
1543                 # Start with something easy: JW Player in SWFObject
1544                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1545                 if mobj is None:
1546                         # Broaden the search a little bit
1547                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1548                 if mobj is None:
1549                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1550                         return
1551
1552                 # It's possible that one of the regexes
1553                 # matched, but returned an empty group:
1554                 if mobj.group(1) is None:
1555                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1556                         return
1557
1558                 video_url = urllib.unquote(mobj.group(1))
1559                 video_id  = os.path.basename(video_url)
1560
1561                 # here's a fun little line of code for you:
1562                 video_extension = os.path.splitext(video_id)[1][1:]
1563                 video_id        = os.path.splitext(video_id)[0]
1564
1565                 # it's tempting to parse this further, but you would
1566                 # have to take into account all the variations like
1567                 #   Video Title - Site Name
1568                 #   Site Name | Video Title
1569                 #   Video Title - Tagline | Site Name
1570                 # and so on and so forth; it's just not practical
1571                 mobj = re.search(r'<title>(.*)</title>', webpage)
1572                 if mobj is None:
1573                         self._downloader.trouble(u'ERROR: unable to extract title')
1574                         return
1575                 video_title = mobj.group(1).decode('utf-8')
1576                 video_title = sanitize_title(video_title)
1577                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1578
1579                 # video uploader is domain name
1580                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1581                 if mobj is None:
1582                         self._downloader.trouble(u'ERROR: unable to extract title')
1583                         return
1584                 video_uploader = mobj.group(1).decode('utf-8')
1585
1586                 try:
1587                         # Process video information
1588                         self._downloader.process_info({
1589                                 'id':           video_id.decode('utf-8'),
1590                                 'url':          video_url.decode('utf-8'),
1591                                 'uploader':     video_uploader,
1592                                 'title':        video_title,
1593                                 'stitle':       simple_title,
1594                                 'ext':          video_extension.decode('utf-8'),
1595                                 'format':       u'NA',
1596                                 'player_url':   None,
1597                         })
1598                 except UnavailableVideoError, err:
1599                         self._downloader.trouble(u'ERROR: unable to download video')
1600
1601
1602 class YoutubeSearchIE(InfoExtractor):
1603         """Information Extractor for YouTube search queries."""
1604         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1605         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1606         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1607         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1608         _youtube_ie = None
1609         _max_youtube_results = 1000
1610
1611         def __init__(self, youtube_ie, downloader=None):
1612                 InfoExtractor.__init__(self, downloader)
1613                 self._youtube_ie = youtube_ie
1614         
1615         @staticmethod
1616         def suitable(url):
1617                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1618
1619         def report_download_page(self, query, pagenum):
1620                 """Report attempt to download playlist page with given number."""
1621                 query = query.decode(preferredencoding())
1622                 self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1623
1624         def _real_initialize(self):
1625                 self._youtube_ie.initialize()
1626         
1627         def _real_extract(self, query):
1628                 mobj = re.match(self._VALID_QUERY, query)
1629                 if mobj is None:
1630                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1631                         return
1632
1633                 prefix, query = query.split(':')
1634                 prefix = prefix[8:]
1635                 query  = query.encode('utf-8')
1636                 if prefix == '':
1637                         self._download_n_results(query, 1)
1638                         return
1639                 elif prefix == 'all':
1640                         self._download_n_results(query, self._max_youtube_results)
1641                         return
1642                 else:
1643                         try:
1644                                 n = long(prefix)
1645                                 if n <= 0:
1646                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1647                                         return
1648                                 elif n > self._max_youtube_results:
1649                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
1650                                         n = self._max_youtube_results
1651                                 self._download_n_results(query, n)
1652                                 return
1653                         except ValueError: # parsing prefix as integer fails
1654                                 self._download_n_results(query, 1)
1655                                 return
1656
1657         def _download_n_results(self, query, n):
1658                 """Downloads a specified number of results for a query"""
1659
1660                 video_ids = []
1661                 already_seen = set()
1662                 pagenum = 1
1663
1664                 while True:
1665                         self.report_download_page(query, pagenum)
1666                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1667                         request = urllib2.Request(result_url, None, std_headers)
1668                         try:
1669                                 page = urllib2.urlopen(request).read()
1670                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1671                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1672                                 return
1673
1674                         # Extract video identifiers
1675                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1676                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1677                                 if video_id not in already_seen:
1678                                         video_ids.append(video_id)
1679                                         already_seen.add(video_id)
1680                                         if len(video_ids) == n:
1681                                                 # Specified n videos reached
1682                                                 for id in video_ids:
1683                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1684                                                 return
1685
1686                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1687                                 for id in video_ids:
1688                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1689                                 return
1690
1691                         pagenum = pagenum + 1
1692
1693 class GoogleSearchIE(InfoExtractor):
1694         """Information Extractor for Google Video search queries."""
1695         _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
1696         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1697         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
1698         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
1699         _google_ie = None
1700         _max_google_results = 1000
1701
1702         def __init__(self, google_ie, downloader=None):
1703                 InfoExtractor.__init__(self, downloader)
1704                 self._google_ie = google_ie
1705         
1706         @staticmethod
1707         def suitable(url):
1708                 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
1709
1710         def report_download_page(self, query, pagenum):
1711                 """Report attempt to download playlist page with given number."""
1712                 query = query.decode(preferredencoding())
1713                 self._downloader.to_stdout(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1714
1715         def _real_initialize(self):
1716                 self._google_ie.initialize()
1717         
1718         def _real_extract(self, query):
1719                 mobj = re.match(self._VALID_QUERY, query)
1720                 if mobj is None:
1721                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1722                         return
1723
1724                 prefix, query = query.split(':')
1725                 prefix = prefix[8:]
1726                 query  = query.encode('utf-8')
1727                 if prefix == '':
1728                         self._download_n_results(query, 1)
1729                         return
1730                 elif prefix == 'all':
1731                         self._download_n_results(query, self._max_google_results)
1732                         return
1733                 else:
1734                         try:
1735                                 n = long(prefix)
1736                                 if n <= 0:
1737                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1738                                         return
1739                                 elif n > self._max_google_results:
1740                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)'  % (self._max_google_results, n))
1741                                         n = self._max_google_results
1742                                 self._download_n_results(query, n)
1743                                 return
1744                         except ValueError: # parsing prefix as integer fails
1745                                 self._download_n_results(query, 1)
1746                                 return
1747
1748         def _download_n_results(self, query, n):
1749                 """Downloads a specified number of results for a query"""
1750
1751                 video_ids = []
1752                 already_seen = set()
1753                 pagenum = 1
1754
1755                 while True:
1756                         self.report_download_page(query, pagenum)
1757                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1758                         request = urllib2.Request(result_url, None, std_headers)
1759                         try:
1760                                 page = urllib2.urlopen(request).read()
1761                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1762                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1763                                 return
1764
1765                         # Extract video identifiers
1766                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1767                                 video_id = mobj.group(1)
1768                                 if video_id not in already_seen:
1769                                         video_ids.append(video_id)
1770                                         already_seen.add(video_id)
1771                                         if len(video_ids) == n:
1772                                                 # Specified n videos reached
1773                                                 for id in video_ids:
1774                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1775                                                 return
1776
1777                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1778                                 for id in video_ids:
1779                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1780                                 return
1781
1782                         pagenum = pagenum + 1
1783
1784 class YahooSearchIE(InfoExtractor):
1785         """Information Extractor for Yahoo! Video search queries."""
1786         _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
1787         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1788         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1789         _MORE_PAGES_INDICATOR = r'\s*Next'
1790         _yahoo_ie = None
1791         _max_yahoo_results = 1000
1792
1793         def __init__(self, yahoo_ie, downloader=None):
1794                 InfoExtractor.__init__(self, downloader)
1795                 self._yahoo_ie = yahoo_ie
1796         
1797         @staticmethod
1798         def suitable(url):
1799                 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
1800
1801         def report_download_page(self, query, pagenum):
1802                 """Report attempt to download playlist page with given number."""
1803                 query = query.decode(preferredencoding())
1804                 self._downloader.to_stdout(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1805
1806         def _real_initialize(self):
1807                 self._yahoo_ie.initialize()
1808         
1809         def _real_extract(self, query):
1810                 mobj = re.match(self._VALID_QUERY, query)
1811                 if mobj is None:
1812                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1813                         return
1814
1815                 prefix, query = query.split(':')
1816                 prefix = prefix[8:]
1817                 query  = query.encode('utf-8')
1818                 if prefix == '':
1819                         self._download_n_results(query, 1)
1820                         return
1821                 elif prefix == 'all':
1822                         self._download_n_results(query, self._max_yahoo_results)
1823                         return
1824                 else:
1825                         try:
1826                                 n = long(prefix)
1827                                 if n <= 0:
1828                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1829                                         return
1830                                 elif n > self._max_yahoo_results:
1831                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)'  % (self._max_yahoo_results, n))
1832                                         n = self._max_yahoo_results
1833                                 self._download_n_results(query, n)
1834                                 return
1835                         except ValueError: # parsing prefix as integer fails
1836                                 self._download_n_results(query, 1)
1837                                 return
1838
1839         def _download_n_results(self, query, n):
1840                 """Downloads a specified number of results for a query"""
1841
1842                 video_ids = []
1843                 already_seen = set()
1844                 pagenum = 1
1845
1846                 while True:
1847                         self.report_download_page(query, pagenum)
1848                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1849                         request = urllib2.Request(result_url, None, std_headers)
1850                         try:
1851                                 page = urllib2.urlopen(request).read()
1852                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1853                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1854                                 return
1855
1856                         # Extract video identifiers
1857                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1858                                 video_id = mobj.group(1)
1859                                 if video_id not in already_seen:
1860                                         video_ids.append(video_id)
1861                                         already_seen.add(video_id)
1862                                         if len(video_ids) == n:
1863                                                 # Specified n videos reached
1864                                                 for id in video_ids:
1865                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1866                                                 return
1867
1868                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1869                                 for id in video_ids:
1870                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1871                                 return
1872
1873                         pagenum = pagenum + 1
1874
1875 class YoutubePlaylistIE(InfoExtractor):
1876         """Information Extractor for YouTube playlists."""
1877
1878         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/)([^&]+).*'
1879         _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
1880         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
1881         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1882         _youtube_ie = None
1883
1884         def __init__(self, youtube_ie, downloader=None):
1885                 InfoExtractor.__init__(self, downloader)
1886                 self._youtube_ie = youtube_ie
1887         
1888         @staticmethod
1889         def suitable(url):
1890                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
1891
1892         def report_download_page(self, playlist_id, pagenum):
1893                 """Report attempt to download playlist page with given number."""
1894                 self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1895
1896         def _real_initialize(self):
1897                 self._youtube_ie.initialize()
1898         
1899         def _real_extract(self, url):
1900                 # Extract playlist id
1901                 mobj = re.match(self._VALID_URL, url)
1902                 if mobj is None:
1903                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1904                         return
1905
1906                 # Download playlist pages
1907                 playlist_id = mobj.group(1)
1908                 video_ids = []
1909                 pagenum = 1
1910
1911                 while True:
1912                         self.report_download_page(playlist_id, pagenum)
1913                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
1914                         try:
1915                                 page = urllib2.urlopen(request).read()
1916                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1917                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1918                                 return
1919
1920                         # Extract video identifiers
1921                         ids_in_page = []
1922                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1923                                 if mobj.group(1) not in ids_in_page:
1924                                         ids_in_page.append(mobj.group(1))
1925                         video_ids.extend(ids_in_page)
1926
1927                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1928                                 break
1929                         pagenum = pagenum + 1
1930
1931                 for id in video_ids:
1932                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1933                 return
1934
1935 class YoutubeUserIE(InfoExtractor):
1936         """Information Extractor for YouTube users."""
1937
1938         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
1939         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1940         _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
1941         _youtube_ie = None
1942
1943         def __init__(self, youtube_ie, downloader=None):
1944                 InfoExtractor.__init__(self, downloader)
1945                 self._youtube_ie = youtube_ie
1946         
1947         @staticmethod
1948         def suitable(url):
1949                 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
1950
1951         def report_download_page(self, username):
1952                 """Report attempt to download user page."""
1953                 self._downloader.to_stdout(u'[youtube] user %s: Downloading page ' % (username))
1954
1955         def _real_initialize(self):
1956                 self._youtube_ie.initialize()
1957         
1958         def _real_extract(self, url):
1959                 # Extract username
1960                 mobj = re.match(self._VALID_URL, url)
1961                 if mobj is None:
1962                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1963                         return
1964
1965                 # Download user page
1966                 username = mobj.group(1)
1967                 video_ids = []
1968                 pagenum = 1
1969
1970                 self.report_download_page(username)
1971                 request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
1972                 try:
1973                         page = urllib2.urlopen(request).read()
1974                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1975                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1976                         return
1977
1978                 # Extract video identifiers
1979                 ids_in_page = []
1980
1981                 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1982                         if mobj.group(1) not in ids_in_page:
1983                                 ids_in_page.append(mobj.group(1))
1984                 video_ids.extend(ids_in_page)
1985
1986                 for id in video_ids:
1987                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1988                 return
1989
class PostProcessor(object):
	"""Base class for post-download processing steps.

	Instances are registered on a downloader via its
	add_post_processor() method. After each successful download the
	downloader walks its chain of PostProcessors, feeding the first one
	an initial information dictionary and each subsequent one the value
	returned by its predecessor.

	Processing stops as soon as a processor returns None, or when the
	end of the chain is reached.

	Registration is mutual, mirroring the scheme used by InfoExtractor
	objects.
	"""

	# Downloader this PP is attached to (set via __init__ or set_downloader)
	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Sets the downloader for this PP."""
		self._downloader = downloader
	
	def run(self, information):
		"""Run the PostProcessor.

		The "information" argument is a dictionary like the ones
		composed by InfoExtractors, extended with a "filepath" field
		pointing at the downloaded file.

		Returning None stops the postprocessing chain; returning a
		(possibly modified) information dictionary passes it on to the
		next processor. This method may also raise a
		PostProcessingError, which the calling downloader handles.
		"""
		# Default behaviour: pass the information through untouched.
		return information
2035         
### MAIN PROGRAM ###
if __name__ == '__main__':
	try:
		# Modules needed only when running the main program
		import getpass
		import optparse

		# Function to update the program file with the latest version from bitbucket.org
		def update_self(downloader, filename):
			# Note: downloader only used for options
			# NOTE(review): the running script file is rewritten in place in
			# text mode ('w'); confirm this is acceptable on all platforms.
			if not os.access (filename, os.W_OK):
				sys.exit('ERROR: no write permissions on %s' % filename)

			downloader.to_stdout('Updating to latest stable version...')
			latest_url = 'http://bitbucket.org/rg3/youtube-dl/raw/tip/LATEST_VERSION'
			latest_version = urllib.urlopen(latest_url).read().strip()
			prog_url = 'http://bitbucket.org/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
			newcontent = urllib.urlopen(prog_url).read()
			stream = open(filename, 'w')
			stream.write(newcontent)
			stream.close()
			downloader.to_stdout('Updated to version %s' % latest_version)

		# General configuration
		# NOTE(review): the second install_opener call replaces the first;
		# build_opener is presumably adding a ProxyHandler by default so proxy
		# support is kept — verify against the urllib2 docs.
		urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
		urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
		socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

		# Parse command line
		parser = optparse.OptionParser(
			usage='Usage: %prog [options] url...',
			version='2010.07.22',
			conflict_handler='resolve',
		)

		# conflict_handler='resolve' above lets -h/-v below override the
		# builtin optparse help/version options.
		parser.add_option('-h', '--help',
				action='help', help='print this help text and exit')
		parser.add_option('-v', '--version',
				action='version', help='print program version and exit')
		parser.add_option('-U', '--update',
				action='store_true', dest='update_self', help='update this program to latest stable version')
		parser.add_option('-i', '--ignore-errors',
				action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
		parser.add_option('-r', '--rate-limit',
				dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
		parser.add_option('-R', '--retries',
				dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)

		authentication = optparse.OptionGroup(parser, 'Authentication Options')
		authentication.add_option('-u', '--username',
				dest='username', metavar='USERNAME', help='account username')
		authentication.add_option('-p', '--password',
				dest='password', metavar='PASSWORD', help='account password')
		authentication.add_option('-n', '--netrc',
				action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
		parser.add_option_group(authentication)

		video_format = optparse.OptionGroup(parser, 'Video Format Options')
		video_format.add_option('-f', '--format',
				action='store', dest='format', metavar='FORMAT', help='video format code')
		video_format.add_option('-m', '--mobile-version',
				action='store_const', dest='format', help='alias for -f 17', const='17')
		video_format.add_option('--all-formats',
				action='store_const', dest='format', help='download all available video formats', const='-1')
		video_format.add_option('--max-quality',
				action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
		parser.add_option_group(video_format)

		verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
		verbosity.add_option('-q', '--quiet',
				action='store_true', dest='quiet', help='activates quiet mode', default=False)
		verbosity.add_option('-s', '--simulate',
				action='store_true', dest='simulate', help='do not download video', default=False)
		verbosity.add_option('-g', '--get-url',
				action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
		verbosity.add_option('-e', '--get-title',
				action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
		verbosity.add_option('--get-thumbnail',
				action='store_true', dest='getthumbnail', help='simulate, quiet but print thumbnail URL', default=False)
		verbosity.add_option('--get-description',
				action='store_true', dest='getdescription', help='simulate, quiet but print video description', default=False)
		verbosity.add_option('--no-progress',
				action='store_true', dest='noprogress', help='do not print progress bar', default=False)
		parser.add_option_group(verbosity)

		filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
		filesystem.add_option('-t', '--title',
				action='store_true', dest='usetitle', help='use title in file name', default=False)
		filesystem.add_option('-l', '--literal',
				action='store_true', dest='useliteral', help='use literal title in file name', default=False)
		filesystem.add_option('-o', '--output',
				dest='outtmpl', metavar='TEMPLATE', help='output filename template')
		filesystem.add_option('-a', '--batch-file',
				dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
		filesystem.add_option('-w', '--no-overwrites',
				action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
		filesystem.add_option('-c', '--continue',
				action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
		parser.add_option_group(filesystem)

		(opts, args) = parser.parse_args()

		# Batch file verification: read URLs from the batch file (or stdin
		# for '-'), dropping blank lines.
		batchurls = []
		if opts.batchfile is not None:
			try:
				if opts.batchfile == '-':
					batchfd = sys.stdin
				else:
					batchfd = open(opts.batchfile, 'r')
				batchurls = batchfd.readlines()
				batchurls = [x.strip() for x in batchurls]
				batchurls = [x for x in batchurls if len(x) > 0]
			except IOError:
				sys.exit(u'ERROR: batch file could not be read')
		all_urls = batchurls + args

		# Conflicting, missing and erroneous options
		if opts.usenetrc and (opts.username is not None or opts.password is not None):
			parser.error(u'using .netrc conflicts with giving username/password')
		if opts.password is not None and opts.username is None:
			parser.error(u'account username missing')
		if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
			parser.error(u'using output template conflicts with using title or literal title')
		if opts.usetitle and opts.useliteral:
			parser.error(u'using title conflicts with using literal title')
		if opts.username is not None and opts.password is None:
			# Prompt interactively rather than requiring -p on the command line.
			opts.password = getpass.getpass(u'Type account password and press return:')
		if opts.ratelimit is not None:
			numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
			if numeric_limit is None:
				parser.error(u'invalid rate limit specified')
			opts.ratelimit = numeric_limit
		if opts.retries is not None:
			try:
				opts.retries = long(opts.retries)
			except (TypeError, ValueError), err:
				parser.error(u'invalid retry count specified')

		# Information extractors
		youtube_ie = YoutubeIE()
		metacafe_ie = MetacafeIE(youtube_ie)
		dailymotion_ie = DailymotionIE()
		youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
		youtube_user_ie = YoutubeUserIE(youtube_ie)
		youtube_search_ie = YoutubeSearchIE(youtube_ie)
		google_ie = GoogleIE()
		google_search_ie = GoogleSearchIE(google_ie)
		photobucket_ie = PhotobucketIE()
		yahoo_ie = YahooIE()
		yahoo_search_ie = YahooSearchIE(yahoo_ie)
		generic_ie = GenericIE()

		# File downloader
		fd = FileDownloader({
			'usenetrc': opts.usenetrc,
			'username': opts.username,
			'password': opts.password,
			# Any of the "get/print only" options implies quiet + simulate.
			'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
			'forceurl': opts.geturl,
			'forcetitle': opts.gettitle,
			'forcethumbnail': opts.getthumbnail,
			'forcedescription': opts.getdescription,
			'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
			'format': opts.format,
			'format_limit': opts.format_limit,
			# Output template: an explicit -o wins; otherwise the first
			# matching default in this or-chain is used, ending with the
			# plain '%(id)s.%(ext)s' fallback.
			'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
				or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
				or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
				or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
				or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
				or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
				or u'%(id)s.%(ext)s'),
			'ignoreerrors': opts.ignoreerrors,
			'ratelimit': opts.ratelimit,
			'nooverwrites': opts.nooverwrites,
			'retries': opts.retries,
			'continuedl': opts.continue_dl,
			'noprogress': opts.noprogress,
			})
		fd.add_info_extractor(youtube_search_ie)
		fd.add_info_extractor(youtube_pl_ie)
		fd.add_info_extractor(youtube_user_ie)
		fd.add_info_extractor(metacafe_ie)
		fd.add_info_extractor(dailymotion_ie)
		fd.add_info_extractor(youtube_ie)
		fd.add_info_extractor(google_ie)
		fd.add_info_extractor(google_search_ie)
		fd.add_info_extractor(photobucket_ie)
		fd.add_info_extractor(yahoo_ie)
		fd.add_info_extractor(yahoo_search_ie)

		# This must come last since it's the
		# fallback if none of the others work
		fd.add_info_extractor(generic_ie)

		# Update version
		if opts.update_self:
			update_self(fd, sys.argv[0])

		# Maybe do nothing
		if len(all_urls) < 1:
			if not opts.update_self:
				parser.error(u'you must provide at least one URL')
			else:
				sys.exit()
		# Download all URLs and exit with the downloader's return code.
		retcode = fd.download(all_urls)
		sys.exit(retcode)

	except DownloadError:
		sys.exit(1)
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')