8cfa6dfd622d81e8ac0453af7fe2b63c82810bb7
[youtube-dl] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # License: Public domain code
7 import htmlentitydefs
8 import httplib
9 import locale
10 import math
11 import netrc
12 import os
13 import os.path
14 import re
15 import socket
16 import string
17 import subprocess
18 import sys
19 import time
20 import urllib
21 import urllib2
22
23 # parse_qs was moved from the cgi module to the urlparse module recently.
24 try:
25         from urlparse import parse_qs
26 except ImportError:
27         from cgi import parse_qs
28
# Default HTTP headers sent with every request, imitating a desktop Firefox
# browser (some video sites serve different content to unknown user agents).
std_headers = {
	'User-Agent': 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.7) Gecko/20100720 Firefox/3.6.7',
	'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
	'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
	'Accept-Language': 'en-us,en;q=0.5',
}
35
# Characters kept in "simplified" titles: ASCII letters and digits, as a
# unicode string (this is Python 2, where str objects have .decode()).
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
37
def preferredencoding():
	"""Get preferred encoding.

	Returns the best encoding scheme for the system, based on
	locale.getpreferredencoding() and some further tweaks: if the locale
	module reports an encoding that cannot actually encode text, fall
	back to UTF-8.
	"""
	# The original implementation wrapped this in a one-shot generator and a
	# bare "except:"; a plain try/except with a narrowed exception class is
	# equivalent and no longer swallows KeyboardInterrupt/SystemExit.
	try:
		pref = locale.getpreferredencoding()
		u'TEST'.encode(pref)
	except Exception:
		pref = 'UTF-8'
	return pref
53
54 def htmlentity_transform(matchobj):
55         """Transforms an HTML entity to a Unicode character.
56         
57         This function receives a match object and is intended to be used with
58         the re.sub() function.
59         """
60         entity = matchobj.group(1)
61
62         # Known non-numeric HTML entity
63         if entity in htmlentitydefs.name2codepoint:
64                 return unichr(htmlentitydefs.name2codepoint[entity])
65
66         # Unicode character
67         mobj = re.match(ur'(?u)#(x?\d+)', entity)
68         if mobj is not None:
69                 numstr = mobj.group(1)
70                 if numstr.startswith(u'x'):
71                         base = 16
72                         numstr = u'0%s' % numstr
73                 else:
74                         base = 10
75                 return unichr(long(numstr, base))
76
77         # Unknown entity in name, return its literal representation
78         return (u'&%s;' % entity)
79
80 def sanitize_title(utitle):
81         """Sanitizes a video title so it could be used as part of a filename."""
82         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
83         return utitle.replace(unicode(os.sep), u'%')
84
85 def sanitize_open(filename, open_mode):
86         """Try to open the given filename, and slightly tweak it if this fails.
87
88         Attempts to open the given filename. If this fails, it tries to change
89         the filename slightly, step by step, until it's either able to open it
90         or it fails and raises a final exception, like the standard open()
91         function.
92
93         It returns the tuple (stream, definitive_file_name).
94         """
95         try:
96                 if filename == u'-':
97                         return (sys.stdout, filename)
98                 stream = open(filename, open_mode)
99                 return (stream, filename)
100         except (IOError, OSError), err:
101                 # In case of error, try to remove win32 forbidden chars
102                 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
103
104                 # An exception here should be caught in the caller
105                 stream = open(filename, open_mode)
106                 return (stream, filename)
107
108
class DownloadError(Exception):
	"""Download Error exception.

	This exception may be thrown by FileDownloader objects if they are not
	configured to continue on errors. They will contain the appropriate
	error message (the same one previously printed to stderr).
	"""
	pass
117
class SameFileError(Exception):
	"""Same File exception.

	This exception will be thrown by FileDownloader objects if they detect
	multiple files would have to be downloaded to the same file on disk
	(i.e. several URLs with a fixed output template).
	"""
	pass
125
class PostProcessingError(Exception):
	"""Post Processing exception.

	This exception may be raised by PostProcessor's .run() method to
	indicate an error in the postprocessing task; the downloader reports
	it and aborts the postprocessing chain.
	"""
	pass
133
class UnavailableVideoError(Exception):
	"""Unavailable Format exception.

	This exception will be thrown when a video is requested
	in a format that is not available for that video (or the
	video data itself cannot be retrieved).
	"""
	pass
141
class ContentTooShortError(Exception):
	"""Content Too Short exception.

	This exception may be raised by FileDownloader objects when a file they
	download is too small for what the server announced first, indicating
	the connection was probably interrupted.
	"""
	# Both in bytes
	downloaded = None  # bytes actually received
	expected = None    # bytes announced by the server (Content-Length)

	def __init__(self, downloaded, expected):
		self.downloaded = downloaded
		self.expected = expected
156
157 class FileDownloader(object):
158         """File Downloader class.
159
160         File downloader objects are the ones responsible of downloading the
161         actual video file and writing it to disk if the user has requested
162         it, among some other tasks. In most cases there should be one per
163         program. As, given a video URL, the downloader doesn't know how to
164         extract all the needed information, task that InfoExtractors do, it
165         has to pass the URL to one of them.
166
167         For this, file downloader objects have a method that allows
168         InfoExtractors to be registered in a given order. When it is passed
169         a URL, the file downloader handles it to the first InfoExtractor it
170         finds that reports being able to handle it. The InfoExtractor extracts
171         all the information about the video or videos the URL refers to, and
172         asks the FileDownloader to process the video information, possibly
173         downloading the video.
174
175         File downloaders accept a lot of parameters. In order not to saturate
176         the object constructor with arguments, it receives a dictionary of
177         options instead. These options are available through the params
178         attribute for the InfoExtractors to use. The FileDownloader also
179         registers itself as the downloader in charge for the InfoExtractors
180         that are added to it, so this is a "mutual registration".
181
182         Available options:
183
184         username:       Username for authentication purposes.
185         password:       Password for authentication purposes.
186         usenetrc:       Use netrc for authentication instead.
187         quiet:          Do not print messages to stdout.
188         forceurl:       Force printing final URL.
189         forcetitle:     Force printing title.
190         simulate:       Do not download the video files.
191         format:         Video format code.
192         format_limit:   Highest quality format to try.
193         outtmpl:        Template for output names.
194         ignoreerrors:   Do not stop on download errors.
195         ratelimit:      Download speed limit, in bytes/sec.
196         nooverwrites:   Prevent overwriting files.
197         retries:        Number of times to retry for HTTP error 503
198         continuedl:     Try to continue downloads if possible.
199         noprogress:     Do not print the progress bar.
200         """
201
202         params = None
203         _ies = []
204         _pps = []
205         _download_retcode = None
206         _num_downloads = None
207
208         def __init__(self, params):
209                 """Create a FileDownloader object with the given options."""
210                 self._ies = []
211                 self._pps = []
212                 self._download_retcode = 0
213                 self._num_downloads = 0
214                 self.params = params
215         
216         @staticmethod
217         def pmkdir(filename):
218                 """Create directory components in filename. Similar to Unix "mkdir -p"."""
219                 components = filename.split(os.sep)
220                 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
221                 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
222                 for dir in aggregate:
223                         if not os.path.exists(dir):
224                                 os.mkdir(dir)
225         
226         @staticmethod
227         def format_bytes(bytes):
228                 if bytes is None:
229                         return 'N/A'
230                 if type(bytes) is str:
231                         bytes = float(bytes)
232                 if bytes == 0.0:
233                         exponent = 0
234                 else:
235                         exponent = long(math.log(bytes, 1024.0))
236                 suffix = 'bkMGTPEZY'[exponent]
237                 converted = float(bytes) / float(1024**exponent)
238                 return '%.2f%s' % (converted, suffix)
239
240         @staticmethod
241         def calc_percent(byte_counter, data_len):
242                 if data_len is None:
243                         return '---.-%'
244                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
245
246         @staticmethod
247         def calc_eta(start, now, total, current):
248                 if total is None:
249                         return '--:--'
250                 dif = now - start
251                 if current == 0 or dif < 0.001: # One millisecond
252                         return '--:--'
253                 rate = float(current) / dif
254                 eta = long((float(total) - float(current)) / rate)
255                 (eta_mins, eta_secs) = divmod(eta, 60)
256                 if eta_mins > 99:
257                         return '--:--'
258                 return '%02d:%02d' % (eta_mins, eta_secs)
259
260         @staticmethod
261         def calc_speed(start, now, bytes):
262                 dif = now - start
263                 if bytes == 0 or dif < 0.001: # One millisecond
264                         return '%10s' % '---b/s'
265                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
266
267         @staticmethod
268         def best_block_size(elapsed_time, bytes):
269                 new_min = max(bytes / 2.0, 1.0)
270                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
271                 if elapsed_time < 0.001:
272                         return long(new_max)
273                 rate = bytes / elapsed_time
274                 if rate > new_max:
275                         return long(new_max)
276                 if rate < new_min:
277                         return long(new_min)
278                 return long(rate)
279
280         @staticmethod
281         def parse_bytes(bytestr):
282                 """Parse a string indicating a byte quantity into a long integer."""
283                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
284                 if matchobj is None:
285                         return None
286                 number = float(matchobj.group(1))
287                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
288                 return long(round(number * multiplier))
289
	@staticmethod
	def verify_url(url):
		"""Verify a URL is valid and data could be downloaded. Return real data URL."""
		request = urllib2.Request(url, None, std_headers)
		data = urllib2.urlopen(request)
		# Reading a single byte forces the transfer to actually start, so a
		# dead link raises here instead of later.
		data.read(1)
		# geturl() yields the final URL after any redirects.
		url = data.geturl()
		data.close()
		return url
299
300         def add_info_extractor(self, ie):
301                 """Add an InfoExtractor object to the end of the list."""
302                 self._ies.append(ie)
303                 ie.set_downloader(self)
304         
305         def add_post_processor(self, pp):
306                 """Add a PostProcessor object to the end of the chain."""
307                 self._pps.append(pp)
308                 pp.set_downloader(self)
309         
	def to_stdout(self, message, skip_eol=False, ignore_encoding_errors=False):
		"""Print message to stdout if not in quiet mode.

		The message is encoded to the system's preferred encoding;
		UnicodeEncodeError is re-raised unless ignore_encoding_errors is set.
		"""
		try:
			if not self.params.get('quiet', False):
				# The trailing comma suppresses print's own newline; the
				# newline is appended manually unless skip_eol is True.
				print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(preferredencoding()),
			sys.stdout.flush()
		except (UnicodeEncodeError), err:
			if not ignore_encoding_errors:
				raise
319         
	def to_stderr(self, message):
		"""Print message to stderr, encoded to the system's preferred encoding."""
		print >>sys.stderr, message.encode(preferredencoding())
323         
324         def fixed_template(self):
325                 """Checks if the output template is fixed."""
326                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
327
328         def trouble(self, message=None):
329                 """Determine action to take when a download problem appears.
330
331                 Depending on if the downloader has been configured to ignore
332                 download errors or not, this method may throw an exception or
333                 not when errors are found, after printing the message.
334                 """
335                 if message is not None:
336                         self.to_stderr(message)
337                 if not self.params.get('ignoreerrors', False):
338                         raise DownloadError(message)
339                 self._download_retcode = 1
340
341         def slow_down(self, start_time, byte_counter):
342                 """Sleep if the download speed is over the rate limit."""
343                 rate_limit = self.params.get('ratelimit', None)
344                 if rate_limit is None or byte_counter == 0:
345                         return
346                 now = time.time()
347                 elapsed = now - start_time
348                 if elapsed <= 0.0:
349                         return
350                 speed = float(byte_counter) / elapsed
351                 if speed > rate_limit:
352                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
353
354         def report_destination(self, filename):
355                 """Report destination filename."""
356                 self.to_stdout(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
357         
358         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
359                 """Report download progress."""
360                 if self.params.get('noprogress', False):
361                         return
362                 self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
363                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
364
	def report_resuming_byte(self, resume_len):
		"""Report attempt to resume at given byte."""
		self.to_stdout(u'[download] Resuming download at byte %s' % resume_len)
368         
369         def report_retry(self, count, retries):
370                 """Report retry in case of HTTP error 503"""
371                 self.to_stdout(u'[download] Got HTTP error 503. Retrying (attempt %d of %d)...' % (count, retries))
372         
373         def report_file_already_downloaded(self, file_name):
374                 """Report file has already been fully downloaded."""
375                 try:
376                         self.to_stdout(u'[download] %s has already been downloaded' % file_name)
377                 except (UnicodeEncodeError), err:
378                         self.to_stdout(u'[download] The file has already been downloaded')
379         
380         def report_unable_to_resume(self):
381                 """Report it was impossible to resume download."""
382                 self.to_stdout(u'[download] Unable to resume')
383         
384         def report_finish(self):
385                 """Report download finished."""
386                 if self.params.get('noprogress', False):
387                         self.to_stdout(u'[download] Download completed')
388                 else:
389                         self.to_stdout(u'')
390         
391         def increment_downloads(self):
392                 """Increment the ordinal that assigns a number to each file."""
393                 self._num_downloads += 1
394
	def process_info(self, info_dict):
		"""Process a single dictionary returned by an InfoExtractor.

		In simulate mode only verifies/prints the requested fields. Otherwise
		builds the output file name from the 'outtmpl' template, creates the
		directories, downloads the data and runs the postprocessing chain.

		Raises UnavailableVideoError when the video data cannot be retrieved.
		"""
		# Do nothing else if in simulate mode
		if self.params.get('simulate', False):
			# Verify URL if it's an HTTP one
			if info_dict['url'].startswith('http'):
				try:
					self.verify_url(info_dict['url'].encode('utf-8')).decode('utf-8')
				except (OSError, IOError, urllib2.URLError, httplib.HTTPException, socket.error), err:
					raise UnavailableVideoError

			# Forced printings
			if self.params.get('forcetitle', False):
				print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forceurl', False):
				print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
				print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forcedescription', False) and 'description' in info_dict:
				print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')

			return

		try:
			template_dict = dict(info_dict)
			# 'epoch' and 'ord' are extra template fields: the download
			# timestamp and a zero-padded per-run download ordinal.
			template_dict['epoch'] = unicode(long(time.time()))
			template_dict['ord'] = unicode('%05d' % self._num_downloads)
			filename = self.params['outtmpl'] % template_dict
		except (ValueError, KeyError), err:
			self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
		if self.params.get('nooverwrites', False) and os.path.exists(filename):
			self.to_stderr(u'WARNING: file exists: %s; skipping' % filename)
			return

		try:
			self.pmkdir(filename)
		except (OSError, IOError), err:
			self.trouble('ERROR: unable to create directories: %s' % str(err))
			return

		try:
			success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
		except (OSError, IOError), err:
			# Local I/O problems mean the video could not be stored.
			raise UnavailableVideoError
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self.trouble('ERROR: unable to download video data: %s' % str(err))
			return
		except (ContentTooShortError, ), err:
			self.trouble('ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
			return

		if success:
			try:
				self.post_process(filename, info_dict)
			except (PostProcessingError), err:
				self.trouble('ERROR: postprocessing: %s' % str(err))
				return
452
453         def download(self, url_list):
454                 """Download a given list of URLs."""
455                 if len(url_list) > 1 and self.fixed_template():
456                         raise SameFileError(self.params['outtmpl'])
457
458                 for url in url_list:
459                         suitable_found = False
460                         for ie in self._ies:
461                                 # Go to next InfoExtractor if not suitable
462                                 if not ie.suitable(url):
463                                         continue
464
465                                 # Suitable InfoExtractor found
466                                 suitable_found = True
467
468                                 # Extract information from URL and process it
469                                 ie.extract(url)
470
471                                 # Suitable InfoExtractor had been found; go to next URL
472                                 break
473
474                         if not suitable_found:
475                                 self.trouble('ERROR: no suitable InfoExtractor: %s' % url)
476
477                 return self._download_retcode
478
479         def post_process(self, filename, ie_info):
480                 """Run the postprocessing chain on the given file."""
481                 info = dict(ie_info)
482                 info['filepath'] = filename
483                 for pp in self._pps:
484                         info = pp.run(info)
485                         if info is None:
486                                 break
487         
	def _download_with_rtmpdump(self, filename, url, player_url):
		"""Download an RTMP stream by shelling out to the rtmpdump program.

		Returns True on success, False on failure (reported via trouble()).
		"""
		self.report_destination(filename)

		# Check for rtmpdump first
		try:
			subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
		except (OSError, IOError):
			self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
			return False

		# Download using rtmpdump. rtmpdump returns exit code 2 when
		# the connection was interrumpted and resuming appears to be
		# possible. This is part of rtmpdump's normal usage, AFAIK.
		basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', filename]
		retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
		while retval == 2 or retval == 1:
			prevsize = os.path.getsize(filename)
			self.to_stdout(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
			time.sleep(5.0) # This seems to be needed
			# Resume with '-e'; '-k 1' only after exit code 1.
			retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
			cursize = os.path.getsize(filename)
			# No progress and exit code 1 means retrying is pointless.
			if prevsize == cursize and retval == 1:
				break
		if retval == 0:
			self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename))
			return True
		else:
			self.trouble('\nERROR: rtmpdump exited with code %d' % retval)
			return False
517
	def _do_download(self, filename, url, player_url):
		"""Download *url* to *filename* over HTTP (or delegate RTMP URLs).

		Supports resuming partial downloads via a Range header, retrying on
		HTTP 503, rate limiting and adaptive block sizes. Returns True on
		success; raises ContentTooShortError if fewer bytes than announced
		were received.
		"""
		# Attempt to download using rtmpdump
		if url.startswith('rtmp'):
			return self._download_with_rtmpdump(filename, url, player_url)

		stream = None
		open_mode = 'wb'
		# basic_request has no Range header; it is used as a fallback when
		# the server rejects the resume request.
		basic_request = urllib2.Request(url, None, std_headers)
		request = urllib2.Request(url, None, std_headers)

		# Establish possible resume length
		if os.path.isfile(filename):
			resume_len = os.path.getsize(filename)
		else:
			resume_len = 0

		# Request parameters in case of being able to resume
		if self.params.get('continuedl', False) and resume_len != 0:
			self.report_resuming_byte(resume_len)
			request.add_header('Range','bytes=%d-' % resume_len)
			open_mode = 'ab'

		count = 0
		retries = self.params.get('retries', 0)
		while True:
			# Establish connection
			try:
				data = urllib2.urlopen(request)
				break
			except (urllib2.HTTPError, ), err:
				if err.code == 503:
					# Retry in case of HTTP error 503
					count += 1
					if count <= retries:
						self.report_retry(count, retries)
						continue
				if err.code != 416: #  416 is 'Requested range not satisfiable'
					raise
				# Unable to resume (the Range request failed); retry without
				# the Range header to find out why.
				data = urllib2.urlopen(basic_request)
				content_length = data.info()['Content-Length']

				if content_length is not None and long(content_length) == resume_len:
					# Because the file had already been fully downloaded
					self.report_file_already_downloaded(filename)
					return True
				else:
					# Because the server didn't let us
					self.report_unable_to_resume()
					open_mode = 'wb'

		# Content-length is kept as a string; it is compared against
		# str(byte_counter) at the end.
		data_len = data.info().get('Content-length', None)
		data_len_str = self.format_bytes(data_len)
		byte_counter = 0
		block_size = 1024
		start = time.time()
		while True:
			# Download and write
			before = time.time()
			data_block = data.read(block_size)
			after = time.time()
			data_block_len = len(data_block)
			if data_block_len == 0:
				break
			byte_counter += data_block_len

			# Open file just in time, so an early network failure leaves no
			# empty file behind
			if stream is None:
				try:
					(stream, filename) = sanitize_open(filename, open_mode)
					self.report_destination(filename)
				except (OSError, IOError), err:
					self.trouble('ERROR: unable to open for writing: %s' % str(err))
					return False
			try:
				stream.write(data_block)
			except (IOError, OSError), err:
				self.trouble('\nERROR: unable to write data: %s' % str(err))
			# Adapt the block size to the observed throughput.
			block_size = self.best_block_size(after - before, data_block_len)

			# Progress message
			percent_str = self.calc_percent(byte_counter, data_len)
			eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
			speed_str = self.calc_speed(start, time.time(), byte_counter)
			self.report_progress(percent_str, data_len_str, speed_str, eta_str)

			# Apply rate limit
			self.slow_down(start, byte_counter)

		self.report_finish()
		if data_len is not None and str(byte_counter) != data_len:
			raise ContentTooShortError(byte_counter, long(data_len))
		return True
611
class InfoExtractor(object):
	"""Information Extractor class.

	Given a URL, an information extractor produces a dictionary describing
	the video (or videos) the URL refers to, and asks its FileDownloader to
	process that information, possibly downloading the file. Dictionaries
	must include the following fields:

	id:             Video identifier.
	url:            Final video URL.
	uploader:       Nickname of the video uploader.
	title:          Literal title.
	stitle:         Simplified title.
	ext:            Video filename extension.
	format:         Video format.
	player_url:     SWF Player URL (may be None).

	The following fields are optional. Their primary purpose is to allow
	youtube-dl to serve as the backend for a video search function, such
	as the one in youtube2mp3.  They are only used when their respective
	forced printing functions are called:

	thumbnail:      Full URL to a video thumbnail image.
	description:    One-line video description.

	Subclasses should re-define the _real_initialize() and _real_extract()
	methods, as well as the suitable() static method. They should also be
	instantiated and added to the main downloader.
	"""

	_ready = False        # whether _real_initialize() has run
	_downloader = None    # the FileDownloader this IE reports to

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		self._ready = False
		self.set_downloader(downloader)

	@staticmethod
	def suitable(url):
		"""Receives a URL and returns True if suitable for this IE."""
		return False

	def initialize(self):
		"""Initializes an instance (authentication, etc)."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Extracts URL information and returns it in list of dicts."""
		self.initialize()
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Sets the downloader for this IE."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass
682
class YoutubeIE(InfoExtractor):
	"""Information extractor for youtube.com."""

	# Group 1 captures the URL prefix (scheme/host/path up to the id); group 2
	# is the video id itself. The conditional (?(1).+)?$ only permits trailing
	# characters when a prefix matched, so a bare id must match exactly.
	_VALID_URL = r'^((?:http://)?(?:youtu\.be/|(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:_popup)?(?:\.php)?)?[\?#](?:.+&)?v=))))?([0-9A-Za-z_-]+)(?(1).+)?$'
	_LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
	_LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
	_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
	# Machine name looked up in the user's .netrc file for credentials.
	_NETRC_MACHINE = 'youtube'
	# Listed in order of quality
	_available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
	# Maps format code to filename extension; codes not listed here fall
	# back to 'flv' in _real_extract().
	_video_extensions = {
		'13': '3gp',
		'17': 'mp4',
		'18': 'mp4',
		'22': 'mp4',
		'37': 'mp4',
		'38': 'video', # You actually don't know if this will be MOV, AVI or whatever
		'43': 'webm',
		'45': 'webm',
	}

	@staticmethod
	def suitable(url):
		"""Receives a URL and returns True if suitable for this IE."""
		return (re.match(YoutubeIE._VALID_URL, url) is not None)

	def report_lang(self):
		"""Report attempt to set language."""
		self._downloader.to_stdout(u'[youtube] Setting language')

	def report_login(self):
		"""Report attempt to log in."""
		self._downloader.to_stdout(u'[youtube] Logging in')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_stdout(u'[youtube] Confirming age')

	def report_video_webpage_download(self, video_id):
		"""Report attempt to download video webpage."""
		self._downloader.to_stdout(u'[youtube] %s: Downloading video webpage' % video_id)

	def report_video_info_webpage_download(self, video_id):
		"""Report attempt to download video info webpage."""
		self._downloader.to_stdout(u'[youtube] %s: Downloading video info webpage' % video_id)

	def report_information_extraction(self, video_id):
		"""Report attempt to extract video information."""
		self._downloader.to_stdout(u'[youtube] %s: Extracting video information' % video_id)

	def report_unavailable_format(self, video_id, format):
		"""Report that the requested format is not available."""
		self._downloader.to_stdout(u'[youtube] %s: Format %s not available' % (video_id, format))

	def report_rtmp_download(self):
		"""Indicate the download will use the RTMP protocol."""
		self._downloader.to_stdout(u'[youtube] RTMP download detected')

	def _real_initialize(self):
		"""Set language to English, optionally log in, and confirm age.

		Credentials come from the downloader's 'username'/'password'
		params or, if 'usenetrc' is set, from the .netrc entry for
		machine 'youtube'. Language and login failures are reported as
		warnings (and abort initialization); a failed age confirmation
		is reported through trouble().
		"""
		if self._downloader is None:
			return

		username = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					username = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		# Set language
		request = urllib2.Request(self._LANG_URL, None, std_headers)
		try:
			self.report_lang()
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
			return

		# No authentication to be performed
		if username is None:
			return

		# Log in
		login_form = {
				'current_form': 'loginForm',
				'next':		'/',
				'action_login':	'Log In',
				'username':	username,
				'password':	password,
				}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# If the login form is still present in the response, the
			# credentials were rejected.
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return

		# Confirm age
		age_form = {
				'next_url':		'/',
				'action_confirm':	'Confirm',
				}
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
		try:
			self.report_age_confirmation()
			age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return

	def _real_extract(self, url):
		"""Extract and process information for the video at 'url'.

		Queries get_video_info (retrying several 'el' page types until
		one yields a token), selects formats according to the 'format'
		and 'format_limit' downloader params, and hands one info
		dictionary per selected format to the downloader.
		"""
		# Extract video id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		# Group 2 of _VALID_URL is the bare video id.
		video_id = mobj.group(2)

		# Get video webpage
		self.report_video_webpage_download(video_id)
		request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en' % video_id, None, std_headers)
		try:
			video_webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
			return

		# Attempt to extract SWF player URL (optional; may legitimately be None)
		mobj = re.search(r'swfConfig.*"(http://.*?watch.*?-.*?\.swf)"', video_webpage)
		if mobj is not None:
			player_url = mobj.group(1)
		else:
			player_url = None

		# Get video info
		# Some page types only return a 'token' for certain videos, so try
		# each el= variant in turn until one succeeds.
		self.report_video_info_webpage_download(video_id)
		for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
			video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
					   % (video_id, el_type))
			request = urllib2.Request(video_info_url, None, std_headers)
			try:
				video_info_webpage = urllib2.urlopen(request).read()
				video_info = parse_qs(video_info_webpage)
				if 'token' in video_info:
					break
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
				return
		if 'token' not in video_info:
			if 'reason' in video_info:
				self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
			else:
				self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
			return

		# Start extracting information
		self.report_information_extraction(video_id)

		# uploader
		if 'author' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = urllib.unquote_plus(video_info['author'][0])

		# title
		if 'title' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract video title')
			return
		video_title = urllib.unquote_plus(video_info['title'][0])
		video_title = video_title.decode('utf-8')
		video_title = sanitize_title(video_title)

		# simplified title: collapse runs of non-alphanumerics into '_',
		# then drop leading/trailing underscores
		simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
		simple_title = simple_title.strip(ur'_')

		# thumbnail image
		if 'thumbnail_url' not in video_info:
			self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
			video_thumbnail = ''
		else:	# don't panic if we can't find it
			video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

		# description (only fetched when the user asked to print it)
		video_description = 'No description available.'
		if self._downloader.params.get('forcedescription', False):
			mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
			if mobj is not None:
				video_description = mobj.group(1)

		# token
		video_token = urllib.unquote_plus(video_info['token'][0])

		# Decide which formats to download
		requested_format = self._downloader.params.get('format', None)
		get_video_template = 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=&ps=&asv=&fmt=%%s' % (video_id, video_token)

		if 'fmt_url_map' in video_info:
			# fmt_url_map is a comma-separated list of "format|url" pairs.
			url_map = dict(tuple(pair.split('|')) for pair in video_info['fmt_url_map'][0].split(','))
			format_limit = self._downloader.params.get('format_limit', None)
			if format_limit is not None and format_limit in self._available_formats:
				# Restrict to formats at or below the requested quality cap.
				format_list = self._available_formats[self._available_formats.index(format_limit):]
			else:
				format_list = self._available_formats
			existing_formats = [x for x in format_list if x in url_map]
			if len(existing_formats) == 0:
				self._downloader.trouble(u'ERROR: no known formats available for video')
				return
			if requested_format is None:
				video_url_list = [(existing_formats[0], get_video_template % existing_formats[0])] # Best quality
			elif requested_format == '-1':
				video_url_list = [(f, get_video_template % f) for f in existing_formats] # All formats
			else:
				video_url_list = [(requested_format, get_video_template % requested_format)] # Specific format

		elif 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
			self.report_rtmp_download()
			video_url_list = [(None, video_info['conn'][0])]

		else:
			self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')
			return

		for format_param, video_real_url in video_url_list:
			# At this point we have a new video
			self._downloader.increment_downloads()

			# Extension
			video_extension = self._video_extensions.get(format_param, 'flv')

			# Find the video URL in fmt_url_map or conn parameters
			try:
				# Process video information
				self._downloader.process_info({
					'id':		video_id.decode('utf-8'),
					'url':		video_real_url.decode('utf-8'),
					'uploader':	video_uploader.decode('utf-8'),
					'title':	video_title,
					'stitle':	simple_title,
					'ext':		video_extension.decode('utf-8'),
					# Pre-ternary idiom: u'NA' when format_param is None.
					'format':	(format_param is None and u'NA' or format_param.decode('utf-8')),
					'thumbnail':	video_thumbnail.decode('utf-8'),
					'description':	video_description.decode('utf-8'),
					'player_url':	player_url,
				})
			except UnavailableVideoError, err:
				self._downloader.trouble(u'ERROR: unable to download video (format may not be available)')
946
947
948 class MetacafeIE(InfoExtractor):
949         """Information Extractor for metacafe.com."""
950
951         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
952         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
953         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
954         _youtube_ie = None
955
956         def __init__(self, youtube_ie, downloader=None):
957                 InfoExtractor.__init__(self, downloader)
958                 self._youtube_ie = youtube_ie
959
960         @staticmethod
961         def suitable(url):
962                 return (re.match(MetacafeIE._VALID_URL, url) is not None)
963
964         def report_disclaimer(self):
965                 """Report disclaimer retrieval."""
966                 self._downloader.to_stdout(u'[metacafe] Retrieving disclaimer')
967
968         def report_age_confirmation(self):
969                 """Report attempt to confirm age."""
970                 self._downloader.to_stdout(u'[metacafe] Confirming age')
971         
972         def report_download_webpage(self, video_id):
973                 """Report webpage download."""
974                 self._downloader.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)
975         
976         def report_extraction(self, video_id):
977                 """Report information extraction."""
978                 self._downloader.to_stdout(u'[metacafe] %s: Extracting information' % video_id)
979
980         def _real_initialize(self):
981                 # Retrieve disclaimer
982                 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
983                 try:
984                         self.report_disclaimer()
985                         disclaimer = urllib2.urlopen(request).read()
986                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
987                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
988                         return
989
990                 # Confirm age
991                 disclaimer_form = {
992                         'filters': '0',
993                         'submit': "Continue - I'm over 18",
994                         }
995                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
996                 try:
997                         self.report_age_confirmation()
998                         disclaimer = urllib2.urlopen(request).read()
999                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1000                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1001                         return
1002         
1003         def _real_extract(self, url):
1004                 # Extract id and simplified title from URL
1005                 mobj = re.match(self._VALID_URL, url)
1006                 if mobj is None:
1007                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1008                         return
1009
1010                 video_id = mobj.group(1)
1011
1012                 # Check if video comes from YouTube
1013                 mobj2 = re.match(r'^yt-(.*)$', video_id)
1014                 if mobj2 is not None:
1015                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1016                         return
1017
1018                 # At this point we have a new video
1019                 self._downloader.increment_downloads()
1020
1021                 simple_title = mobj.group(2).decode('utf-8')
1022                 video_extension = 'flv'
1023
1024                 # Retrieve video webpage to extract further information
1025                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1026                 try:
1027                         self.report_download_webpage(video_id)
1028                         webpage = urllib2.urlopen(request).read()
1029                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1030                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1031                         return
1032
1033                 # Extract URL, uploader and title from webpage
1034                 self.report_extraction(video_id)
1035                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1036                 if mobj is None:
1037                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1038                         return
1039                 mediaURL = urllib.unquote(mobj.group(1))
1040
1041                 #mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1042                 #if mobj is None:
1043                 #       self._downloader.trouble(u'ERROR: unable to extract gdaKey')
1044                 #       return
1045                 #gdaKey = mobj.group(1)
1046                 #
1047                 #video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1048
1049                 video_url = mediaURL
1050
1051                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1052                 if mobj is None:
1053                         self._downloader.trouble(u'ERROR: unable to extract title')
1054                         return
1055                 video_title = mobj.group(1).decode('utf-8')
1056                 video_title = sanitize_title(video_title)
1057
1058                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1059                 if mobj is None:
1060                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1061                         return
1062                 video_uploader = mobj.group(1)
1063
1064                 try:
1065                         # Process video information
1066                         self._downloader.process_info({
1067                                 'id':           video_id.decode('utf-8'),
1068                                 'url':          video_url.decode('utf-8'),
1069                                 'uploader':     video_uploader.decode('utf-8'),
1070                                 'title':        video_title,
1071                                 'stitle':       simple_title,
1072                                 'ext':          video_extension.decode('utf-8'),
1073                                 'format':       u'NA',
1074                                 'player_url':   None,
1075                         })
1076                 except UnavailableVideoError:
1077                         self._downloader.trouble(u'ERROR: unable to download video')
1078
1079
1080 class DailymotionIE(InfoExtractor):
1081         """Information Extractor for Dailymotion"""
1082
1083         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1084
1085         def __init__(self, downloader=None):
1086                 InfoExtractor.__init__(self, downloader)
1087
1088         @staticmethod
1089         def suitable(url):
1090                 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1091
1092         def report_download_webpage(self, video_id):
1093                 """Report webpage download."""
1094                 self._downloader.to_stdout(u'[dailymotion] %s: Downloading webpage' % video_id)
1095         
1096         def report_extraction(self, video_id):
1097                 """Report information extraction."""
1098                 self._downloader.to_stdout(u'[dailymotion] %s: Extracting information' % video_id)
1099
1100         def _real_initialize(self):
1101                 return
1102
1103         def _real_extract(self, url):
1104                 # Extract id and simplified title from URL
1105                 mobj = re.match(self._VALID_URL, url)
1106                 if mobj is None:
1107                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1108                         return
1109
1110                 # At this point we have a new video
1111                 self._downloader.increment_downloads()
1112                 video_id = mobj.group(1)
1113
1114                 simple_title = mobj.group(2).decode('utf-8')
1115                 video_extension = 'flv'
1116
1117                 # Retrieve video webpage to extract further information
1118                 request = urllib2.Request(url)
1119                 try:
1120                         self.report_download_webpage(video_id)
1121                         webpage = urllib2.urlopen(request).read()
1122                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1123                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1124                         return
1125
1126                 # Extract URL, uploader and title from webpage
1127                 self.report_extraction(video_id)
1128                 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1129                 if mobj is None:
1130                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1131                         return
1132                 mediaURL = urllib.unquote(mobj.group(1))
1133
1134                 # if needed add http://www.dailymotion.com/ if relative URL
1135
1136                 video_url = mediaURL
1137
1138                 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1139                 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1140                 if mobj is None:
1141                         self._downloader.trouble(u'ERROR: unable to extract title')
1142                         return
1143                 video_title = mobj.group(1).decode('utf-8')
1144                 video_title = sanitize_title(video_title)
1145
1146                 mobj = re.search(r'(?im)<div class="dmco_html owner">.*?<a class="name" href="/.+?">(.+?)</a></div>', webpage)
1147                 if mobj is None:
1148                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1149                         return
1150                 video_uploader = mobj.group(1)
1151
1152                 try:
1153                         # Process video information
1154                         self._downloader.process_info({
1155                                 'id':           video_id.decode('utf-8'),
1156                                 'url':          video_url.decode('utf-8'),
1157                                 'uploader':     video_uploader.decode('utf-8'),
1158                                 'title':        video_title,
1159                                 'stitle':       simple_title,
1160                                 'ext':          video_extension.decode('utf-8'),
1161                                 'format':       u'NA',
1162                                 'player_url':   None,
1163                         })
1164                 except UnavailableVideoError:
1165                         self._downloader.trouble(u'ERROR: unable to download video')
1166
class GoogleIE(InfoExtractor):
	"""Information extractor for video.google.com."""

	_VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		InfoExtractor.__init__(self, downloader)

	@staticmethod
	def suitable(url):
		"""Receives a URL and returns True if suitable for this IE."""
		return (re.match(GoogleIE._VALID_URL, url) is not None)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_stdout(u'[video.google] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_stdout(u'[video.google] %s: Extracting information' % video_id)

	def _real_initialize(self):
		"""No initialization needed for Google Video."""
		return

	def _real_extract(self, url):
		"""Extract and process information for a video.google.com URL."""
		# Extract id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
			return

		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(1)

		video_extension = 'mp4'

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader, and title from webpage
		self.report_extraction(video_id)
		# Prefer the direct download_url (mp4); if absent, fall back to the
		# escaped videoUrl variant and switch the extension to flv.
		mobj = re.search(r"download_url:'([^']+)'", webpage)
		if mobj is None:
			video_extension = 'flv'
			mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		mediaURL = urllib.unquote(mobj.group(1))
		# The page escapes '=' and '&' as \x3d / \x26; undo that.
		mediaURL = mediaURL.replace('\\x3d', '\x3d')
		mediaURL = mediaURL.replace('\\x26', '\x26')

		video_url = mediaURL

		mobj = re.search(r'<title>(.*)</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)
		# NOTE(review): unlike YoutubeIE, the simplified title is not
		# stripped of leading/trailing underscores here.
		simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

		# Extract video description
		mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract video description')
			return
		video_description = mobj.group(1).decode('utf-8')
		if not video_description:
			video_description = 'No description available.'

		# Extract video thumbnail (only when the user asked to print it)
		if self._downloader.params.get('forcethumbnail', False):
			# abs(int(...)) presumably normalizes a signed numeric docid
			# for the search query — verify against real docids.
			request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
			try:
				webpage = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
				return
			mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
				return
			video_thumbnail = mobj.group(1)
		else:	# we need something to pass to process_info
			video_thumbnail = ''


		try:
			# Process video information
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	u'NA',
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	u'NA',
				'player_url':	None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'ERROR: unable to download video')
1275
1276
1277 class PhotobucketIE(InfoExtractor):
1278         """Information extractor for photobucket.com."""
1279
1280         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1281
1282         def __init__(self, downloader=None):
1283                 InfoExtractor.__init__(self, downloader)
1284
1285         @staticmethod
1286         def suitable(url):
1287                 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1288
1289         def report_download_webpage(self, video_id):
1290                 """Report webpage download."""
1291                 self._downloader.to_stdout(u'[photobucket] %s: Downloading webpage' % video_id)
1292
1293         def report_extraction(self, video_id):
1294                 """Report information extraction."""
1295                 self._downloader.to_stdout(u'[photobucket] %s: Extracting information' % video_id)
1296
1297         def _real_initialize(self):
1298                 return
1299
1300         def _real_extract(self, url):
1301                 # Extract id from URL
1302                 mobj = re.match(self._VALID_URL, url)
1303                 if mobj is None:
1304                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1305                         return
1306
1307                 # At this point we have a new video
1308                 self._downloader.increment_downloads()
1309                 video_id = mobj.group(1)
1310
1311                 video_extension = 'flv'
1312
1313                 # Retrieve video webpage to extract further information
1314                 request = urllib2.Request(url)
1315                 try:
1316                         self.report_download_webpage(video_id)
1317                         webpage = urllib2.urlopen(request).read()
1318                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1319                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1320                         return
1321
1322                 # Extract URL, uploader, and title from webpage
1323                 self.report_extraction(video_id)
1324                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1325                 if mobj is None:
1326                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1327                         return
1328                 mediaURL = urllib.unquote(mobj.group(1))
1329
1330                 video_url = mediaURL
1331
1332                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1333                 if mobj is None:
1334                         self._downloader.trouble(u'ERROR: unable to extract title')
1335                         return
1336                 video_title = mobj.group(1).decode('utf-8')
1337                 video_title = sanitize_title(video_title)
1338                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1339
1340                 video_uploader = mobj.group(2).decode('utf-8')
1341
1342                 try:
1343                         # Process video information
1344                         self._downloader.process_info({
1345                                 'id':           video_id.decode('utf-8'),
1346                                 'url':          video_url.decode('utf-8'),
1347                                 'uploader':     video_uploader,
1348                                 'title':        video_title,
1349                                 'stitle':       simple_title,
1350                                 'ext':          video_extension.decode('utf-8'),
1351                                 'format':       u'NA',
1352                                 'player_url':   None,
1353                         })
1354                 except UnavailableVideoError:
1355                         self._downloader.trouble(u'ERROR: unable to download video')
1356
1357
1358 class YahooIE(InfoExtractor):
1359         """Information extractor for video.yahoo.com."""
1360
1361         # _VALID_URL matches all Yahoo! Video URLs
1362         # _VPAGE_URL matches only the extractable '/watch/' URLs
1363         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1364         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1365
1366         def __init__(self, downloader=None):
1367                 InfoExtractor.__init__(self, downloader)
1368
1369         @staticmethod
1370         def suitable(url):
1371                 return (re.match(YahooIE._VALID_URL, url) is not None)
1372
1373         def report_download_webpage(self, video_id):
1374                 """Report webpage download."""
1375                 self._downloader.to_stdout(u'[video.yahoo] %s: Downloading webpage' % video_id)
1376
1377         def report_extraction(self, video_id):
1378                 """Report information extraction."""
1379                 self._downloader.to_stdout(u'[video.yahoo] %s: Extracting information' % video_id)
1380
1381         def _real_initialize(self):
1382                 return
1383
1384         def _real_extract(self, url, new_video=True):
1385                 # Extract ID from URL
1386                 mobj = re.match(self._VALID_URL, url)
1387                 if mobj is None:
1388                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1389                         return
1390
1391                 # At this point we have a new video
1392                 self._downloader.increment_downloads()
1393                 video_id = mobj.group(2)
1394                 video_extension = 'flv'
1395
1396                 # Rewrite valid but non-extractable URLs as
1397                 # extractable English language /watch/ URLs
1398                 if re.match(self._VPAGE_URL, url) is None:
1399                         request = urllib2.Request(url)
1400                         try:
1401                                 webpage = urllib2.urlopen(request).read()
1402                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1403                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1404                                 return
1405
1406                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1407                         if mobj is None:
1408                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1409                                 return
1410                         yahoo_id = mobj.group(1)
1411
1412                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1413                         if mobj is None:
1414                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1415                                 return
1416                         yahoo_vid = mobj.group(1)
1417
1418                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1419                         return self._real_extract(url, new_video=False)
1420
1421                 # Retrieve video webpage to extract further information
1422                 request = urllib2.Request(url)
1423                 try:
1424                         self.report_download_webpage(video_id)
1425                         webpage = urllib2.urlopen(request).read()
1426                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1427                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1428                         return
1429
1430                 # Extract uploader and title from webpage
1431                 self.report_extraction(video_id)
1432                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1433                 if mobj is None:
1434                         self._downloader.trouble(u'ERROR: unable to extract video title')
1435                         return
1436                 video_title = mobj.group(1).decode('utf-8')
1437                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1438
1439                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1440                 if mobj is None:
1441                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1442                         return
1443                 video_uploader = mobj.group(1).decode('utf-8')
1444
1445                 # Extract video thumbnail
1446                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1447                 if mobj is None:
1448                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1449                         return
1450                 video_thumbnail = mobj.group(1).decode('utf-8')
1451
1452                 # Extract video description
1453                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1454                 if mobj is None:
1455                         self._downloader.trouble(u'ERROR: unable to extract video description')
1456                         return
1457                 video_description = mobj.group(1).decode('utf-8')
1458                 if not video_description: video_description = 'No description available.'
1459
1460                 # Extract video height and width
1461                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1462                 if mobj is None:
1463                         self._downloader.trouble(u'ERROR: unable to extract video height')
1464                         return
1465                 yv_video_height = mobj.group(1)
1466
1467                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1468                 if mobj is None:
1469                         self._downloader.trouble(u'ERROR: unable to extract video width')
1470                         return
1471                 yv_video_width = mobj.group(1)
1472
1473                 # Retrieve video playlist to extract media URL
1474                 # I'm not completely sure what all these options are, but we
1475                 # seem to need most of them, otherwise the server sends a 401.
1476                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1477                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1478                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1479                                           '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1480                                           '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1481                 try:
1482                         self.report_download_webpage(video_id)
1483                         webpage = urllib2.urlopen(request).read()
1484                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1485                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1486                         return
1487
1488                 # Extract media URL from playlist XML
1489                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1490                 if mobj is None:
1491                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1492                         return
1493                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1494                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1495
1496                 try:
1497                         # Process video information
1498                         self._downloader.process_info({
1499                                 'id':           video_id.decode('utf-8'),
1500                                 'url':          video_url,
1501                                 'uploader':     video_uploader,
1502                                 'title':        video_title,
1503                                 'stitle':       simple_title,
1504                                 'ext':          video_extension.decode('utf-8'),
1505                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1506                                 'description':  video_description,
1507                                 'thumbnail':    video_thumbnail,
1508                                 'description':  video_description,
1509                                 'player_url':   None,
1510                         })
1511                 except UnavailableVideoError:
1512                         self._downloader.trouble(u'ERROR: unable to download video')
1513
1514
1515 class GenericIE(InfoExtractor):
1516         """Generic last-resort information extractor."""
1517
1518         def __init__(self, downloader=None):
1519                 InfoExtractor.__init__(self, downloader)
1520
1521         @staticmethod
1522         def suitable(url):
1523                 return True
1524
1525         def report_download_webpage(self, video_id):
1526                 """Report webpage download."""
1527                 self._downloader.to_stdout(u'WARNING: Falling back on generic information extractor.')
1528                 self._downloader.to_stdout(u'[generic] %s: Downloading webpage' % video_id)
1529
1530         def report_extraction(self, video_id):
1531                 """Report information extraction."""
1532                 self._downloader.to_stdout(u'[generic] %s: Extracting information' % video_id)
1533
1534         def _real_initialize(self):
1535                 return
1536
1537         def _real_extract(self, url):
1538                 # At this point we have a new video
1539                 self._downloader.increment_downloads()
1540
1541                 video_id = url.split('/')[-1]
1542                 request = urllib2.Request(url)
1543                 try:
1544                         self.report_download_webpage(video_id)
1545                         webpage = urllib2.urlopen(request).read()
1546                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1547                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1548                         return
1549                 except ValueError, err:
1550                         # since this is the last-resort InfoExtractor, if
1551                         # this error is thrown, it'll be thrown here
1552                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1553                         return
1554
1555                 # Start with something easy: JW Player in SWFObject
1556                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1557                 if mobj is None:
1558                         # Broaden the search a little bit
1559                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1560                 if mobj is None:
1561                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1562                         return
1563
1564                 # It's possible that one of the regexes
1565                 # matched, but returned an empty group:
1566                 if mobj.group(1) is None:
1567                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1568                         return
1569
1570                 video_url = urllib.unquote(mobj.group(1))
1571                 video_id  = os.path.basename(video_url)
1572
1573                 # here's a fun little line of code for you:
1574                 video_extension = os.path.splitext(video_id)[1][1:]
1575                 video_id        = os.path.splitext(video_id)[0]
1576
1577                 # it's tempting to parse this further, but you would
1578                 # have to take into account all the variations like
1579                 #   Video Title - Site Name
1580                 #   Site Name | Video Title
1581                 #   Video Title - Tagline | Site Name
1582                 # and so on and so forth; it's just not practical
1583                 mobj = re.search(r'<title>(.*)</title>', webpage)
1584                 if mobj is None:
1585                         self._downloader.trouble(u'ERROR: unable to extract title')
1586                         return
1587                 video_title = mobj.group(1).decode('utf-8')
1588                 video_title = sanitize_title(video_title)
1589                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1590
1591                 # video uploader is domain name
1592                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1593                 if mobj is None:
1594                         self._downloader.trouble(u'ERROR: unable to extract title')
1595                         return
1596                 video_uploader = mobj.group(1).decode('utf-8')
1597
1598                 try:
1599                         # Process video information
1600                         self._downloader.process_info({
1601                                 'id':           video_id.decode('utf-8'),
1602                                 'url':          video_url.decode('utf-8'),
1603                                 'uploader':     video_uploader,
1604                                 'title':        video_title,
1605                                 'stitle':       simple_title,
1606                                 'ext':          video_extension.decode('utf-8'),
1607                                 'format':       u'NA',
1608                                 'player_url':   None,
1609                         })
1610                 except UnavailableVideoError, err:
1611                         self._downloader.trouble(u'ERROR: unable to download video')
1612
1613
1614 class YoutubeSearchIE(InfoExtractor):
1615         """Information Extractor for YouTube search queries."""
1616         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1617         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1618         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1619         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1620         _youtube_ie = None
1621         _max_youtube_results = 1000
1622
1623         def __init__(self, youtube_ie, downloader=None):
1624                 InfoExtractor.__init__(self, downloader)
1625                 self._youtube_ie = youtube_ie
1626         
1627         @staticmethod
1628         def suitable(url):
1629                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1630
1631         def report_download_page(self, query, pagenum):
1632                 """Report attempt to download playlist page with given number."""
1633                 query = query.decode(preferredencoding())
1634                 self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1635
1636         def _real_initialize(self):
1637                 self._youtube_ie.initialize()
1638         
1639         def _real_extract(self, query):
1640                 mobj = re.match(self._VALID_QUERY, query)
1641                 if mobj is None:
1642                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1643                         return
1644
1645                 prefix, query = query.split(':')
1646                 prefix = prefix[8:]
1647                 query  = query.encode('utf-8')
1648                 if prefix == '':
1649                         self._download_n_results(query, 1)
1650                         return
1651                 elif prefix == 'all':
1652                         self._download_n_results(query, self._max_youtube_results)
1653                         return
1654                 else:
1655                         try:
1656                                 n = long(prefix)
1657                                 if n <= 0:
1658                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1659                                         return
1660                                 elif n > self._max_youtube_results:
1661                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
1662                                         n = self._max_youtube_results
1663                                 self._download_n_results(query, n)
1664                                 return
1665                         except ValueError: # parsing prefix as integer fails
1666                                 self._download_n_results(query, 1)
1667                                 return
1668
1669         def _download_n_results(self, query, n):
1670                 """Downloads a specified number of results for a query"""
1671
1672                 video_ids = []
1673                 already_seen = set()
1674                 pagenum = 1
1675
1676                 while True:
1677                         self.report_download_page(query, pagenum)
1678                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1679                         request = urllib2.Request(result_url, None, std_headers)
1680                         try:
1681                                 page = urllib2.urlopen(request).read()
1682                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1683                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1684                                 return
1685
1686                         # Extract video identifiers
1687                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1688                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1689                                 if video_id not in already_seen:
1690                                         video_ids.append(video_id)
1691                                         already_seen.add(video_id)
1692                                         if len(video_ids) == n:
1693                                                 # Specified n videos reached
1694                                                 for id in video_ids:
1695                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1696                                                 return
1697
1698                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1699                                 for id in video_ids:
1700                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1701                                 return
1702
1703                         pagenum = pagenum + 1
1704
1705 class GoogleSearchIE(InfoExtractor):
1706         """Information Extractor for Google Video search queries."""
1707         _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
1708         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1709         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
1710         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
1711         _google_ie = None
1712         _max_google_results = 1000
1713
1714         def __init__(self, google_ie, downloader=None):
1715                 InfoExtractor.__init__(self, downloader)
1716                 self._google_ie = google_ie
1717         
1718         @staticmethod
1719         def suitable(url):
1720                 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
1721
1722         def report_download_page(self, query, pagenum):
1723                 """Report attempt to download playlist page with given number."""
1724                 query = query.decode(preferredencoding())
1725                 self._downloader.to_stdout(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1726
1727         def _real_initialize(self):
1728                 self._google_ie.initialize()
1729         
1730         def _real_extract(self, query):
1731                 mobj = re.match(self._VALID_QUERY, query)
1732                 if mobj is None:
1733                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1734                         return
1735
1736                 prefix, query = query.split(':')
1737                 prefix = prefix[8:]
1738                 query  = query.encode('utf-8')
1739                 if prefix == '':
1740                         self._download_n_results(query, 1)
1741                         return
1742                 elif prefix == 'all':
1743                         self._download_n_results(query, self._max_google_results)
1744                         return
1745                 else:
1746                         try:
1747                                 n = long(prefix)
1748                                 if n <= 0:
1749                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1750                                         return
1751                                 elif n > self._max_google_results:
1752                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)'  % (self._max_google_results, n))
1753                                         n = self._max_google_results
1754                                 self._download_n_results(query, n)
1755                                 return
1756                         except ValueError: # parsing prefix as integer fails
1757                                 self._download_n_results(query, 1)
1758                                 return
1759
1760         def _download_n_results(self, query, n):
1761                 """Downloads a specified number of results for a query"""
1762
1763                 video_ids = []
1764                 already_seen = set()
1765                 pagenum = 1
1766
1767                 while True:
1768                         self.report_download_page(query, pagenum)
1769                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1770                         request = urllib2.Request(result_url, None, std_headers)
1771                         try:
1772                                 page = urllib2.urlopen(request).read()
1773                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1774                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1775                                 return
1776
1777                         # Extract video identifiers
1778                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1779                                 video_id = mobj.group(1)
1780                                 if video_id not in already_seen:
1781                                         video_ids.append(video_id)
1782                                         already_seen.add(video_id)
1783                                         if len(video_ids) == n:
1784                                                 # Specified n videos reached
1785                                                 for id in video_ids:
1786                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1787                                                 return
1788
1789                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1790                                 for id in video_ids:
1791                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1792                                 return
1793
1794                         pagenum = pagenum + 1
1795
1796 class YahooSearchIE(InfoExtractor):
1797         """Information Extractor for Yahoo! Video search queries."""
1798         _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
1799         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1800         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1801         _MORE_PAGES_INDICATOR = r'\s*Next'
1802         _yahoo_ie = None
1803         _max_yahoo_results = 1000
1804
1805         def __init__(self, yahoo_ie, downloader=None):
1806                 InfoExtractor.__init__(self, downloader)
1807                 self._yahoo_ie = yahoo_ie
1808         
1809         @staticmethod
1810         def suitable(url):
1811                 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
1812
1813         def report_download_page(self, query, pagenum):
1814                 """Report attempt to download playlist page with given number."""
1815                 query = query.decode(preferredencoding())
1816                 self._downloader.to_stdout(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1817
1818         def _real_initialize(self):
1819                 self._yahoo_ie.initialize()
1820         
1821         def _real_extract(self, query):
1822                 mobj = re.match(self._VALID_QUERY, query)
1823                 if mobj is None:
1824                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1825                         return
1826
1827                 prefix, query = query.split(':')
1828                 prefix = prefix[8:]
1829                 query  = query.encode('utf-8')
1830                 if prefix == '':
1831                         self._download_n_results(query, 1)
1832                         return
1833                 elif prefix == 'all':
1834                         self._download_n_results(query, self._max_yahoo_results)
1835                         return
1836                 else:
1837                         try:
1838                                 n = long(prefix)
1839                                 if n <= 0:
1840                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1841                                         return
1842                                 elif n > self._max_yahoo_results:
1843                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)'  % (self._max_yahoo_results, n))
1844                                         n = self._max_yahoo_results
1845                                 self._download_n_results(query, n)
1846                                 return
1847                         except ValueError: # parsing prefix as integer fails
1848                                 self._download_n_results(query, 1)
1849                                 return
1850
1851         def _download_n_results(self, query, n):
1852                 """Downloads a specified number of results for a query"""
1853
1854                 video_ids = []
1855                 already_seen = set()
1856                 pagenum = 1
1857
1858                 while True:
1859                         self.report_download_page(query, pagenum)
1860                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1861                         request = urllib2.Request(result_url, None, std_headers)
1862                         try:
1863                                 page = urllib2.urlopen(request).read()
1864                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1865                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1866                                 return
1867
1868                         # Extract video identifiers
1869                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1870                                 video_id = mobj.group(1)
1871                                 if video_id not in already_seen:
1872                                         video_ids.append(video_id)
1873                                         already_seen.add(video_id)
1874                                         if len(video_ids) == n:
1875                                                 # Specified n videos reached
1876                                                 for id in video_ids:
1877                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1878                                                 return
1879
1880                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1881                                 for id in video_ids:
1882                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1883                                 return
1884
1885                         pagenum = pagenum + 1
1886
1887 class YoutubePlaylistIE(InfoExtractor):
1888         """Information Extractor for YouTube playlists."""
1889
1890         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/)([^&]+).*'
1891         _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
1892         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
1893         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1894         _youtube_ie = None
1895
1896         def __init__(self, youtube_ie, downloader=None):
1897                 InfoExtractor.__init__(self, downloader)
1898                 self._youtube_ie = youtube_ie
1899         
1900         @staticmethod
1901         def suitable(url):
1902                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
1903
1904         def report_download_page(self, playlist_id, pagenum):
1905                 """Report attempt to download playlist page with given number."""
1906                 self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1907
1908         def _real_initialize(self):
1909                 self._youtube_ie.initialize()
1910         
1911         def _real_extract(self, url):
1912                 # Extract playlist id
1913                 mobj = re.match(self._VALID_URL, url)
1914                 if mobj is None:
1915                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1916                         return
1917
1918                 # Download playlist pages
1919                 playlist_id = mobj.group(1)
1920                 video_ids = []
1921                 pagenum = 1
1922
1923                 while True:
1924                         self.report_download_page(playlist_id, pagenum)
1925                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
1926                         try:
1927                                 page = urllib2.urlopen(request).read()
1928                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1929                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1930                                 return
1931
1932                         # Extract video identifiers
1933                         ids_in_page = []
1934                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1935                                 if mobj.group(1) not in ids_in_page:
1936                                         ids_in_page.append(mobj.group(1))
1937                         video_ids.extend(ids_in_page)
1938
1939                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1940                                 break
1941                         pagenum = pagenum + 1
1942
1943                 for id in video_ids:
1944                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1945                 return
1946
1947 class YoutubeUserIE(InfoExtractor):
1948         """Information Extractor for YouTube users."""
1949
1950         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
1951         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1952         _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
1953         _youtube_ie = None
1954
1955         def __init__(self, youtube_ie, downloader=None):
1956                 InfoExtractor.__init__(self, downloader)
1957                 self._youtube_ie = youtube_ie
1958         
1959         @staticmethod
1960         def suitable(url):
1961                 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
1962
1963         def report_download_page(self, username):
1964                 """Report attempt to download user page."""
1965                 self._downloader.to_stdout(u'[youtube] user %s: Downloading page ' % (username))
1966
1967         def _real_initialize(self):
1968                 self._youtube_ie.initialize()
1969         
1970         def _real_extract(self, url):
1971                 # Extract username
1972                 mobj = re.match(self._VALID_URL, url)
1973                 if mobj is None:
1974                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1975                         return
1976
1977                 # Download user page
1978                 username = mobj.group(1)
1979                 video_ids = []
1980                 pagenum = 1
1981
1982                 self.report_download_page(username)
1983                 request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
1984                 try:
1985                         page = urllib2.urlopen(request).read()
1986                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1987                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1988                         return
1989
1990                 # Extract video identifiers
1991                 ids_in_page = []
1992
1993                 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1994                         if mobj.group(1) not in ids_in_page:
1995                                 ids_in_page.append(mobj.group(1))
1996                 video_ids.extend(ids_in_page)
1997
1998                 for id in video_ids:
1999                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2000                 return
2001
class PostProcessor(object):
	"""Base class for post-processing steps.

	Instances are registered on a downloader via its add_post_processor()
	method. After a successful download the downloader calls run() on each
	registered PostProcessor in turn, feeding the dictionary returned by
	the previous one into the next. A return value of None stops the
	chain; otherwise processing continues until the chain is exhausted.

	Like InfoExtractor objects, PostProcessors take part in a "mutual
	registration" handshake with their downloader.
	"""

	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Sets the downloader for this PP."""
		self._downloader = downloader

	def run(self, information):
		"""Run the PostProcessor.

		"information" is a dictionary shaped like the ones produced by
		InfoExtractors, extended with a "filepath" key naming the file
		that was just downloaded.

		Returning None halts the postprocessing chain; returning a
		dictionary (possibly the received one, with some fields changed)
		passes it along to the next PostProcessor. Implementations may
		also raise PostProcessingError, which the calling downloader
		will handle.
		"""
		# Default behavior: pass the information through untouched.
		return information
2047         
2048 ### MAIN PROGRAM ###
if __name__ == '__main__':
	# The whole command-line driver runs inside one try block; the broad
	# DownloadError/SameFileError/KeyboardInterrupt handlers live at the
	# very bottom of the script.
	try:
		# Modules needed only when running the main program
		import getpass
		import optparse

		# Function to update the program file with the latest version from bitbucket.org
2056                 def update_self(downloader, filename):
2057                         # Note: downloader only used for options
2058                         if not os.access (filename, os.W_OK):
2059                                 sys.exit('ERROR: no write permissions on %s' % filename)
2060
2061                         downloader.to_stdout('Updating to latest stable version...')
2062                         latest_url = 'http://bitbucket.org/rg3/youtube-dl/raw/tip/LATEST_VERSION'
2063                         latest_version = urllib.urlopen(latest_url).read().strip()
2064                         prog_url = 'http://bitbucket.org/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
2065                         newcontent = urllib.urlopen(prog_url).read()
2066                         stream = open(filename, 'w')
2067                         stream.write(newcontent)
2068                         stream.close()
2069                         downloader.to_stdout('Updated to version %s' % latest_version)
2070
		# General configuration
		# NOTE(review): the second install_opener replaces the first; this
		# still works because build_opener adds ProxyHandler among its
		# default handlers -- confirm if handler setup is ever changed.
		urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
		urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
		socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

		# Parse command line
		parser = optparse.OptionParser(
			usage='Usage: %prog [options] url...',
			version='2010.07.24',
			conflict_handler='resolve',
		)

		parser.add_option('-h', '--help',
				action='help', help='print this help text and exit')
		parser.add_option('-v', '--version',
				action='version', help='print program version and exit')
		parser.add_option('-U', '--update',
				action='store_true', dest='update_self', help='update this program to latest stable version')
		parser.add_option('-i', '--ignore-errors',
				action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
		parser.add_option('-r', '--rate-limit',
				dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
		parser.add_option('-R', '--retries',
				dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)

		authentication = optparse.OptionGroup(parser, 'Authentication Options')
		authentication.add_option('-u', '--username',
				dest='username', metavar='USERNAME', help='account username')
		authentication.add_option('-p', '--password',
				dest='password', metavar='PASSWORD', help='account password')
		authentication.add_option('-n', '--netrc',
				action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
		parser.add_option_group(authentication)

		video_format = optparse.OptionGroup(parser, 'Video Format Options')
		video_format.add_option('-f', '--format',
				action='store', dest='format', metavar='FORMAT', help='video format code')
		video_format.add_option('-m', '--mobile-version',
				action='store_const', dest='format', help='alias for -f 17', const='17')
		video_format.add_option('--all-formats',
				action='store_const', dest='format', help='download all available video formats', const='-1')
		video_format.add_option('--max-quality',
				action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
		parser.add_option_group(video_format)

		verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
		verbosity.add_option('-q', '--quiet',
				action='store_true', dest='quiet', help='activates quiet mode', default=False)
		verbosity.add_option('-s', '--simulate',
				action='store_true', dest='simulate', help='do not download video', default=False)
		verbosity.add_option('-g', '--get-url',
				action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
		verbosity.add_option('-e', '--get-title',
				action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
		verbosity.add_option('--get-thumbnail',
				action='store_true', dest='getthumbnail', help='simulate, quiet but print thumbnail URL', default=False)
		verbosity.add_option('--get-description',
				action='store_true', dest='getdescription', help='simulate, quiet but print video description', default=False)
		verbosity.add_option('--no-progress',
				action='store_true', dest='noprogress', help='do not print progress bar', default=False)
		parser.add_option_group(verbosity)

		filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
		filesystem.add_option('-t', '--title',
				action='store_true', dest='usetitle', help='use title in file name', default=False)
		filesystem.add_option('-l', '--literal',
				action='store_true', dest='useliteral', help='use literal title in file name', default=False)
		filesystem.add_option('-o', '--output',
				dest='outtmpl', metavar='TEMPLATE', help='output filename template')
		filesystem.add_option('-a', '--batch-file',
				dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
		filesystem.add_option('-w', '--no-overwrites',
				action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
		filesystem.add_option('-c', '--continue',
				action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
		parser.add_option_group(filesystem)

		(opts, args) = parser.parse_args()

		# Batch file verification
		batchurls = []
		if opts.batchfile is not None:
			try:
				# '-' means "read the URL list from standard input".
				if opts.batchfile == '-':
					batchfd = sys.stdin
				else:
					batchfd = open(opts.batchfile, 'r')
				batchurls = batchfd.readlines()
				batchurls = [x.strip() for x in batchurls]
				# Drop blank lines.
				batchurls = [x for x in batchurls if len(x) > 0]
			except IOError:
				sys.exit(u'ERROR: batch file could not be read')
		all_urls = batchurls + args

		# Conflicting, missing and erroneous options
		if opts.usenetrc and (opts.username is not None or opts.password is not None):
			parser.error(u'using .netrc conflicts with giving username/password')
		if opts.password is not None and opts.username is None:
			parser.error(u'account username missing')
		if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
			parser.error(u'using output template conflicts with using title or literal title')
		if opts.usetitle and opts.useliteral:
			parser.error(u'using title conflicts with using literal title')
		if opts.username is not None and opts.password is None:
			# Username given without password: prompt interactively.
			opts.password = getpass.getpass(u'Type account password and press return:')
		if opts.ratelimit is not None:
			# parse_bytes understands suffixed values like "50k" or "44.6m".
			numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
			if numeric_limit is None:
				parser.error(u'invalid rate limit specified')
			opts.ratelimit = numeric_limit
		if opts.retries is not None:
			try:
				opts.retries = long(opts.retries)
			except (TypeError, ValueError), err:
				parser.error(u'invalid retry count specified')

		# Information extractors
		youtube_ie = YoutubeIE()
		metacafe_ie = MetacafeIE(youtube_ie)
		dailymotion_ie = DailymotionIE()
		youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
		youtube_user_ie = YoutubeUserIE(youtube_ie)
		youtube_search_ie = YoutubeSearchIE(youtube_ie)
		google_ie = GoogleIE()
		google_search_ie = GoogleSearchIE(google_ie)
		photobucket_ie = PhotobucketIE()
		yahoo_ie = YahooIE()
		yahoo_search_ie = YahooSearchIE(yahoo_ie)
		generic_ie = GenericIE()

		# File downloader
		# Any of the --get-* options implies both quiet and simulate mode.
		fd = FileDownloader({
			'usenetrc': opts.usenetrc,
			'username': opts.username,
			'password': opts.password,
			'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
			'forceurl': opts.geturl,
			'forcetitle': opts.gettitle,
			'forcethumbnail': opts.getthumbnail,
			'forcedescription': opts.getdescription,
			'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
			'format': opts.format,
			'format_limit': opts.format_limit,
			# Output template selection: an and/or chain that picks the first
			# truthy template -- explicit -o wins, then the --all-formats
			# variants (which embed %(format)s), then title/literal-title,
			# then the plain id-based default.
			'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
				or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
				or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
				or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
				or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
				or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
				or u'%(id)s.%(ext)s'),
			'ignoreerrors': opts.ignoreerrors,
			'ratelimit': opts.ratelimit,
			'nooverwrites': opts.nooverwrites,
			'retries': opts.retries,
			'continuedl': opts.continue_dl,
			'noprogress': opts.noprogress,
			})
		# Registration order matters: more specific extractors (searches,
		# playlists, users) are tried before the plain site extractors.
		fd.add_info_extractor(youtube_search_ie)
		fd.add_info_extractor(youtube_pl_ie)
		fd.add_info_extractor(youtube_user_ie)
		fd.add_info_extractor(metacafe_ie)
		fd.add_info_extractor(dailymotion_ie)
		fd.add_info_extractor(youtube_ie)
		fd.add_info_extractor(google_ie)
		fd.add_info_extractor(google_search_ie)
		fd.add_info_extractor(photobucket_ie)
		fd.add_info_extractor(yahoo_ie)
		fd.add_info_extractor(yahoo_search_ie)

		# This must come last since it's the
		# fallback if none of the others work
		fd.add_info_extractor(generic_ie)

		# Update version
		if opts.update_self:
			update_self(fd, sys.argv[0])

		# Maybe do nothing
		if len(all_urls) < 1:
			if not opts.update_self:
				parser.error(u'you must provide at least one URL')
			else:
				# -U with no URLs is a valid invocation: update and exit.
				sys.exit()
		retcode = fd.download(all_urls)
		sys.exit(retcode)

	except DownloadError:
		# The downloader already reported the error; just signal failure.
		sys.exit(1)
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')