git.bitcoin.ninja Git - youtube-dl/blob - youtube-dl

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3 # Author: Ricardo Garcia Gonzalez
   4 # Author: Danny Colligan
   5 # Author: Benjamin Johnson
   6 # License: Public domain code
   7 import htmlentitydefs
   8 import httplib
   9 import locale
  10 import math
  11 import netrc
  12 import os
  13 import os.path
  14 import re
  15 import socket
  16 import string
  17 import subprocess
  18 import sys
  19 import time
  20 import urllib
  21 import urllib2
  22
  23 # parse_qs was moved from the cgi module to the urlparse module recently.
  24 try:
  25         from urlparse import parse_qs
  26 except ImportError:
  27         from cgi import parse_qs
  28
# Default HTTP request headers. They are handed to urllib2.Request wherever
# this file builds a request, so the script announces itself as a desktop
# Firefox 3.6 browser.
std_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2) Gecko/20100115 Firefox/3.6',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
        'Accept-Language': 'en-us,en;q=0.5',
}

# ASCII letters followed by ASCII digits, as a unicode string.
# NOTE(review): presumably the characters allowed in "simplified" titles --
# the code using it is not visible in this part of the file; confirm.
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
  37
  38 def preferredencoding():
  39         """Get preferred encoding.
  40
  41         Returns the best encoding scheme for the system, based on
  42         locale.getpreferredencoding() and some further tweaks.
  43         """
  44         def yield_preferredencoding():
  45                 try:
  46                         pref = locale.getpreferredencoding()
  47                         u'TEST'.encode(pref)
  48                 except:
  49                         pref = 'UTF-8'
  50                 while True:
  51                         yield pref
  52         return yield_preferredencoding().next()
  53
  54 def htmlentity_transform(matchobj):
  55         """Transforms an HTML entity to a Unicode character.
  56
  57         This function receives a match object and is intended to be used with
  58         the re.sub() function.
  59         """
  60         entity = matchobj.group(1)
  61
  62         # Known non-numeric HTML entity
  63         if entity in htmlentitydefs.name2codepoint:
  64                 return unichr(htmlentitydefs.name2codepoint[entity])
  65
  66         # Unicode character
  67         mobj = re.match(ur'(?u)#(x?\d+)', entity)
  68         if mobj is not None:
  69                 numstr = mobj.group(1)
  70                 if numstr.startswith(u'x'):
  71                         base = 16
  72                         numstr = u'0%s' % numstr
  73                 else:
  74                         base = 10
  75                 return unichr(long(numstr, base))
  76
  77         # Unknown entity in name, return its literal representation
  78         return (u'&%s;' % entity)
  79
  80 def sanitize_title(utitle):
  81         """Sanitizes a video title so it could be used as part of a filename."""
  82         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
  83         return utitle.replace(unicode(os.sep), u'%')
  84
  85 def sanitize_open(filename, open_mode):
  86         """Try to open the given filename, and slightly tweak it if this fails.
  87
  88         Attempts to open the given filename. If this fails, it tries to change
  89         the filename slightly, step by step, until it's either able to open it
  90         or it fails and raises a final exception, like the standard open()
  91         function.
  92
  93         It returns the tuple (stream, definitive_file_name).
  94         """
  95         try:
  96                 if filename == u'-':
  97                         return (sys.stdout, filename)
  98                 stream = open(filename, open_mode)
  99                 return (stream, filename)
 100         except (IOError, OSError), err:
 101                 # In case of error, try to remove win32 forbidden chars
 102                 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
 103
 104                 # An exception here should be caught in the caller
 105                 stream = open(filename, open_mode)
 106                 return (stream, filename)
 107
 108
 109 class DownloadError(Exception):
 110         """Download Error exception.
 111
 112         This exception may be thrown by FileDownloader objects if they are not
 113         configured to continue on errors. They will contain the appropriate
 114         error message.
 115         """
 116         pass
 117
 118 class SameFileError(Exception):
 119         """Same File exception.
 120
 121         This exception will be thrown by FileDownloader objects if they detect
 122         multiple files would have to be downloaded to the same file on disk.
 123         """
 124         pass
 125
 126 class PostProcessingError(Exception):
 127         """Post Processing exception.
 128
 129         This exception may be raised by PostProcessor's .run() method to
 130         indicate an error in the postprocessing task.
 131         """
 132         pass
 133
 134 class UnavailableFormatError(Exception):
 135         """Unavailable Format exception.
 136
 137         This exception will be thrown when a video is requested
 138         in a format that is not available for that video.
 139         """
 140         pass
 141
 142 class ContentTooShortError(Exception):
 143         """Content Too Short exception.
 144
 145         This exception may be raised by FileDownloader objects when a file they
 146         download is too small for what the server announced first, indicating
 147         the connection was probably interrupted.
 148         """
 149         # Both in bytes
 150         downloaded = None
 151         expected = None
 152
 153         def __init__(self, downloaded, expected):
 154                 self.downloaded = downloaded
 155                 self.expected = expected
 156
 157 class FileDownloader(object):
 158         """File Downloader class.
 159
 160         File downloader objects are the ones responsible of downloading the
 161         actual video file and writing it to disk if the user has requested
 162         it, among some other tasks. In most cases there should be one per
 163         program. As, given a video URL, the downloader doesn't know how to
 164         extract all the needed information, task that InfoExtractors do, it
 165         has to pass the URL to one of them.
 166
 167         For this, file downloader objects have a method that allows
 168         InfoExtractors to be registered in a given order. When it is passed
 169         a URL, the file downloader handles it to the first InfoExtractor it
 170         finds that reports being able to handle it. The InfoExtractor extracts
 171         all the information about the video or videos the URL refers to, and
 172         asks the FileDownloader to process the video information, possibly
 173         downloading the video.
 174
 175         File downloaders accept a lot of parameters. In order not to saturate
 176         the object constructor with arguments, it receives a dictionary of
 177         options instead. These options are available through the params
 178         attribute for the InfoExtractors to use. The FileDownloader also
 179         registers itself as the downloader in charge for the InfoExtractors
 180         that are added to it, so this is a "mutual registration".
 181
 182         Available options:
 183
 184         username:       Username for authentication purposes.
 185         password:       Password for authentication purposes.
 186         usenetrc:       Use netrc for authentication instead.
 187         quiet:          Do not print messages to stdout.
 188         forceurl:       Force printing final URL.
 189         forcetitle:     Force printing title.
 190         simulate:       Do not download the video files.
 191         format:         Video format code.
 192         outtmpl:        Template for output names.
 193         ignoreerrors:   Do not stop on download errors.
 194         ratelimit:      Download speed limit, in bytes/sec.
 195         nooverwrites:   Prevent overwriting files.
 196         retries:        Number of times to retry for HTTP error 503
 197         continuedl:     Try to continue downloads if possible.
 198         noprogress:     Do not print the progress bar.
 199         """
 200
 201         params = None
 202         _ies = []
 203         _pps = []
 204         _download_retcode = None
 205         _num_downloads = None
 206
 207         def __init__(self, params):
 208                 """Create a FileDownloader object with the given options."""
 209                 self._ies = []
 210                 self._pps = []
 211                 self._download_retcode = 0
 212                 self._num_downloads = 0
 213                 self.params = params
 214
 215         @staticmethod
 216         def pmkdir(filename):
 217                 """Create directory components in filename. Similar to Unix "mkdir -p"."""
 218                 components = filename.split(os.sep)
 219                 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
 220                 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
 221                 for dir in aggregate:
 222                         if not os.path.exists(dir):
 223                                 os.mkdir(dir)
 224
 225         @staticmethod
 226         def format_bytes(bytes):
 227                 if bytes is None:
 228                         return 'N/A'
 229                 if type(bytes) is str:
 230                         bytes = float(bytes)
 231                 if bytes == 0.0:
 232                         exponent = 0
 233                 else:
 234                         exponent = long(math.log(bytes, 1024.0))
 235                 suffix = 'bkMGTPEZY'[exponent]
 236                 converted = float(bytes) / float(1024**exponent)
 237                 return '%.2f%s' % (converted, suffix)
 238
 239         @staticmethod
 240         def calc_percent(byte_counter, data_len):
 241                 if data_len is None:
 242                         return '---.-%'
 243                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
 244
 245         @staticmethod
 246         def calc_eta(start, now, total, current):
 247                 if total is None:
 248                         return '--:--'
 249                 dif = now - start
 250                 if current == 0 or dif < 0.001: # One millisecond
 251                         return '--:--'
 252                 rate = float(current) / dif
 253                 eta = long((float(total) - float(current)) / rate)
 254                 (eta_mins, eta_secs) = divmod(eta, 60)
 255                 if eta_mins > 99:
 256                         return '--:--'
 257                 return '%02d:%02d' % (eta_mins, eta_secs)
 258
 259         @staticmethod
 260         def calc_speed(start, now, bytes):
 261                 dif = now - start
 262                 if bytes == 0 or dif < 0.001: # One millisecond
 263                         return '%10s' % '---b/s'
 264                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
 265
 266         @staticmethod
 267         def best_block_size(elapsed_time, bytes):
 268                 new_min = max(bytes / 2.0, 1.0)
 269                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
 270                 if elapsed_time < 0.001:
 271                         return long(new_max)
 272                 rate = bytes / elapsed_time
 273                 if rate > new_max:
 274                         return long(new_max)
 275                 if rate < new_min:
 276                         return long(new_min)
 277                 return long(rate)
 278
 279         @staticmethod
 280         def parse_bytes(bytestr):
 281                 """Parse a string indicating a byte quantity into a long integer."""
 282                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
 283                 if matchobj is None:
 284                         return None
 285                 number = float(matchobj.group(1))
 286                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
 287                 return long(round(number * multiplier))
 288
 289         @staticmethod
 290         def verify_url(url):
 291                 """Verify a URL is valid and data could be downloaded. Return real data URL."""
 292                 request = urllib2.Request(url, None, std_headers)
 293                 data = urllib2.urlopen(request)
 294                 data.read(1)
 295                 url = data.geturl()
 296                 data.close()
 297                 return url
 298
 299         def add_info_extractor(self, ie):
 300                 """Add an InfoExtractor object to the end of the list."""
 301                 self._ies.append(ie)
 302                 ie.set_downloader(self)
 303
 304         def add_post_processor(self, pp):
 305                 """Add a PostProcessor object to the end of the chain."""
 306                 self._pps.append(pp)
 307                 pp.set_downloader(self)
 308
 309         def to_stdout(self, message, skip_eol=False, ignore_encoding_errors=False):
 310                 """Print message to stdout if not in quiet mode."""
 311                 try:
 312                         if not self.params.get('quiet', False):
 313                                 print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(preferredencoding()),
 314                         sys.stdout.flush()
 315                 except (UnicodeEncodeError), err:
 316                         if not ignore_encoding_errors:
 317                                 raise
 318
 319         def to_stderr(self, message):
 320                 """Print message to stderr."""
 321                 print >>sys.stderr, message.encode(preferredencoding())
 322
 323         def fixed_template(self):
 324                 """Checks if the output template is fixed."""
 325                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
 326
 327         def trouble(self, message=None):
 328                 """Determine action to take when a download problem appears.
 329
 330                 Depending on if the downloader has been configured to ignore
 331                 download errors or not, this method may throw an exception or
 332                 not when errors are found, after printing the message.
 333                 """
 334                 if message is not None:
 335                         self.to_stderr(message)
 336                 if not self.params.get('ignoreerrors', False):
 337                         raise DownloadError(message)
 338                 self._download_retcode = 1
 339
 340         def slow_down(self, start_time, byte_counter):
 341                 """Sleep if the download speed is over the rate limit."""
 342                 rate_limit = self.params.get('ratelimit', None)
 343                 if rate_limit is None or byte_counter == 0:
 344                         return
 345                 now = time.time()
 346                 elapsed = now - start_time
 347                 if elapsed <= 0.0:
 348                         return
 349                 speed = float(byte_counter) / elapsed
 350                 if speed > rate_limit:
 351                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
 352
 353         def report_destination(self, filename):
 354                 """Report destination filename."""
 355                 self.to_stdout(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
 356
 357         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
 358                 """Report download progress."""
 359                 if self.params.get('noprogress', False):
 360                         return
 361                 self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
 362                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
 363
 364         def report_resuming_byte(self, resume_len):
 365                 """Report attemtp to resume at given byte."""
 366                 self.to_stdout(u'[download] Resuming download at byte %s' % resume_len)
 367
 368         def report_retry(self, count, retries):
 369                 """Report retry in case of HTTP error 503"""
 370                 self.to_stdout(u'[download] Got HTTP error 503. Retrying (attempt %d of %d)...' % (count, retries))
 371
 372         def report_file_already_downloaded(self, file_name):
 373                 """Report file has already been fully downloaded."""
 374                 try:
 375                         self.to_stdout(u'[download] %s has already been downloaded' % file_name)
 376                 except (UnicodeEncodeError), err:
 377                         self.to_stdout(u'[download] The file has already been downloaded')
 378
 379         def report_unable_to_resume(self):
 380                 """Report it was impossible to resume download."""
 381                 self.to_stdout(u'[download] Unable to resume')
 382
 383         def report_finish(self):
 384                 """Report download finished."""
 385                 if self.params.get('noprogress', False):
 386                         self.to_stdout(u'[download] Download completed')
 387                 else:
 388                         self.to_stdout(u'')
 389
 390         def process_info(self, info_dict):
 391                 """Process a single dictionary returned by an InfoExtractor."""
 392                 # Do nothing else if in simulate mode
 393                 if self.params.get('simulate', False):
 394                         # Verify URL if it's an HTTP one
 395                         if info_dict['url'].startswith('http'):
 396                                 try:
 397                                         self.verify_url(info_dict['url'].encode('utf-8')).decode('utf-8')
 398                                 except (OSError, IOError, urllib2.URLError, httplib.HTTPException, socket.error), err:
 399                                         raise UnavailableFormatError
 400
 401                         # Forced printings
 402                         if self.params.get('forcetitle', False):
 403                                 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
 404                         if self.params.get('forceurl', False):
 405                                 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
 406                         if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
 407                                 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
 408                         if self.params.get('forcedescription', False) and 'description' in info_dict:
 409                                 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
 410
 411                         return
 412
 413                 try:
 414                         template_dict = dict(info_dict)
 415                         template_dict['epoch'] = unicode(long(time.time()))
 416                         template_dict['ord'] = unicode('%05d' % self._num_downloads)
 417                         filename = self.params['outtmpl'] % template_dict
 418                 except (ValueError, KeyError), err:
 419                         self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
 420                 if self.params.get('nooverwrites', False) and os.path.exists(filename):
 421                         self.to_stderr(u'WARNING: file exists: %s; skipping' % filename)
 422                         return
 423
 424                 try:
 425                         self.pmkdir(filename)
 426                 except (OSError, IOError), err:
 427                         self.trouble('ERROR: unable to create directories: %s' % str(err))
 428                         return
 429
 430                 try:
 431                         success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
 432                 except (OSError, IOError), err:
 433                         raise UnavailableFormatError
 434                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 435                         self.trouble('ERROR: unable to download video data: %s' % str(err))
 436                         return
 437                 except (ContentTooShortError, ), err:
 438                         self.trouble('ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
 439                         return
 440
 441                 if success:
 442                         try:
 443                                 self.post_process(filename, info_dict)
 444                         except (PostProcessingError), err:
 445                                 self.trouble('ERROR: postprocessing: %s' % str(err))
 446                                 return
 447
 448         def download(self, url_list):
 449                 """Download a given list of URLs."""
 450                 if len(url_list) > 1 and self.fixed_template():
 451                         raise SameFileError(self.params['outtmpl'])
 452
 453                 for url in url_list:
 454                         suitable_found = False
 455                         for ie in self._ies:
 456                                 # Go to next InfoExtractor if not suitable
 457                                 if not ie.suitable(url):
 458                                         continue
 459
 460                                 # Suitable InfoExtractor found
 461                                 suitable_found = True
 462
 463                                 # Extract information from URL and process it
 464                                 ie.extract(url)
 465
 466                                 # Suitable InfoExtractor had been found; go to next URL
 467                                 break
 468
 469                         if not suitable_found:
 470                                 self.trouble('ERROR: no suitable InfoExtractor: %s' % url)
 471
 472                 return self._download_retcode
 473
 474         def post_process(self, filename, ie_info):
 475                 """Run the postprocessing chain on the given file."""
 476                 info = dict(ie_info)
 477                 info['filepath'] = filename
 478                 for pp in self._pps:
 479                         info = pp.run(info)
 480                         if info is None:
 481                                 break
 482
 483         def _download_with_rtmpdump(self, filename, url, player_url):
 484                 self.report_destination(filename)
 485
 486                 # Check for rtmpdump first
 487                 try:
 488                         subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
 489                 except (OSError, IOError):
 490                         self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
 491                         return False
 492
 493                 # Download using rtmpdump. rtmpdump returns exit code 2 when
 494                 # the connection was interrumpted and resuming appears to be
 495                 # possible. This is part of rtmpdump's normal usage, AFAIK.
 496                 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', filename]
 497                 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
 498                 while retval == 2 or retval == 1:
 499                         prevsize = os.path.getsize(filename)
 500                         self.to_stdout(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
 501                         time.sleep(5.0) # This seems to be needed
 502                         retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
 503                         cursize = os.path.getsize(filename)
 504                         if prevsize == cursize and retval == 1:
 505                                 break
 506                 if retval == 0:
 507                         self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename))
 508                         return True
 509                 else:
 510                         self.trouble('\nERROR: rtmpdump exited with code %d' % retval)
 511                         return False
 512
 513         def _do_download(self, filename, url, player_url):
 514                 # Attempt to download using rtmpdump
 515                 if url.startswith('rtmp'):
 516                         return self._download_with_rtmpdump(filename, url, player_url)
 517
 518                 stream = None
 519                 open_mode = 'wb'
 520                 basic_request = urllib2.Request(url, None, std_headers)
 521                 request = urllib2.Request(url, None, std_headers)
 522
 523                 # Establish possible resume length
 524                 if os.path.isfile(filename):
 525                         resume_len = os.path.getsize(filename)
 526                 else:
 527                         resume_len = 0
 528
 529                 # Request parameters in case of being able to resume
 530                 if self.params.get('continuedl', False) and resume_len != 0:
 531                         self.report_resuming_byte(resume_len)
 532                         request.add_header('Range','bytes=%d-' % resume_len)
 533                         open_mode = 'ab'
 534
 535                 count = 0
 536                 retries = self.params.get('retries', 0)
 537                 while True:
 538                         # Establish connection
 539                         try:
 540                                 data = urllib2.urlopen(request)
 541                                 break
 542                         except (urllib2.HTTPError, ), err:
 543                                 if err.code == 503:
 544                                         # Retry in case of HTTP error 503
 545                                         count += 1
 546                                         if count <= retries:
 547                                                 self.report_retry(count, retries)
 548                                                 continue
 549                                 if err.code != 416: #  416 is 'Requested range not satisfiable'
 550                                         raise
 551                                 # Unable to resume
 552                                 data = urllib2.urlopen(basic_request)
 553                                 content_length = data.info()['Content-Length']
 554
 555                                 if content_length is not None and long(content_length) == resume_len:
 556                                         # Because the file had already been fully downloaded
 557                                         self.report_file_already_downloaded(filename)
 558                                         return True
 559                                 else:
 560                                         # Because the server didn't let us
 561                                         self.report_unable_to_resume()
 562                                         open_mode = 'wb'
 563
 564                 data_len = data.info().get('Content-length', None)
 565                 data_len_str = self.format_bytes(data_len)
 566                 byte_counter = 0
 567                 block_size = 1024
 568                 start = time.time()
 569                 while True:
 570                         # Download and write
 571                         before = time.time()
 572                         data_block = data.read(block_size)
 573                         after = time.time()
 574                         data_block_len = len(data_block)
 575                         if data_block_len == 0:
 576                                 break
 577                         byte_counter += data_block_len
 578
 579                         # Open file just in time
 580                         if stream is None:
 581                                 try:
 582                                         (stream, filename) = sanitize_open(filename, open_mode)
 583                                         self.report_destination(filename)
 584                                         self._num_downloads += 1
 585                                 except (OSError, IOError), err:
 586                                         self.trouble('ERROR: unable to open for writing: %s' % str(err))
 587                                         return False
 588                         try:
 589                                 stream.write(data_block)
 590                         except (IOError, OSError), err:
 591                                 self.trouble('\nERROR: unable to write data: %s' % str(err))
 592                         block_size = self.best_block_size(after - before, data_block_len)
 593
 594                         # Progress message
 595                         percent_str = self.calc_percent(byte_counter, data_len)
 596                         eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
 597                         speed_str = self.calc_speed(start, time.time(), byte_counter)
 598                         self.report_progress(percent_str, data_len_str, speed_str, eta_str)
 599
 600                         # Apply rate limit
 601                         self.slow_down(start, byte_counter)
 602
 603                 self.report_finish()
 604                 if data_len is not None and str(byte_counter) != data_len:
 605                         raise ContentTooShortError(byte_counter, long(data_len))
 606                 return True
 607
 608 class InfoExtractor(object):
 609         """Information Extractor class.
 610
 611         Information extractors are the classes that, given a URL, extract
 612         information from the video (or videos) the URL refers to. This
 613         information includes the real video URL, the video title and simplified
 614         title, author and others. The information is stored in a dictionary
 615         which is then passed to the FileDownloader. The FileDownloader
 616         processes this information possibly downloading the video to the file
 617         system, among other possible outcomes. The dictionaries must include
 618         the following fields:
 619
 620         id:             Video identifier.
 621         url:            Final video URL.
 622         uploader:       Nickname of the video uploader.
 623         title:          Literal title.
 624         stitle:         Simplified title.
 625         ext:            Video filename extension.
 626         format:         Video format.
 627         player_url:     SWF Player URL (may be None).
 628
 629         The following fields are optional. Their primary purpose is to allow
 630         youtube-dl to serve as the backend for a video search function, such
 631         as the one in youtube2mp3.  They are only used when their respective
 632         forced printing functions are called:
 633
 634         thumbnail:      Full URL to a video thumbnail image.
 635         description:    One-line video description.
 636
 637         Subclasses of this one should re-define the _real_initialize() and
 638         _real_extract() methods, as well as the suitable() static method.
 639         Probably, they should also be instantiated and added to the main
 640         downloader.
 641         """
 642
 643         _ready = False
 644         _downloader = None
 645
 646         def __init__(self, downloader=None):
 647                 """Constructor. Receives an optional downloader."""
 648                 self._ready = False
 649                 self.set_downloader(downloader)
 650
 651         @staticmethod
 652         def suitable(url):
 653                 """Receives a URL and returns True if suitable for this IE."""
 654                 return False
 655
 656         def initialize(self):
 657                 """Initializes an instance (authentication, etc)."""
 658                 if not self._ready:
 659                         self._real_initialize()
 660                         self._ready = True
 661
 662         def extract(self, url):
 663                 """Extracts URL information and returns it in list of dicts."""
 664                 self.initialize()
 665                 return self._real_extract(url)
 666
 667         def set_downloader(self, downloader):
 668                 """Sets the downloader for this IE."""
 669                 self._downloader = downloader
 670
 671         def _real_initialize(self):
 672                 """Real initialization process. Redefine in subclasses."""
 673                 pass
 674
 675         def _real_extract(self, url):
 676                 """Real extraction process. Redefine in subclasses."""
 677                 pass
 678
 679 class YoutubeIE(InfoExtractor):
 680         """Information extractor for youtube.com."""
 681
 682         _VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?[\?#](?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
 683         _LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
 684         _LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
 685         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
 686         _NETRC_MACHINE = 'youtube'
 687         # Listed in order of priority for the -b option
 688         _available_formats = ['37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13', None]
 689         _video_extensions = {
 690                 '13': '3gp',
 691                 '17': 'mp4',
 692                 '18': 'mp4',
 693                 '22': 'mp4',
 694                 '37': 'mp4',
 695                 '43': 'webm',
 696                 '45': 'webm',
 697         }
 698
 699         @staticmethod
 700         def suitable(url):
 701                 return (re.match(YoutubeIE._VALID_URL, url) is not None)
 702
 703         def report_lang(self):
 704                 """Report attempt to set language."""
 705                 self._downloader.to_stdout(u'[youtube] Setting language')
 706
 707         def report_login(self):
 708                 """Report attempt to log in."""
 709                 self._downloader.to_stdout(u'[youtube] Logging in')
 710
 711         def report_age_confirmation(self):
 712                 """Report attempt to confirm age."""
 713                 self._downloader.to_stdout(u'[youtube] Confirming age')
 714
 715         def report_video_webpage_download(self, video_id):
 716                 """Report attempt to download video webpage."""
 717                 self._downloader.to_stdout(u'[youtube] %s: Downloading video webpage' % video_id)
 718
 719         def report_video_info_webpage_download(self, video_id):
 720                 """Report attempt to download video info webpage."""
 721                 self._downloader.to_stdout(u'[youtube] %s: Downloading video info webpage' % video_id)
 722
 723         def report_information_extraction(self, video_id):
 724                 """Report attempt to extract video information."""
 725                 self._downloader.to_stdout(u'[youtube] %s: Extracting video information' % video_id)
 726
 727         def report_unavailable_format(self, video_id, format):
 728                 """Report extracted video URL."""
 729                 self._downloader.to_stdout(u'[youtube] %s: Format %s not available' % (video_id, format))
 730
 731         def report_rtmp_download(self):
 732                 """Indicate the download will use the RTMP protocol."""
 733                 self._downloader.to_stdout(u'[youtube] RTMP download detected')
 734
 735         def _real_initialize(self):
 736                 if self._downloader is None:
 737                         return
 738
 739                 username = None
 740                 password = None
 741                 downloader_params = self._downloader.params
 742
 743                 # Attempt to use provided username and password or .netrc data
 744                 if downloader_params.get('username', None) is not None:
 745                         username = downloader_params['username']
 746                         password = downloader_params['password']
 747                 elif downloader_params.get('usenetrc', False):
 748                         try:
 749                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 750                                 if info is not None:
 751                                         username = info[0]
 752                                         password = info[2]
 753                                 else:
 754                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 755                         except (IOError, netrc.NetrcParseError), err:
 756                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
 757                                 return
 758
 759                 # Set language
 760                 request = urllib2.Request(self._LANG_URL, None, std_headers)
 761                 try:
 762                         self.report_lang()
 763                         urllib2.urlopen(request).read()
 764                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 765                         self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
 766                         return
 767
 768                 # No authentication to be performed
 769                 if username is None:
 770                         return
 771
 772                 # Log in
 773                 login_form = {
 774                                 'current_form': 'loginForm',
 775                                 'next':         '/',
 776                                 'action_login': 'Log In',
 777                                 'username':     username,
 778                                 'password':     password,
 779                                 }
 780                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
 781                 try:
 782                         self.report_login()
 783                         login_results = urllib2.urlopen(request).read()
 784                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
 785                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
 786                                 return
 787                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 788                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
 789                         return
 790
 791                 # Confirm age
 792                 age_form = {
 793                                 'next_url':             '/',
 794                                 'action_confirm':       'Confirm',
 795                                 }
 796                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
 797                 try:
 798                         self.report_age_confirmation()
 799                         age_results = urllib2.urlopen(request).read()
 800                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 801                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
 802                         return
 803
 804         def _real_extract(self, url):
 805                 # Extract video id from URL
 806                 mobj = re.match(self._VALID_URL, url)
 807                 if mobj is None:
 808                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 809                         return
 810                 video_id = mobj.group(2)
 811
 812                 # Downloader parameters
 813                 best_quality = False
 814                 all_formats = False
 815                 format_param = None
 816                 quality_index = 0
 817                 if self._downloader is not None:
 818                         params = self._downloader.params
 819                         format_param = params.get('format', None)
 820                         if format_param == '0':
 821                                 format_param = self._available_formats[quality_index]
 822                                 best_quality = True
 823                         elif format_param == '-1':
 824                                 format_param = self._available_formats[quality_index]
 825                                 all_formats = True
 826
 827                 while True:
 828                         # Extension
 829                         video_extension = self._video_extensions.get(format_param, 'flv')
 830
 831                         # Get video webpage
 832                         self.report_video_webpage_download(video_id)
 833                         request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en' % video_id, None, std_headers)
 834                         try:
 835                                 video_webpage = urllib2.urlopen(request).read()
 836                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 837                                 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
 838                                 return
 839
 840                         # Attempt to extract SWF player URL
 841                         mobj = re.search(r'swfConfig.*"(http://.*?watch-.*?\.swf)"', video_webpage)
 842                         if mobj is not None:
 843                                 player_url = mobj.group(1)
 844                         else:
 845                                 player_url = None
 846
 847                         # Get video info
 848                         self.report_video_info_webpage_download(video_id)
 849                         for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
 850                                 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
 851                                                    % (video_id, el_type))
 852                                 request = urllib2.Request(video_info_url, None, std_headers)
 853                                 try:
 854                                         video_info_webpage = urllib2.urlopen(request).read()
 855                                         video_info = parse_qs(video_info_webpage)
 856                                         if 'token' in video_info:
 857                                                 break
 858                                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 859                                         self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
 860                                         return
 861                         self.report_information_extraction(video_id)
 862
 863                         # "t" param
 864                         if 'token' not in video_info:
 865                                 # Attempt to see if YouTube has issued an error message
 866                                 if 'reason' not in video_info:
 867                                         self._downloader.trouble(u'ERROR: unable to extract "t" parameter for unknown reason')
 868                                         stream = open('reportme-ydl-%s.dat' % time.time(), 'wb')
 869                                         stream.write(video_info_webpage)
 870                                         stream.close()
 871                                 else:
 872                                         reason = urllib.unquote_plus(video_info['reason'][0])
 873                                         self._downloader.trouble(u'ERROR: YouTube said: %s' % reason.decode('utf-8'))
 874                                 return
 875                         token = urllib.unquote_plus(video_info['token'][0])
 876                         video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=detailpage&ps=default&gl=US&hl=en' % (video_id, token)
 877                         if format_param is not None:
 878                                 video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
 879
 880                         # Check possible RTMP download
 881                         if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
 882                                 self.report_rtmp_download()
 883                                 video_real_url = video_info['conn'][0]
 884
 885                         # uploader
 886                         if 'author' not in video_info:
 887                                 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
 888                                 return
 889                         video_uploader = urllib.unquote_plus(video_info['author'][0])
 890
 891                         # title
 892                         if 'title' not in video_info:
 893                                 self._downloader.trouble(u'ERROR: unable to extract video title')
 894                                 return
 895                         video_title = urllib.unquote_plus(video_info['title'][0])
 896                         video_title = video_title.decode('utf-8')
 897                         video_title = sanitize_title(video_title)
 898
 899                         # simplified title
 900                         simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
 901                         simple_title = simple_title.strip(ur'_')
 902
 903                         # thumbnail image
 904                         if 'thumbnail_url' not in video_info:
 905                                 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
 906                                 video_thumbnail = ''
 907                         else:   # don't panic if we can't find it
 908                                 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
 909
 910                         # description
 911                         video_description = 'No description available.'
 912                         if self._downloader.params.get('forcedescription', False):
 913                                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
 914                                 if mobj is not None:
 915                                         video_description = mobj.group(1)
 916
 917                         try:
 918                                 # Process video information
 919                                 self._downloader.process_info({
 920                                         'id':           video_id.decode('utf-8'),
 921                                         'url':          video_real_url.decode('utf-8'),
 922                                         'uploader':     video_uploader.decode('utf-8'),
 923                                         'title':        video_title,
 924                                         'stitle':       simple_title,
 925                                         'ext':          video_extension.decode('utf-8'),
 926                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
 927                                         'thumbnail':    video_thumbnail.decode('utf-8'),
 928                                         'description':  video_description.decode('utf-8'),
 929                                         'player_url':   player_url,
 930                                 })
 931
 932                                 if all_formats:
 933                                         quality_index += 1
 934                                         if quality_index == len(self._available_formats):
 935                                                 # None left to get
 936                                                 return
 937                                         else:
 938                                                 format_param = self._available_formats[quality_index]
 939                                                 continue
 940                                 return
 941
 942                         except UnavailableFormatError, err:
 943                                 if best_quality or all_formats:
 944                                         quality_index += 1
 945                                         if quality_index == len(self._available_formats):
 946                                                 # I don't ever expect this to happen
 947                                                 if not all_formats:
 948                                                         self._downloader.trouble(u'ERROR: no known formats available for video')
 949                                                 return
 950                                         else:
 951                                                 self.report_unavailable_format(video_id, format_param)
 952                                                 format_param = self._available_formats[quality_index]
 953                                                 continue
 954                                 else:
 955                                         self._downloader.trouble('ERROR: format not available for video')
 956                                         return
 957
 958
 959 class MetacafeIE(InfoExtractor):
 960         """Information Extractor for metacafe.com."""
 961
 962         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
 963         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
 964         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
 965         _youtube_ie = None
 966
 967         def __init__(self, youtube_ie, downloader=None):
 968                 InfoExtractor.__init__(self, downloader)
 969                 self._youtube_ie = youtube_ie
 970
 971         @staticmethod
 972         def suitable(url):
 973                 return (re.match(MetacafeIE._VALID_URL, url) is not None)
 974
 975         def report_disclaimer(self):
 976                 """Report disclaimer retrieval."""
 977                 self._downloader.to_stdout(u'[metacafe] Retrieving disclaimer')
 978
 979         def report_age_confirmation(self):
 980                 """Report attempt to confirm age."""
 981                 self._downloader.to_stdout(u'[metacafe] Confirming age')
 982
 983         def report_download_webpage(self, video_id):
 984                 """Report webpage download."""
 985                 self._downloader.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)
 986
 987         def report_extraction(self, video_id):
 988                 """Report information extraction."""
 989                 self._downloader.to_stdout(u'[metacafe] %s: Extracting information' % video_id)
 990
 991         def _real_initialize(self):
 992                 # Retrieve disclaimer
 993                 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
 994                 try:
 995                         self.report_disclaimer()
 996                         disclaimer = urllib2.urlopen(request).read()
 997                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 998                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
 999                         return
1000
1001                 # Confirm age
1002                 disclaimer_form = {
1003                         'filters': '0',
1004                         'submit': "Continue - I'm over 18",
1005                         }
1006                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
1007                 try:
1008                         self.report_age_confirmation()
1009                         disclaimer = urllib2.urlopen(request).read()
1010                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1011                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1012                         return
1013
1014         def _real_extract(self, url):
1015                 # Extract id and simplified title from URL
1016                 mobj = re.match(self._VALID_URL, url)
1017                 if mobj is None:
1018                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1019                         return
1020
1021                 video_id = mobj.group(1)
1022
1023                 # Check if video comes from YouTube
1024                 mobj2 = re.match(r'^yt-(.*)$', video_id)
1025                 if mobj2 is not None:
1026                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1027                         return
1028
1029                 simple_title = mobj.group(2).decode('utf-8')
1030                 video_extension = 'flv'
1031
1032                 # Retrieve video webpage to extract further information
1033                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1034                 try:
1035                         self.report_download_webpage(video_id)
1036                         webpage = urllib2.urlopen(request).read()
1037                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1038                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1039                         return
1040
1041                 # Extract URL, uploader and title from webpage
1042                 self.report_extraction(video_id)
1043                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1044                 if mobj is None:
1045                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1046                         return
1047                 mediaURL = urllib.unquote(mobj.group(1))
1048
1049                 #mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1050                 #if mobj is None:
1051                 #       self._downloader.trouble(u'ERROR: unable to extract gdaKey')
1052                 #       return
1053                 #gdaKey = mobj.group(1)
1054                 #
1055                 #video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1056
1057                 video_url = mediaURL
1058
1059                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1060                 if mobj is None:
1061                         self._downloader.trouble(u'ERROR: unable to extract title')
1062                         return
1063                 video_title = mobj.group(1).decode('utf-8')
1064                 video_title = sanitize_title(video_title)
1065
1066                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1067                 if mobj is None:
1068                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1069                         return
1070                 video_uploader = mobj.group(1)
1071
1072                 try:
1073                         # Process video information
1074                         self._downloader.process_info({
1075                                 'id':           video_id.decode('utf-8'),
1076                                 'url':          video_url.decode('utf-8'),
1077                                 'uploader':     video_uploader.decode('utf-8'),
1078                                 'title':        video_title,
1079                                 'stitle':       simple_title,
1080                                 'ext':          video_extension.decode('utf-8'),
1081                                 'format':       u'NA',
1082                                 'player_url':   None,
1083                         })
1084                 except UnavailableFormatError:
1085                         self._downloader.trouble(u'ERROR: format not available for video')
1086
1087
1088 class DailymotionIE(InfoExtractor):
1089         """Information Extractor for Dailymotion"""
1090
1091         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1092
1093         def __init__(self, downloader=None):
1094                 InfoExtractor.__init__(self, downloader)
1095
1096         @staticmethod
1097         def suitable(url):
1098                 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1099
1100         def report_download_webpage(self, video_id):
1101                 """Report webpage download."""
1102                 self._downloader.to_stdout(u'[dailymotion] %s: Downloading webpage' % video_id)
1103
1104         def report_extraction(self, video_id):
1105                 """Report information extraction."""
1106                 self._downloader.to_stdout(u'[dailymotion] %s: Extracting information' % video_id)
1107
1108         def _real_initialize(self):
1109                 return
1110
1111         def _real_extract(self, url):
1112                 # Extract id and simplified title from URL
1113                 mobj = re.match(self._VALID_URL, url)
1114                 if mobj is None:
1115                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1116                         return
1117
1118                 video_id = mobj.group(1)
1119
1120                 simple_title = mobj.group(2).decode('utf-8')
1121                 video_extension = 'flv'
1122
1123                 # Retrieve video webpage to extract further information
1124                 request = urllib2.Request(url)
1125                 try:
1126                         self.report_download_webpage(video_id)
1127                         webpage = urllib2.urlopen(request).read()
1128                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1129                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1130                         return
1131
1132                 # Extract URL, uploader and title from webpage
1133                 self.report_extraction(video_id)
1134                 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1135                 if mobj is None:
1136                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1137                         return
1138                 mediaURL = urllib.unquote(mobj.group(1))
1139
1140                 # if needed add http://www.dailymotion.com/ if relative URL
1141
1142                 video_url = mediaURL
1143
1144                 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1145                 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1146                 if mobj is None:
1147                         self._downloader.trouble(u'ERROR: unable to extract title')
1148                         return
1149                 video_title = mobj.group(1).decode('utf-8')
1150                 video_title = sanitize_title(video_title)
1151
1152                 mobj = re.search(r'(?im)<div class="dmco_html owner">.*?<a class="name" href="/.+?">(.+?)</a></div>', webpage)
1153                 if mobj is None:
1154                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1155                         return
1156                 video_uploader = mobj.group(1)
1157
1158                 try:
1159                         # Process video information
1160                         self._downloader.process_info({
1161                                 'id':           video_id.decode('utf-8'),
1162                                 'url':          video_url.decode('utf-8'),
1163                                 'uploader':     video_uploader.decode('utf-8'),
1164                                 'title':        video_title,
1165                                 'stitle':       simple_title,
1166                                 'ext':          video_extension.decode('utf-8'),
1167                                 'format':       u'NA',
1168                                 'player_url':   None,
1169                         })
1170                 except UnavailableFormatError:
1171                         self._downloader.trouble(u'ERROR: format not available for video')
1172
1173 class GoogleIE(InfoExtractor):
1174         """Information extractor for video.google.com."""
1175
1176         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1177
1178         def __init__(self, downloader=None):
1179                 InfoExtractor.__init__(self, downloader)
1180
1181         @staticmethod
1182         def suitable(url):
1183                 return (re.match(GoogleIE._VALID_URL, url) is not None)
1184
1185         def report_download_webpage(self, video_id):
1186                 """Report webpage download."""
1187                 self._downloader.to_stdout(u'[video.google] %s: Downloading webpage' % video_id)
1188
1189         def report_extraction(self, video_id):
1190                 """Report information extraction."""
1191                 self._downloader.to_stdout(u'[video.google] %s: Extracting information' % video_id)
1192
1193         def _real_initialize(self):
1194                 return
1195
1196         def _real_extract(self, url):
1197                 # Extract id from URL
1198                 mobj = re.match(self._VALID_URL, url)
1199                 if mobj is None:
1200                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1201                         return
1202
1203                 video_id = mobj.group(1)
1204
1205                 video_extension = 'mp4'
1206
1207                 # Retrieve video webpage to extract further information
1208                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1209                 try:
1210                         self.report_download_webpage(video_id)
1211                         webpage = urllib2.urlopen(request).read()
1212                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1213                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1214                         return
1215
1216                 # Extract URL, uploader, and title from webpage
1217                 self.report_extraction(video_id)
1218                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1219                 if mobj is None:
1220                         video_extension = 'flv'
1221                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1222                 if mobj is None:
1223                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1224                         return
1225                 mediaURL = urllib.unquote(mobj.group(1))
1226                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1227                 mediaURL = mediaURL.replace('\\x26', '\x26')
1228
1229                 video_url = mediaURL
1230
1231                 mobj = re.search(r'<title>(.*)</title>', webpage)
1232                 if mobj is None:
1233                         self._downloader.trouble(u'ERROR: unable to extract title')
1234                         return
1235                 video_title = mobj.group(1).decode('utf-8')
1236                 video_title = sanitize_title(video_title)
1237                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1238
1239                 # Extract video description
1240                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1241                 if mobj is None:
1242                         self._downloader.trouble(u'ERROR: unable to extract video description')
1243                         return
1244                 video_description = mobj.group(1).decode('utf-8')
1245                 if not video_description:
1246                         video_description = 'No description available.'
1247
1248                 # Extract video thumbnail
1249                 if self._downloader.params.get('forcethumbnail', False):
1250                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1251                         try:
1252                                 webpage = urllib2.urlopen(request).read()
1253                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1254                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1255                                 return
1256                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1257                         if mobj is None:
1258                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1259                                 return
1260                         video_thumbnail = mobj.group(1)
1261                 else:   # we need something to pass to process_info
1262                         video_thumbnail = ''
1263
1264
1265                 try:
1266                         # Process video information
1267                         self._downloader.process_info({
1268                                 'id':           video_id.decode('utf-8'),
1269                                 'url':          video_url.decode('utf-8'),
1270                                 'uploader':     u'NA',
1271                                 'title':        video_title,
1272                                 'stitle':       simple_title,
1273                                 'ext':          video_extension.decode('utf-8'),
1274                                 'format':       u'NA',
1275                                 'player_url':   None,
1276                         })
1277                 except UnavailableFormatError:
1278                         self._downloader.trouble(u'ERROR: format not available for video')
1279
1280
1281 class PhotobucketIE(InfoExtractor):
1282         """Information extractor for photobucket.com."""
1283
1284         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1285
1286         def __init__(self, downloader=None):
1287                 InfoExtractor.__init__(self, downloader)
1288
1289         @staticmethod
1290         def suitable(url):
1291                 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1292
1293         def report_download_webpage(self, video_id):
1294                 """Report webpage download."""
1295                 self._downloader.to_stdout(u'[photobucket] %s: Downloading webpage' % video_id)
1296
1297         def report_extraction(self, video_id):
1298                 """Report information extraction."""
1299                 self._downloader.to_stdout(u'[photobucket] %s: Extracting information' % video_id)
1300
1301         def _real_initialize(self):
1302                 return
1303
1304         def _real_extract(self, url):
1305                 # Extract id from URL
1306                 mobj = re.match(self._VALID_URL, url)
1307                 if mobj is None:
1308                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1309                         return
1310
1311                 video_id = mobj.group(1)
1312
1313                 video_extension = 'flv'
1314
1315                 # Retrieve video webpage to extract further information
1316                 request = urllib2.Request(url)
1317                 try:
1318                         self.report_download_webpage(video_id)
1319                         webpage = urllib2.urlopen(request).read()
1320                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1321                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1322                         return
1323
1324                 # Extract URL, uploader, and title from webpage
1325                 self.report_extraction(video_id)
1326                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1327                 if mobj is None:
1328                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1329                         return
1330                 mediaURL = urllib.unquote(mobj.group(1))
1331
1332                 video_url = mediaURL
1333
1334                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1335                 if mobj is None:
1336                         self._downloader.trouble(u'ERROR: unable to extract title')
1337                         return
1338                 video_title = mobj.group(1).decode('utf-8')
1339                 video_title = sanitize_title(video_title)
1340                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1341
1342                 video_uploader = mobj.group(2).decode('utf-8')
1343
1344                 try:
1345                         # Process video information
1346                         self._downloader.process_info({
1347                                 'id':           video_id.decode('utf-8'),
1348                                 'url':          video_url.decode('utf-8'),
1349                                 'uploader':     video_uploader,
1350                                 'title':        video_title,
1351                                 'stitle':       simple_title,
1352                                 'ext':          video_extension.decode('utf-8'),
1353                                 'format':       u'NA',
1354                                 'player_url':   None,
1355                         })
1356                 except UnavailableFormatError:
1357                         self._downloader.trouble(u'ERROR: format not available for video')
1358
1359
1360 class YahooIE(InfoExtractor):
1361         """Information extractor for video.yahoo.com."""
1362
1363         # _VALID_URL matches all Yahoo! Video URLs
1364         # _VPAGE_URL matches only the extractable '/watch/' URLs
1365         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1366         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1367
1368         def __init__(self, downloader=None):
1369                 InfoExtractor.__init__(self, downloader)
1370
1371         @staticmethod
1372         def suitable(url):
1373                 return (re.match(YahooIE._VALID_URL, url) is not None)
1374
1375         def report_download_webpage(self, video_id):
1376                 """Report webpage download."""
1377                 self._downloader.to_stdout(u'[video.yahoo] %s: Downloading webpage' % video_id)
1378
1379         def report_extraction(self, video_id):
1380                 """Report information extraction."""
1381                 self._downloader.to_stdout(u'[video.yahoo] %s: Extracting information' % video_id)
1382
1383         def _real_initialize(self):
1384                 return
1385
1386         def _real_extract(self, url):
1387                 # Extract ID from URL
1388                 mobj = re.match(self._VALID_URL, url)
1389                 if mobj is None:
1390                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1391                         return
1392
1393                 video_id = mobj.group(2)
1394                 video_extension = 'flv'
1395
1396                 # Rewrite valid but non-extractable URLs as
1397                 # extractable English language /watch/ URLs
1398                 if re.match(self._VPAGE_URL, url) is None:
1399                         request = urllib2.Request(url)
1400                         try:
1401                                 webpage = urllib2.urlopen(request).read()
1402                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1403                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1404                                 return
1405
1406                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1407                         if mobj is None:
1408                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1409                                 return
1410                         yahoo_id = mobj.group(1)
1411
1412                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1413                         if mobj is None:
1414                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1415                                 return
1416                         yahoo_vid = mobj.group(1)
1417
1418                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1419                         return self._real_extract(url)
1420
1421                 # Retrieve video webpage to extract further information
1422                 request = urllib2.Request(url)
1423                 try:
1424                         self.report_download_webpage(video_id)
1425                         webpage = urllib2.urlopen(request).read()
1426                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1427                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1428                         return
1429
1430                 # Extract uploader and title from webpage
1431                 self.report_extraction(video_id)
1432                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1433                 if mobj is None:
1434                         self._downloader.trouble(u'ERROR: unable to extract video title')
1435                         return
1436                 video_title = mobj.group(1).decode('utf-8')
1437                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1438
1439                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1440                 if mobj is None:
1441                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1442                         return
1443                 video_uploader = mobj.group(1).decode('utf-8')
1444
1445                 # Extract video thumbnail
1446                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1447                 if mobj is None:
1448                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1449                         return
1450                 video_thumbnail = mobj.group(1).decode('utf-8')
1451
1452                 # Extract video description
1453                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1454                 if mobj is None:
1455                         self._downloader.trouble(u'ERROR: unable to extract video description')
1456                         return
1457                 video_description = mobj.group(1).decode('utf-8')
1458                 if not video_description: video_description = 'No description available.'
1459
1460                 # Extract video height and width
1461                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1462                 if mobj is None:
1463                         self._downloader.trouble(u'ERROR: unable to extract video height')
1464                         return
1465                 yv_video_height = mobj.group(1)
1466
1467                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1468                 if mobj is None:
1469                         self._downloader.trouble(u'ERROR: unable to extract video width')
1470                         return
1471                 yv_video_width = mobj.group(1)
1472
1473                 # Retrieve video playlist to extract media URL
1474                 # I'm not completely sure what all these options are, but we
1475                 # seem to need most of them, otherwise the server sends a 401.
1476                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1477                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1478                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1479                                           '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1480                                           '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1481                 try:
1482                         self.report_download_webpage(video_id)
1483                         webpage = urllib2.urlopen(request).read()
1484                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1485                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1486                         return
1487
1488                 # Extract media URL from playlist XML
1489                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1490                 if mobj is None:
1491                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1492                         return
1493                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1494                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1495
1496                 try:
1497                         # Process video information
1498                         self._downloader.process_info({
1499                                 'id':           video_id.decode('utf-8'),
1500                                 'url':          video_url,
1501                                 'uploader':     video_uploader,
1502                                 'title':        video_title,
1503                                 'stitle':       simple_title,
1504                                 'ext':          video_extension.decode('utf-8'),
1505                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1506                                 'description':  video_description,
1507                                 'thumbnail':    video_thumbnail,
1508                                 'description':  video_description,
1509                                 'player_url':   None,
1510                         })
1511                 except UnavailableFormatError:
1512                         self._downloader.trouble(u'ERROR: format not available for video')
1513
1514
1515 class GenericIE(InfoExtractor):
1516         """Generic last-resort information extractor."""
1517
1518         def __init__(self, downloader=None):
1519                 InfoExtractor.__init__(self, downloader)
1520
1521         @staticmethod
1522         def suitable(url):
1523                 return True
1524
1525         def report_download_webpage(self, video_id):
1526                 """Report webpage download."""
1527                 self._downloader.to_stdout(u'WARNING: Falling back on generic information extractor.')
1528                 self._downloader.to_stdout(u'[generic] %s: Downloading webpage' % video_id)
1529
1530         def report_extraction(self, video_id):
1531                 """Report information extraction."""
1532                 self._downloader.to_stdout(u'[generic] %s: Extracting information' % video_id)
1533
1534         def _real_initialize(self):
1535                 return
1536
1537         def _real_extract(self, url):
1538                 video_id = url.split('/')[-1]
1539                 request = urllib2.Request(url)
1540                 try:
1541                         self.report_download_webpage(video_id)
1542                         webpage = urllib2.urlopen(request).read()
1543                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1544                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1545                         return
1546                 except ValueError, err:
1547                         # since this is the last-resort InfoExtractor, if
1548                         # this error is thrown, it'll be thrown here
1549                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1550                         return
1551
1552                 # Start with something easy: JW Player in SWFObject
1553                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1554                 if mobj is None:
1555                         # Broaden the search a little bit
1556                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1557                 if mobj is None:
1558                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1559                         return
1560
1561                 # It's possible that one of the regexes
1562                 # matched, but returned an empty group:
1563                 if mobj.group(1) is None:
1564                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1565                         return
1566
1567                 video_url = urllib.unquote(mobj.group(1))
1568                 video_id  = os.path.basename(video_url)
1569
1570                 # here's a fun little line of code for you:
1571                 video_extension = os.path.splitext(video_id)[1][1:]
1572                 video_id        = os.path.splitext(video_id)[0]
1573
1574                 # it's tempting to parse this further, but you would
1575                 # have to take into account all the variations like
1576                 #   Video Title - Site Name
1577                 #   Site Name | Video Title
1578                 #   Video Title - Tagline | Site Name
1579                 # and so on and so forth; it's just not practical
1580                 mobj = re.search(r'<title>(.*)</title>', webpage)
1581                 if mobj is None:
1582                         self._downloader.trouble(u'ERROR: unable to extract title')
1583                         return
1584                 video_title = mobj.group(1).decode('utf-8')
1585                 video_title = sanitize_title(video_title)
1586                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1587
1588                 # video uploader is domain name
1589                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1590                 if mobj is None:
1591                         self._downloader.trouble(u'ERROR: unable to extract title')
1592                         return
1593                 video_uploader = mobj.group(1).decode('utf-8')
1594
1595                 try:
1596                         # Process video information
1597                         self._downloader.process_info({
1598                                 'id':           video_id.decode('utf-8'),
1599                                 'url':          video_url.decode('utf-8'),
1600                                 'uploader':     video_uploader,
1601                                 'title':        video_title,
1602                                 'stitle':       simple_title,
1603                                 'ext':          video_extension.decode('utf-8'),
1604                                 'format':       u'NA',
1605                                 'player_url':   None,
1606                         })
1607                 except UnavailableFormatError:
1608                         self._downloader.trouble(u'ERROR: format not available for video')
1609
1610
1611 class YoutubeSearchIE(InfoExtractor):
1612         """Information Extractor for YouTube search queries."""
1613         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1614         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1615         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1616         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1617         _youtube_ie = None
1618         _max_youtube_results = 1000
1619
1620         def __init__(self, youtube_ie, downloader=None):
1621                 InfoExtractor.__init__(self, downloader)
1622                 self._youtube_ie = youtube_ie
1623
1624         @staticmethod
1625         def suitable(url):
1626                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1627
1628         def report_download_page(self, query, pagenum):
1629                 """Report attempt to download playlist page with given number."""
1630                 query = query.decode(preferredencoding())
1631                 self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1632
1633         def _real_initialize(self):
1634                 self._youtube_ie.initialize()
1635
1636         def _real_extract(self, query):
1637                 mobj = re.match(self._VALID_QUERY, query)
1638                 if mobj is None:
1639                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1640                         return
1641
1642                 prefix, query = query.split(':')
1643                 prefix = prefix[8:]
1644                 query  = query.encode('utf-8')
1645                 if prefix == '':
1646                         self._download_n_results(query, 1)
1647                         return
1648                 elif prefix == 'all':
1649                         self._download_n_results(query, self._max_youtube_results)
1650                         return
1651                 else:
1652                         try:
1653                                 n = long(prefix)
1654                                 if n <= 0:
1655                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1656                                         return
1657                                 elif n > self._max_youtube_results:
1658                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
1659                                         n = self._max_youtube_results
1660                                 self._download_n_results(query, n)
1661                                 return
1662                         except ValueError: # parsing prefix as integer fails
1663                                 self._download_n_results(query, 1)
1664                                 return
1665
1666         def _download_n_results(self, query, n):
1667                 """Downloads a specified number of results for a query"""
1668
1669                 video_ids = []
1670                 already_seen = set()
1671                 pagenum = 1
1672
1673                 while True:
1674                         self.report_download_page(query, pagenum)
1675                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1676                         request = urllib2.Request(result_url, None, std_headers)
1677                         try:
1678                                 page = urllib2.urlopen(request).read()
1679                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1680                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1681                                 return
1682
1683                         # Extract video identifiers
1684                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1685                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1686                                 if video_id not in already_seen:
1687                                         video_ids.append(video_id)
1688                                         already_seen.add(video_id)
1689                                         if len(video_ids) == n:
1690                                                 # Specified n videos reached
1691                                                 for id in video_ids:
1692                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1693                                                 return
1694
1695                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1696                                 for id in video_ids:
1697                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1698                                 return
1699
1700                         pagenum = pagenum + 1
1701
1702 class GoogleSearchIE(InfoExtractor):
1703         """Information Extractor for Google Video search queries."""
1704         _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
1705         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1706         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
1707         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
1708         _google_ie = None
1709         _max_google_results = 1000
1710
1711         def __init__(self, google_ie, downloader=None):
1712                 InfoExtractor.__init__(self, downloader)
1713                 self._google_ie = google_ie
1714
1715         @staticmethod
1716         def suitable(url):
1717                 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
1718
1719         def report_download_page(self, query, pagenum):
1720                 """Report attempt to download playlist page with given number."""
1721                 query = query.decode(preferredencoding())
1722                 self._downloader.to_stdout(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1723
1724         def _real_initialize(self):
1725                 self._google_ie.initialize()
1726
1727         def _real_extract(self, query):
1728                 mobj = re.match(self._VALID_QUERY, query)
1729                 if mobj is None:
1730                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1731                         return
1732
1733                 prefix, query = query.split(':')
1734                 prefix = prefix[8:]
1735                 query  = query.encode('utf-8')
1736                 if prefix == '':
1737                         self._download_n_results(query, 1)
1738                         return
1739                 elif prefix == 'all':
1740                         self._download_n_results(query, self._max_google_results)
1741                         return
1742                 else:
1743                         try:
1744                                 n = long(prefix)
1745                                 if n <= 0:
1746                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1747                                         return
1748                                 elif n > self._max_google_results:
1749                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)'  % (self._max_google_results, n))
1750                                         n = self._max_google_results
1751                                 self._download_n_results(query, n)
1752                                 return
1753                         except ValueError: # parsing prefix as integer fails
1754                                 self._download_n_results(query, 1)
1755                                 return
1756
1757         def _download_n_results(self, query, n):
1758                 """Downloads a specified number of results for a query"""
1759
1760                 video_ids = []
1761                 already_seen = set()
1762                 pagenum = 1
1763
1764                 while True:
1765                         self.report_download_page(query, pagenum)
1766                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1767                         request = urllib2.Request(result_url, None, std_headers)
1768                         try:
1769                                 page = urllib2.urlopen(request).read()
1770                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1771                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1772                                 return
1773
1774                         # Extract video identifiers
1775                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1776                                 video_id = mobj.group(1)
1777                                 if video_id not in already_seen:
1778                                         video_ids.append(video_id)
1779                                         already_seen.add(video_id)
1780                                         if len(video_ids) == n:
1781                                                 # Specified n videos reached
1782                                                 for id in video_ids:
1783                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1784                                                 return
1785
1786                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1787                                 for id in video_ids:
1788                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1789                                 return
1790
1791                         pagenum = pagenum + 1
1792
1793 class YahooSearchIE(InfoExtractor):
1794         """Information Extractor for Yahoo! Video search queries."""
1795         _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
1796         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1797         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1798         _MORE_PAGES_INDICATOR = r'\s*Next'
1799         _yahoo_ie = None
1800         _max_yahoo_results = 1000
1801
1802         def __init__(self, yahoo_ie, downloader=None):
1803                 InfoExtractor.__init__(self, downloader)
1804                 self._yahoo_ie = yahoo_ie
1805
1806         @staticmethod
1807         def suitable(url):
1808                 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
1809
1810         def report_download_page(self, query, pagenum):
1811                 """Report attempt to download playlist page with given number."""
1812                 query = query.decode(preferredencoding())
1813                 self._downloader.to_stdout(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1814
1815         def _real_initialize(self):
1816                 self._yahoo_ie.initialize()
1817
1818         def _real_extract(self, query):
1819                 mobj = re.match(self._VALID_QUERY, query)
1820                 if mobj is None:
1821                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1822                         return
1823
1824                 prefix, query = query.split(':')
1825                 prefix = prefix[8:]
1826                 query  = query.encode('utf-8')
1827                 if prefix == '':
1828                         self._download_n_results(query, 1)
1829                         return
1830                 elif prefix == 'all':
1831                         self._download_n_results(query, self._max_yahoo_results)
1832                         return
1833                 else:
1834                         try:
1835                                 n = long(prefix)
1836                                 if n <= 0:
1837                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1838                                         return
1839                                 elif n > self._max_yahoo_results:
1840                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)'  % (self._max_yahoo_results, n))
1841                                         n = self._max_yahoo_results
1842                                 self._download_n_results(query, n)
1843                                 return
1844                         except ValueError: # parsing prefix as integer fails
1845                                 self._download_n_results(query, 1)
1846                                 return
1847
1848         def _download_n_results(self, query, n):
1849                 """Downloads a specified number of results for a query"""
1850
1851                 video_ids = []
1852                 already_seen = set()
1853                 pagenum = 1
1854
1855                 while True:
1856                         self.report_download_page(query, pagenum)
1857                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1858                         request = urllib2.Request(result_url, None, std_headers)
1859                         try:
1860                                 page = urllib2.urlopen(request).read()
1861                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1862                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1863                                 return
1864
1865                         # Extract video identifiers
1866                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1867                                 video_id = mobj.group(1)
1868                                 if video_id not in already_seen:
1869                                         video_ids.append(video_id)
1870                                         already_seen.add(video_id)
1871                                         if len(video_ids) == n:
1872                                                 # Specified n videos reached
1873                                                 for id in video_ids:
1874                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1875                                                 return
1876
1877                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1878                                 for id in video_ids:
1879                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1880                                 return
1881
1882                         pagenum = pagenum + 1
1883
1884 class YoutubePlaylistIE(InfoExtractor):
1885         """Information Extractor for YouTube playlists."""
1886
1887         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/)([^&]+).*'
1888         _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
1889         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
1890         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1891         _youtube_ie = None
1892
1893         def __init__(self, youtube_ie, downloader=None):
1894                 InfoExtractor.__init__(self, downloader)
1895                 self._youtube_ie = youtube_ie
1896
1897         @staticmethod
1898         def suitable(url):
1899                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
1900
1901         def report_download_page(self, playlist_id, pagenum):
1902                 """Report attempt to download playlist page with given number."""
1903                 self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1904
1905         def _real_initialize(self):
1906                 self._youtube_ie.initialize()
1907
1908         def _real_extract(self, url):
1909                 # Extract playlist id
1910                 mobj = re.match(self._VALID_URL, url)
1911                 if mobj is None:
1912                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1913                         return
1914
1915                 # Download playlist pages
1916                 playlist_id = mobj.group(1)
1917                 video_ids = []
1918                 pagenum = 1
1919
1920                 while True:
1921                         self.report_download_page(playlist_id, pagenum)
1922                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
1923                         try:
1924                                 page = urllib2.urlopen(request).read()
1925                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1926                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1927                                 return
1928
1929                         # Extract video identifiers
1930                         ids_in_page = []
1931                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1932                                 if mobj.group(1) not in ids_in_page:
1933                                         ids_in_page.append(mobj.group(1))
1934                         video_ids.extend(ids_in_page)
1935
1936                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1937                                 break
1938                         pagenum = pagenum + 1
1939
1940                 for id in video_ids:
1941                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1942                 return
1943
1944 class YoutubeUserIE(InfoExtractor):
1945         """Information Extractor for YouTube users."""
1946
1947         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
1948         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1949         _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
1950         _youtube_ie = None
1951
1952         def __init__(self, youtube_ie, downloader=None):
1953                 InfoExtractor.__init__(self, downloader)
1954                 self._youtube_ie = youtube_ie
1955
1956         @staticmethod
1957         def suitable(url):
1958                 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
1959
1960         def report_download_page(self, username):
1961                 """Report attempt to download user page."""
1962                 self._downloader.to_stdout(u'[youtube] user %s: Downloading page ' % (username))
1963
1964         def _real_initialize(self):
1965                 self._youtube_ie.initialize()
1966
1967         def _real_extract(self, url):
1968                 # Extract username
1969                 mobj = re.match(self._VALID_URL, url)
1970                 if mobj is None:
1971                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1972                         return
1973
1974                 # Download user page
1975                 username = mobj.group(1)
1976                 video_ids = []
1977                 pagenum = 1
1978
1979                 self.report_download_page(username)
1980                 request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
1981                 try:
1982                         page = urllib2.urlopen(request).read()
1983                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1984                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1985                         return
1986
1987                 # Extract video identifiers
1988                 ids_in_page = []
1989
1990                 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1991                         if mobj.group(1) not in ids_in_page:
1992                                 ids_in_page.append(mobj.group(1))
1993                 video_ids.extend(ids_in_page)
1994
1995                 for id in video_ids:
1996                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1997                 return
1998
class PostProcessor(object):
        """Base class for post-download processing steps.

        A downloader keeps a chain of PostProcessor objects, registered
        through its add_post_processor() method. After a successful
        download it calls run() on each of them in turn: the first one
        receives an initial argument and every following one receives
        whatever the previous one returned.

        The chain ends when one of the objects returns None or when
        there are no more objects left.

        As with InfoExtractor objects, registration is mutual: the
        processor also keeps a reference to its downloader.
        """

        # Downloader this processor is attached to, if any
        _downloader = None

        def __init__(self, downloader=None):
                """Create the processor, optionally attached to a downloader."""
                self._downloader = downloader

        def set_downloader(self, downloader):
                """Attach this processor to the given downloader."""
                self._downloader = downloader

        def run(self, information):
                """Process a finished download.

                "information" is a dictionary like those built by the
                InfoExtractors, with one additional field, "filepath",
                pointing to the downloaded file.

                Returning None stops the postprocessing chain. Returning
                an information dictionary (possibly the received one with
                some fields changed) passes it on to the next object in
                the chain.

                A PostProcessingError may also be raised; the calling
                downloader takes it into account.
                """
                # The base implementation hands the dictionary on untouched
                return information
2044
2045 ### MAIN PROGRAM ###
if __name__ == '__main__':
        try:
                # Modules needed only when running the main program
                import getpass
                import optparse

                def update_self(downloader, filename):
                        """Replace the program file with the latest stable version from bitbucket.org.

                        downloader is only used to print progress messages; filename is
                        the path of the file to overwrite. Exits the program when the
                        file is not writable.
                        """
                        if not os.access (filename, os.W_OK):
                                sys.exit('ERROR: no write permissions on %s' % filename)

                        downloader.to_stdout('Updating to latest stable version...')
                        latest_url = 'http://bitbucket.org/rg3/youtube-dl/raw/tip/LATEST_VERSION'
                        latest_version = urllib.urlopen(latest_url).read().strip()
                        prog_url = 'http://bitbucket.org/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
                        newcontent = urllib.urlopen(prog_url).read()
                        # Binary mode writes the downloaded bytes unchanged, and the
                        # finally clause closes the file even if the write fails.
                        stream = open(filename, 'wb')
                        try:
                                stream.write(newcontent)
                        finally:
                                stream.close()
                        downloader.to_stdout('Updated to version %s' % latest_version)

                # General configuration
                # A single opener carries both handlers; installing two openers in
                # a row would only keep the second one.
                urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), urllib2.HTTPCookieProcessor()))
                socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

                # Parse command line
                parser = optparse.OptionParser(
                        usage='Usage: %prog [options] url...',
                        version='2010.06.06',
                        conflict_handler='resolve',
                )

                parser.add_option('-h', '--help',
                                action='help', help='print this help text and exit')
                parser.add_option('-v', '--version',
                                action='version', help='print program version and exit')
                parser.add_option('-U', '--update',
                                action='store_true', dest='update_self', help='update this program to latest stable version')
                parser.add_option('-i', '--ignore-errors',
                                action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
                parser.add_option('-r', '--rate-limit',
                                dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
                parser.add_option('-R', '--retries',
                                dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)

                authentication = optparse.OptionGroup(parser, 'Authentication Options')
                authentication.add_option('-u', '--username',
                                dest='username', metavar='USERNAME', help='account username')
                authentication.add_option('-p', '--password',
                                dest='password', metavar='PASSWORD', help='account password')
                authentication.add_option('-n', '--netrc',
                                action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
                parser.add_option_group(authentication)

                video_format = optparse.OptionGroup(parser, 'Video Format Options')
                video_format.add_option('-f', '--format',
                                action='store', dest='format', metavar='FORMAT', help='video format code')
                video_format.add_option('-b', '--best-quality',
                                action='store_const', dest='format', help='download the best quality video possible', const='0')
                video_format.add_option('-m', '--mobile-version',
                                action='store_const', dest='format', help='alias for -f 17', const='17')
                video_format.add_option('-d', '--high-def',
                                action='store_const', dest='format', help='alias for -f 22', const='22')
                video_format.add_option('--all-formats',
                                action='store_const', dest='format', help='download all available video formats', const='-1')
                parser.add_option_group(video_format)

                verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
                verbosity.add_option('-q', '--quiet',
                                action='store_true', dest='quiet', help='activates quiet mode', default=False)
                verbosity.add_option('-s', '--simulate',
                                action='store_true', dest='simulate', help='do not download video', default=False)
                verbosity.add_option('-g', '--get-url',
                                action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
                verbosity.add_option('-e', '--get-title',
                                action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
                verbosity.add_option('--get-thumbnail',
                                action='store_true', dest='getthumbnail', help='simulate, quiet but print thumbnail URL', default=False)
                verbosity.add_option('--get-description',
                                action='store_true', dest='getdescription', help='simulate, quiet but print video description', default=False)
                verbosity.add_option('--no-progress',
                                action='store_true', dest='noprogress', help='do not print progress bar', default=False)
                parser.add_option_group(verbosity)

                filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
                filesystem.add_option('-t', '--title',
                                action='store_true', dest='usetitle', help='use title in file name', default=False)
                filesystem.add_option('-l', '--literal',
                                action='store_true', dest='useliteral', help='use literal title in file name', default=False)
                filesystem.add_option('-o', '--output',
                                dest='outtmpl', metavar='TEMPLATE', help='output filename template')
                filesystem.add_option('-a', '--batch-file',
                                dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
                filesystem.add_option('-w', '--no-overwrites',
                                action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
                filesystem.add_option('-c', '--continue',
                                action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
                parser.add_option_group(filesystem)

                (opts, args) = parser.parse_args()

                # Batch file verification
                batchurls = []
                if opts.batchfile is not None:
                        try:
                                if opts.batchfile == '-':
                                        batchfd = sys.stdin
                                else:
                                        batchfd = open(opts.batchfile, 'r')
                                try:
                                        batchurls = batchfd.readlines()
                                finally:
                                        # Close the file we opened, but leave stdin alone
                                        if batchfd is not sys.stdin:
                                                batchfd.close()
                                batchurls = [x.strip() for x in batchurls]
                                batchurls = [x for x in batchurls if len(x) > 0]
                        except IOError:
                                sys.exit(u'ERROR: batch file could not be read')
                # URLs from the batch file come before the ones on the command line
                all_urls = batchurls + args

                # Conflicting, missing and erroneous options
                if opts.usenetrc and (opts.username is not None or opts.password is not None):
                        parser.error(u'using .netrc conflicts with giving username/password')
                if opts.password is not None and opts.username is None:
                        parser.error(u'account username missing')
                if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
                        parser.error(u'using output template conflicts with using title or literal title')
                if opts.usetitle and opts.useliteral:
                        parser.error(u'using title conflicts with using literal title')
                if opts.username is not None and opts.password is None:
                        opts.password = getpass.getpass(u'Type account password and press return:')
                if opts.ratelimit is not None:
                        numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
                        if numeric_limit is None:
                                parser.error(u'invalid rate limit specified')
                        opts.ratelimit = numeric_limit
                if opts.retries is not None:
                        try:
                                opts.retries = long(opts.retries)
                        except (TypeError, ValueError):
                                parser.error(u'invalid retry count specified')

                # Information extractors
                youtube_ie = YoutubeIE()
                metacafe_ie = MetacafeIE(youtube_ie)
                dailymotion_ie = DailymotionIE()
                youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
                youtube_user_ie = YoutubeUserIE(youtube_ie)
                youtube_search_ie = YoutubeSearchIE(youtube_ie)
                google_ie = GoogleIE()
                google_search_ie = GoogleSearchIE(google_ie)
                photobucket_ie = PhotobucketIE()
                yahoo_ie = YahooIE()
                yahoo_search_ie = YahooSearchIE(yahoo_ie)
                generic_ie = GenericIE()

                # File downloader
                fd = FileDownloader({
                        'usenetrc': opts.usenetrc,
                        'username': opts.username,
                        'password': opts.password,
                        # Any of the "get" options implies both quiet and simulate
                        'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
                        'forceurl': opts.geturl,
                        'forcetitle': opts.gettitle,
                        'forcethumbnail': opts.getthumbnail,
                        'forcedescription': opts.getdescription,
                        'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
                        'format': opts.format,
                        # First alternative that applies wins: explicit template, then
                        # the all-formats variants, then title/literal, then plain id.
                        'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
                                or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
                                or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
                                or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
                                or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
                                or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
                                or u'%(id)s.%(ext)s'),
                        'ignoreerrors': opts.ignoreerrors,
                        'ratelimit': opts.ratelimit,
                        'nooverwrites': opts.nooverwrites,
                        'retries': opts.retries,
                        'continuedl': opts.continue_dl,
                        'noprogress': opts.noprogress,
                        })
                fd.add_info_extractor(youtube_search_ie)
                fd.add_info_extractor(youtube_pl_ie)
                fd.add_info_extractor(youtube_user_ie)
                fd.add_info_extractor(metacafe_ie)
                fd.add_info_extractor(dailymotion_ie)
                fd.add_info_extractor(youtube_ie)
                fd.add_info_extractor(google_ie)
                fd.add_info_extractor(google_search_ie)
                fd.add_info_extractor(photobucket_ie)
                fd.add_info_extractor(yahoo_ie)
                fd.add_info_extractor(yahoo_search_ie)

                # This must come last since it's the
                # fallback if none of the others work
                fd.add_info_extractor(generic_ie)

                # Update version
                if opts.update_self:
                        update_self(fd, sys.argv[0])

                # Maybe do nothing
                if len(all_urls) < 1:
                        if not opts.update_self:
                                parser.error(u'you must provide at least one URL')
                        else:
                                sys.exit()
                retcode = fd.download(all_urls)
                sys.exit(retcode)

        except DownloadError:
                sys.exit(1)
        except SameFileError:
                sys.exit(u'ERROR: fixed output name but more than one file to download')
        except KeyboardInterrupt:
                sys.exit(u'\nERROR: Interrupted by user')