Give preference to format 34 before format 5 in quality list
[youtube-dl] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # License: Public domain code
7 import htmlentitydefs
8 import httplib
9 import locale
10 import math
11 import netrc
12 import os
13 import os.path
14 import re
15 import socket
16 import string
17 import subprocess
18 import sys
19 import time
20 import urllib
21 import urllib2
22
23 # parse_qs was moved from the cgi module to the urlparse module recently.
24 try:
25         from urlparse import parse_qs
26 except ImportError:
27         from cgi import parse_qs
28
# Headers sent with every HTTP request. The User-Agent imitates a real
# browser (Firefox 3.6) -- presumably so sites do not serve alternate
# content to unknown clients; TODO confirm against the target sites.
std_headers = {
	'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2) Gecko/20100115 Firefox/3.6',
	'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
	'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
	'Accept-Language': 'en-us,en;q=0.5',
}
35
# Characters kept by "simplified" titles: ASCII letters and digits, as unicode.
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
37
38 def preferredencoding():
39         """Get preferred encoding.
40
41         Returns the best encoding scheme for the system, based on
42         locale.getpreferredencoding() and some further tweaks.
43         """
44         def yield_preferredencoding():
45                 try:
46                         pref = locale.getpreferredencoding()
47                         u'TEST'.encode(pref)
48                 except:
49                         pref = 'UTF-8'
50                 while True:
51                         yield pref
52         return yield_preferredencoding().next()
53
54 def htmlentity_transform(matchobj):
55         """Transforms an HTML entity to a Unicode character.
56         
57         This function receives a match object and is intended to be used with
58         the re.sub() function.
59         """
60         entity = matchobj.group(1)
61
62         # Known non-numeric HTML entity
63         if entity in htmlentitydefs.name2codepoint:
64                 return unichr(htmlentitydefs.name2codepoint[entity])
65
66         # Unicode character
67         mobj = re.match(ur'(?u)#(x?\d+)', entity)
68         if mobj is not None:
69                 numstr = mobj.group(1)
70                 if numstr.startswith(u'x'):
71                         base = 16
72                         numstr = u'0%s' % numstr
73                 else:
74                         base = 10
75                 return unichr(long(numstr, base))
76
77         # Unknown entity in name, return its literal representation
78         return (u'&%s;' % entity)
79
80 def sanitize_title(utitle):
81         """Sanitizes a video title so it could be used as part of a filename."""
82         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
83         return utitle.replace(unicode(os.sep), u'%')
84
85 def sanitize_open(filename, open_mode):
86         """Try to open the given filename, and slightly tweak it if this fails.
87
88         Attempts to open the given filename. If this fails, it tries to change
89         the filename slightly, step by step, until it's either able to open it
90         or it fails and raises a final exception, like the standard open()
91         function.
92
93         It returns the tuple (stream, definitive_file_name).
94         """
95         try:
96                 stream = open(filename, open_mode)
97                 return (stream, filename)
98         except (IOError, OSError), err:
99                 # In case of error, try to remove win32 forbidden chars
100                 filename = re.sub(ur'[<>:"\|\?\*]', u'#', filename)
101
102                 # An exception here should be caught in the caller
103                 stream = open(filename, open_mode)
104                 return (stream, filename)
105
106
class DownloadError(Exception):
	"""Raised when downloading fails.

	FileDownloader objects throw this exception when they are not
	configured to continue on errors; it carries the relevant error
	message.
	"""
	pass
115
class SameFileError(Exception):
	"""Raised when several downloads would collide on one output file.

	FileDownloader objects throw this when they detect that multiple
	videos would be written to the same file on disk.
	"""
	pass
123
class PostProcessingError(Exception):
	"""Raised when a postprocessing step fails.

	A PostProcessor's .run() method may raise this to signal an error in
	the postprocessing task.
	"""
	pass
131
class UnavailableFormatError(Exception):
	"""Raised when a video is requested in a format it does not offer."""
	pass
139
class ContentTooShortError(Exception):
	"""Raised when a download delivers fewer bytes than announced.

	FileDownloader objects raise this when the file they downloaded is
	smaller than the size the server advertised first, which usually
	indicates an interrupted connection.
	"""
	# Byte counts: what actually arrived vs. what the server announced.
	downloaded = None
	expected = None

	def __init__(self, downloaded, expected):
		self.expected = expected
		self.downloaded = downloaded
154
class FileDownloader(object):
	"""File Downloader class.

	File downloader objects are the ones responsible of downloading the
	actual video file and writing it to disk if the user has requested
	it, among some other tasks. In most cases there should be one per
	program. As, given a video URL, the downloader doesn't know how to
	extract all the needed information, task that InfoExtractors do, it
	has to pass the URL to one of them.

	For this, file downloader objects have a method that allows
	InfoExtractors to be registered in a given order. When it is passed
	a URL, the file downloader handles it to the first InfoExtractor it
	finds that reports being able to handle it. The InfoExtractor extracts
	all the information about the video or videos the URL refers to, and
	asks the FileDownloader to process the video information, possibly
	downloading the video.

	File downloaders accept a lot of parameters. In order not to saturate
	the object constructor with arguments, it receives a dictionary of
	options instead. These options are available through the params
	attribute for the InfoExtractors to use. The FileDownloader also
	registers itself as the downloader in charge for the InfoExtractors
	that are added to it, so this is a "mutual registration".

	Available options:

	username:       Username for authentication purposes.
	password:       Password for authentication purposes.
	usenetrc:       Use netrc for authentication instead.
	quiet:          Do not print messages to stdout.
	forceurl:       Force printing final URL.
	forcetitle:     Force printing title.
	simulate:       Do not download the video files.
	format:         Video format code.
	outtmpl:        Template for output names.
	ignoreerrors:   Do not stop on download errors.
	ratelimit:      Download speed limit, in bytes/sec.
	nooverwrites:   Prevent overwriting files.
	continuedl:     Try to continue downloads if possible.
	"""

	params = None			# Options dictionary, also read by InfoExtractors
	_ies = []			# Registered InfoExtractor objects
	_pps = []			# Registered PostProcessor objects
	_download_retcode = None	# Returned by download(): 0 ok, 1 after any error

	def __init__(self, params):
		"""Create a FileDownloader object with the given options."""
		self._ies = []
		self._pps = []
		self._download_retcode = 0
		self.params = params

	@staticmethod
	def pmkdir(filename):
		"""Create directory components in filename. Similar to Unix "mkdir -p"."""
		components = filename.split(os.sep)
		# Build each ancestor path, shortest first, and create the missing ones.
		aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
		aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
		for dir in aggregate:
			if not os.path.exists(dir):
				os.mkdir(dir)

	@staticmethod
	def format_bytes(bytes):
		"""Format a byte count as a human-readable string (e.g. '1.00k')."""
		if bytes is None:
			return 'N/A'
		if type(bytes) is str:
			bytes = float(bytes)
		if bytes == 0.0:
			exponent = 0
		else:
			exponent = long(math.log(bytes, 1024.0))
		suffix = 'bkMGTPEZY'[exponent]
		converted = float(bytes) / float(1024**exponent)
		return '%.2f%s' % (converted, suffix)

	@staticmethod
	def calc_percent(byte_counter, data_len):
		"""Return download progress as a fixed-width percentage string."""
		if data_len is None:
			return '---.-%'
		return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))

	@staticmethod
	def calc_eta(start, now, total, current):
		"""Estimate the remaining time as 'MM:SS'; '--:--' when unknown."""
		if total is None:
			return '--:--'
		dif = now - start
		if current == 0 or dif < 0.001: # One millisecond
			return '--:--'
		rate = float(current) / dif
		eta = long((float(total) - float(current)) / rate)
		(eta_mins, eta_secs) = divmod(eta, 60)
		if eta_mins > 99:
			return '--:--'
		return '%02d:%02d' % (eta_mins, eta_secs)

	@staticmethod
	def calc_speed(start, now, bytes):
		"""Return the average download speed as a fixed-width string."""
		dif = now - start
		if bytes == 0 or dif < 0.001: # One millisecond
			return '%10s' % '---b/s'
		return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

	@staticmethod
	def best_block_size(elapsed_time, bytes):
		"""Choose the next read size based on how fast the last block arrived."""
		new_min = max(bytes / 2.0, 1.0)
		new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
		if elapsed_time < 0.001:
			return long(new_max)
		rate = bytes / elapsed_time
		if rate > new_max:
			return long(new_max)
		if rate < new_min:
			return long(new_min)
		return long(rate)

	@staticmethod
	def parse_bytes(bytestr):
		"""Parse a string indicating a byte quantity into a long integer."""
		matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
		if matchobj is None:
			return None
		number = float(matchobj.group(1))
		# An empty suffix indexes position 0 ('b'), i.e. a multiplier of 1.
		multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
		return long(round(number * multiplier))

	@staticmethod
	def verify_url(url):
		"""Verify a URL is valid and data could be downloaded. Return real data URL."""
		request = urllib2.Request(url, None, std_headers)
		data = urllib2.urlopen(request)
		# Reading a single byte forces the connection, surfacing errors early.
		data.read(1)
		url = data.geturl()
		data.close()
		return url

	def add_info_extractor(self, ie):
		"""Add an InfoExtractor object to the end of the list."""
		self._ies.append(ie)
		ie.set_downloader(self)

	def add_post_processor(self, pp):
		"""Add a PostProcessor object to the end of the chain."""
		self._pps.append(pp)
		pp.set_downloader(self)

	def to_stdout(self, message, skip_eol=False, ignore_encoding_errors=False):
		"""Print message to stdout if not in quiet mode."""
		try:
			if not self.params.get('quiet', False):
				# The trailing comma suppresses print's own newline; one is
				# embedded in the message itself unless skip_eol is set.
				print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(preferredencoding()),
			sys.stdout.flush()
		except (UnicodeEncodeError), err:
			if not ignore_encoding_errors:
				raise

	def to_stderr(self, message):
		"""Print message to stderr."""
		print >>sys.stderr, message.encode(preferredencoding())

	def fixed_template(self):
		"""Checks if the output template is fixed (contains no %(...)s fields)."""
		return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)

	def trouble(self, message=None):
		"""Determine action to take when a download problem appears.

		Depending on if the downloader has been configured to ignore
		download errors or not, this method may throw an exception or
		not when errors are found, after printing the message.
		"""
		if message is not None:
			self.to_stderr(message)
		if not self.params.get('ignoreerrors', False):
			raise DownloadError(message)
		self._download_retcode = 1

	def slow_down(self, start_time, byte_counter):
		"""Sleep if the download speed is over the rate limit."""
		rate_limit = self.params.get('ratelimit', None)
		if rate_limit is None or byte_counter == 0:
			return
		now = time.time()
		elapsed = now - start_time
		if elapsed <= 0.0:
			return
		speed = float(byte_counter) / elapsed
		if speed > rate_limit:
			# Sleep just long enough to bring the average back under the limit.
			time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)

	def report_destination(self, filename):
		"""Report destination filename."""
		self.to_stdout(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)

	def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
		"""Report download progress."""
		self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
				(percent_str, data_len_str, speed_str, eta_str), skip_eol=True)

	def report_resuming_byte(self, resume_len):
		"""Report attempt to resume at given byte."""
		self.to_stdout(u'[download] Resuming download at byte %s' % resume_len)

	def report_file_already_downloaded(self, file_name):
		"""Report file has already been fully downloaded."""
		try:
			self.to_stdout(u'[download] %s has already been downloaded' % file_name)
		except (UnicodeEncodeError), err:
			# Fall back to a filename-free message if the name cannot be encoded.
			self.to_stdout(u'[download] The file has already been downloaded')

	def report_unable_to_resume(self):
		"""Report it was impossible to resume download."""
		self.to_stdout(u'[download] Unable to resume')

	def report_finish(self):
		"""Report download finished."""
		self.to_stdout(u'')

	def process_info(self, info_dict):
		"""Process a single dictionary returned by an InfoExtractor."""
		# Do nothing else if in simulate mode
		if self.params.get('simulate', False):
			# Verify URL if it's an HTTP one
			if info_dict['url'].startswith('http'):
				try:
					info_dict['url'] = self.verify_url(info_dict['url'].encode('utf-8')).decode('utf-8')
				except (OSError, IOError, urllib2.URLError, httplib.HTTPException, socket.error), err:
					raise UnavailableFormatError

			# Forced printings
			if self.params.get('forcetitle', False):
				print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forceurl', False):
				print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')

			return

		try:
			template_dict = dict(info_dict)
			template_dict['epoch'] = unicode(long(time.time()))
			filename = self.params['outtmpl'] % template_dict
		except (ValueError, KeyError), err:
			self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
		# NOTE(review): if the template failed above and 'ignoreerrors' is set,
		# trouble() returns and 'filename' is unbound here -- NameError.
		if self.params.get('nooverwrites', False) and os.path.exists(filename):
			self.to_stderr(u'WARNING: file exists: %s; skipping' % filename)
			return

		try:
			self.pmkdir(filename)
		except (OSError, IOError), err:
			self.trouble('ERROR: unable to create directories: %s' % str(err))
			return

		try:
			success = self._do_download(filename, info_dict['url'].encode('utf-8'))
		except (OSError, IOError), err:
			# Treated as "this format could not be fetched" rather than fatal.
			raise UnavailableFormatError
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self.trouble('ERROR: unable to download video data: %s' % str(err))
			return
		except (ContentTooShortError, ), err:
			self.trouble('ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
			return

		if success:
			try:
				self.post_process(filename, info_dict)
			except (PostProcessingError), err:
				self.trouble('ERROR: postprocessing: %s' % str(err))
				return

	def download(self, url_list):
		"""Download a given list of URLs."""
		if len(url_list) > 1 and self.fixed_template():
			raise SameFileError(self.params['outtmpl'])

		for url in url_list:
			suitable_found = False
			for ie in self._ies:
				# Go to next InfoExtractor if not suitable
				if not ie.suitable(url):
					continue

				# Suitable InfoExtractor found
				suitable_found = True

				# Extract information from URL and process it
				ie.extract(url)

				# Suitable InfoExtractor had been found; go to next URL
				break

			if not suitable_found:
				self.trouble('ERROR: no suitable InfoExtractor: %s' % url)

		return self._download_retcode

	def post_process(self, filename, ie_info):
		"""Run the postprocessing chain on the given file."""
		info = dict(ie_info)
		info['filepath'] = filename
		for pp in self._pps:
			info = pp.run(info)
			# A postprocessor returning None stops the chain.
			if info is None:
				break

	def _download_with_rtmpdump(self, filename, url):
		"""Download an RTMP URL with the external rtmpdump tool."""
		self.report_destination(filename)

		# Check for rtmpdump first
		try:
			subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
		except (OSError, IOError):
			self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
			return False

		# Download using rtmpdump. rtmpdump returns exit code 2 when
		# the connection was interrupted and resuming appears to be
		# possible. This is part of rtmpdump's normal usage, AFAIK.
		basic_args = ['rtmpdump', '-q', '-r', url, '-o', filename]
		retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
		while retval == 2 or retval == 1:
			self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename), skip_eol=True)
			time.sleep(2.0) # This seems to be needed
			retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
		if retval == 0:
			self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename))
			return True
		else:
			self.trouble('ERROR: rtmpdump exited with code %d' % retval)
			return False

	def _do_download(self, filename, url):
		"""Download url to filename over HTTP (or hand off to rtmpdump).

		Returns True on success; may raise ContentTooShortError if the
		server served fewer bytes than announced.
		"""
		# Attempt to download using rtmpdump
		if url.startswith('rtmp'):
			return self._download_with_rtmpdump(filename, url)

		stream = None
		open_mode = 'wb'
		basic_request = urllib2.Request(url, None, std_headers)
		request = urllib2.Request(url, None, std_headers)

		# Establish possible resume length
		if os.path.isfile(filename):
			resume_len = os.path.getsize(filename)
		else:
			resume_len = 0

		# Request parameters in case of being able to resume
		if self.params.get('continuedl', False) and resume_len != 0:
			self.report_resuming_byte(resume_len)
			request.add_header('Range','bytes=%d-' % resume_len)
			open_mode = 'ab'

		# Establish connection
		try:
			data = urllib2.urlopen(request)
		except (urllib2.HTTPError, ), err:
			if err.code != 416: #  416 is 'Requested range not satisfiable'
				raise
			# Unable to resume; retry without the Range header to find out why.
			data = urllib2.urlopen(basic_request)
			content_length = data.info()['Content-Length']

			if content_length is not None and long(content_length) == resume_len:
				# Because the file had already been fully downloaded
				self.report_file_already_downloaded(filename)
				return True
			else:
				# Because the server didn't let us
				self.report_unable_to_resume()
				open_mode = 'wb'

		data_len = data.info().get('Content-length', None)
		data_len_str = self.format_bytes(data_len)
		byte_counter = 0
		block_size = 1024
		start = time.time()
		while True:
			# Download and write
			before = time.time()
			data_block = data.read(block_size)
			after = time.time()
			data_block_len = len(data_block)
			if data_block_len == 0:
				break
			byte_counter += data_block_len

			# Open file just in time, so a failed connection leaves no file behind
			if stream is None:
				try:
					(stream, filename) = sanitize_open(filename, open_mode)
					self.report_destination(filename)
				except (OSError, IOError), err:
					self.trouble('ERROR: unable to open for writing: %s' % str(err))
					return False
			stream.write(data_block)
			block_size = self.best_block_size(after - before, data_block_len)

			# Progress message
			percent_str = self.calc_percent(byte_counter, data_len)
			eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
			speed_str = self.calc_speed(start, time.time(), byte_counter)
			self.report_progress(percent_str, data_len_str, speed_str, eta_str)

			# Apply rate limit
			self.slow_down(start, byte_counter)

		self.report_finish()
		# data_len is the raw header string, so compare string representations.
		if data_len is not None and str(byte_counter) != data_len:
			raise ContentTooShortError(byte_counter, long(data_len))
		return True
569
class InfoExtractor(object):
	"""Information Extractor class.

	Given a URL, an information extractor produces the data describing the
	video (or videos) that URL refers to: the real video URL, the title, a
	simplified title, the uploader and so on. The data is collected in a
	dictionary and handed to the FileDownloader, which may then download
	the video to disk, among other outcomes. Each dictionary must include
	the following fields:

	id:             Video identifier.
	url:            Final video URL.
	uploader:       Nickname of the video uploader.
	title:          Literal title.
	stitle:         Simplified title.
	ext:            Video filename extension.

	Subclasses should re-define the _real_initialize() and _real_extract()
	methods, as well as the suitable() static method. They are typically
	instantiated and registered with the main downloader.
	"""

	# Whether _real_initialize() has already run for this instance.
	_ready = False
	# The FileDownloader in charge of this extractor, if any.
	_downloader = None

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		self._ready = False
		self.set_downloader(downloader)

	@staticmethod
	def suitable(url):
		"""Receives a URL and returns True if suitable for this IE."""
		return False

	def initialize(self):
		"""Initializes an instance (authentication, etc), at most once."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Extracts URL information and returns it in list of dicts."""
		self.initialize()
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Sets the downloader for this IE."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass
630
631 class YoutubeIE(InfoExtractor):
632         """Information extractor for youtube.com."""
633
634         _VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?[\?#](?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
635         _LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
636         _LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
637         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
638         _NETRC_MACHINE = 'youtube'
639         _available_formats = ['37', '22', '35', '18', '34', '5', '17', '13', None] # listed in order of priority for -b flag
640         _video_extensions = {
641                 '13': '3gp',
642                 '17': 'mp4',
643                 '18': 'mp4',
644                 '22': 'mp4',
645                 '37': 'mp4',
646         }
647
648         @staticmethod
649         def suitable(url):
650                 return (re.match(YoutubeIE._VALID_URL, url) is not None)
651
652         def report_lang(self):
653                 """Report attempt to set language."""
654                 self._downloader.to_stdout(u'[youtube] Setting language')
655
656         def report_login(self):
657                 """Report attempt to log in."""
658                 self._downloader.to_stdout(u'[youtube] Logging in')
659         
660         def report_age_confirmation(self):
661                 """Report attempt to confirm age."""
662                 self._downloader.to_stdout(u'[youtube] Confirming age')
663         
664         def report_video_info_webpage_download(self, video_id):
665                 """Report attempt to download video info webpage."""
666                 self._downloader.to_stdout(u'[youtube] %s: Downloading video info webpage' % video_id)
667         
668         def report_information_extraction(self, video_id):
669                 """Report attempt to extract video information."""
670                 self._downloader.to_stdout(u'[youtube] %s: Extracting video information' % video_id)
671         
	def report_unavailable_format(self, video_id, format):
		"""Report that the requested video format is not available."""
		self._downloader.to_stdout(u'[youtube] %s: Format %s not available' % (video_id, format))
675         
676         def report_rtmp_download(self):
677                 """Indicate the download will use the RTMP protocol."""
678                 self._downloader.to_stdout(u'[youtube] RTMP download detected')
679         
	def _real_initialize(self):
		"""Force the site language and optionally log in and confirm age.

		Credentials come from the downloader params or, with usenetrc,
		from ~/.netrc.  Language and login failures are soft: they are
		reported as warnings and the method returns early, since most
		videos can still be downloaded anonymously.  A failed age
		confirmation is reported through trouble().
		"""
		if self._downloader is None:
			return

		username = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					username = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		# Set language (response body is irrelevant; the request sets cookies)
		request = urllib2.Request(self._LANG_URL, None, std_headers)
		try:
			self.report_lang()
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
			return

		# No authentication to be performed
		if username is None:
			return

		# Log in
		login_form = {
				'current_form': 'loginForm',
				'next':		'/',
				'action_login':	'Log In',
				'username':	username,
				'password':	password,
				}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# The login form reappearing in the response means the login failed.
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return

		# Confirm age
		age_form = {
				'next_url':		'/',
				'action_confirm':	'Confirm',
				}
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
		try:
			self.report_age_confirmation()
			age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return
748
	def _real_extract(self, url):
		"""Extract video information for a YouTube URL and hand it off.

		When the requested format is '0' (-b, best quality), the formats
		in _available_formats are tried in priority order: each
		UnavailableFormatError moves quality_index to the next entry
		until one succeeds or the list is exhausted.
		"""
		# Extract video id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = mobj.group(2)

		# Downloader parameters
		best_quality = False
		format_param = None
		quality_index = 0
		if self._downloader is not None:
			params = self._downloader.params
			format_param = params.get('format', None)
			if format_param == '0':
				format_param = self._available_formats[quality_index]
				best_quality = True

		# Each iteration tries one format; non-format errors return outright.
		while True:
			# Extension
			video_extension = self._video_extensions.get(format_param, 'flv')

			# Get video info
			video_info_url = 'http://www.youtube.com/get_video_info?&video_id=%s&el=detailpage&ps=default&eurl=&gl=US&hl=en' % video_id
			request = urllib2.Request(video_info_url, None, std_headers)
			try:
				self.report_video_info_webpage_download(video_id)
				video_info_webpage = urllib2.urlopen(request).read()
				video_info = parse_qs(video_info_webpage)
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
				return
			self.report_information_extraction(video_id)

			# "t" param
			if 'token' not in video_info:
				# Attempt to see if YouTube has issued an error message
				if 'reason' not in video_info:
					self._downloader.trouble(u'ERROR: unable to extract "t" parameter for unknown reason')
					# Dump the raw server response so users can attach it to bug reports.
					stream = open('reportme-ydl-%s.dat' % time.time(), 'wb')
					stream.write(video_info_webpage)
					stream.close()
				else:
					reason = urllib.unquote_plus(video_info['reason'][0])
					self._downloader.trouble(u'ERROR: YouTube said: %s' % reason.decode('utf-8'))
				return
			token = urllib.unquote_plus(video_info['token'][0])
			video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=detailpage&ps=default&gl=US&hl=en' % (video_id, token)
			if format_param is not None:
				video_real_url = '%s&fmt=%s' % (video_real_url, format_param)

			# Check possible RTMP download
			if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
				self.report_rtmp_download()
				video_real_url = video_info['conn'][0]

			# uploader
			if 'author' not in video_info:
				self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
				return
			video_uploader = urllib.unquote_plus(video_info['author'][0])

			# title
			if 'title' not in video_info:
				self._downloader.trouble(u'ERROR: unable to extract video title')
				return
			video_title = urllib.unquote_plus(video_info['title'][0])
			video_title = video_title.decode('utf-8')
			video_title = sanitize_title(video_title)

			# simplified title: runs of non-alphanumerics collapse to '_'
			simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
			simple_title = simple_title.strip(ur'_')

			try:
				# Process video information
				self._downloader.process_info({
					'id':		video_id.decode('utf-8'),
					'url':		video_real_url.decode('utf-8'),
					'uploader':	video_uploader.decode('utf-8'),
					'title':	video_title,
					'stitle':	simple_title,
					'ext':		video_extension.decode('utf-8'),
				})

				return

			except UnavailableFormatError, err:
				if best_quality:
					if quality_index == len(self._available_formats) - 1:
						# I don't ever expect this to happen
						self._downloader.trouble(u'ERROR: no known formats available for video')
						return
					else:
						# -b: fall back to the next format in the priority list.
						self.report_unavailable_format(video_id, format_param)
						quality_index += 1
						format_param = self._available_formats[quality_index]
						continue
				else:
					# NOTE(review): plain str here, unlike the u'' literals elsewhere.
					self._downloader.trouble('ERROR: format not available for video')
					return
851
852
853 class MetacafeIE(InfoExtractor):
854         """Information Extractor for metacafe.com."""
855
856         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
857         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
858         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
859         _youtube_ie = None
860
861         def __init__(self, youtube_ie, downloader=None):
862                 InfoExtractor.__init__(self, downloader)
863                 self._youtube_ie = youtube_ie
864
865         @staticmethod
866         def suitable(url):
867                 return (re.match(MetacafeIE._VALID_URL, url) is not None)
868
869         def report_disclaimer(self):
870                 """Report disclaimer retrieval."""
871                 self._downloader.to_stdout(u'[metacafe] Retrieving disclaimer')
872
873         def report_age_confirmation(self):
874                 """Report attempt to confirm age."""
875                 self._downloader.to_stdout(u'[metacafe] Confirming age')
876         
877         def report_download_webpage(self, video_id):
878                 """Report webpage download."""
879                 self._downloader.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)
880         
881         def report_extraction(self, video_id):
882                 """Report information extraction."""
883                 self._downloader.to_stdout(u'[metacafe] %s: Extracting information' % video_id)
884
885         def _real_initialize(self):
886                 # Retrieve disclaimer
887                 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
888                 try:
889                         self.report_disclaimer()
890                         disclaimer = urllib2.urlopen(request).read()
891                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
892                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
893                         return
894
895                 # Confirm age
896                 disclaimer_form = {
897                         'filters': '0',
898                         'submit': "Continue - I'm over 18",
899                         }
900                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
901                 try:
902                         self.report_age_confirmation()
903                         disclaimer = urllib2.urlopen(request).read()
904                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
905                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
906                         return
907         
908         def _real_extract(self, url):
909                 # Extract id and simplified title from URL
910                 mobj = re.match(self._VALID_URL, url)
911                 if mobj is None:
912                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
913                         return
914
915                 video_id = mobj.group(1)
916
917                 # Check if video comes from YouTube
918                 mobj2 = re.match(r'^yt-(.*)$', video_id)
919                 if mobj2 is not None:
920                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
921                         return
922
923                 simple_title = mobj.group(2).decode('utf-8')
924                 video_extension = 'flv'
925
926                 # Retrieve video webpage to extract further information
927                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
928                 try:
929                         self.report_download_webpage(video_id)
930                         webpage = urllib2.urlopen(request).read()
931                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
932                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
933                         return
934
935                 # Extract URL, uploader and title from webpage
936                 self.report_extraction(video_id)
937                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
938                 if mobj is None:
939                         self._downloader.trouble(u'ERROR: unable to extract media URL')
940                         return
941                 mediaURL = urllib.unquote(mobj.group(1))
942
943                 #mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
944                 #if mobj is None:
945                 #       self._downloader.trouble(u'ERROR: unable to extract gdaKey')
946                 #       return
947                 #gdaKey = mobj.group(1)
948                 #
949                 #video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
950
951                 video_url = mediaURL
952
953                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
954                 if mobj is None:
955                         self._downloader.trouble(u'ERROR: unable to extract title')
956                         return
957                 video_title = mobj.group(1).decode('utf-8')
958                 video_title = sanitize_title(video_title)
959
960                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
961                 if mobj is None:
962                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
963                         return
964                 video_uploader = mobj.group(1)
965
966                 try:
967                         # Process video information
968                         self._downloader.process_info({
969                                 'id':           video_id.decode('utf-8'),
970                                 'url':          video_url.decode('utf-8'),
971                                 'uploader':     video_uploader.decode('utf-8'),
972                                 'title':        video_title,
973                                 'stitle':       simple_title,
974                                 'ext':          video_extension.decode('utf-8'),
975                         })
976                 except UnavailableFormatError:
977                         self._downloader.trouble(u'ERROR: format not available for video')
978
979
980 class GoogleIE(InfoExtractor):
981         """Information extractor for video.google.com."""
982
983         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
984
985         def __init__(self, downloader=None):
986                 InfoExtractor.__init__(self, downloader)
987
988         @staticmethod
989         def suitable(url):
990                 return (re.match(GoogleIE._VALID_URL, url) is not None)
991
992         def report_download_webpage(self, video_id):
993                 """Report webpage download."""
994                 self._downloader.to_stdout(u'[video.google] %s: Downloading webpage' % video_id)
995
996         def report_extraction(self, video_id):
997                 """Report information extraction."""
998                 self._downloader.to_stdout(u'[video.google] %s: Extracting information' % video_id)
999
1000         def _real_initialize(self):
1001                 return
1002
1003         def _real_extract(self, url):
1004                 # Extract id from URL
1005                 mobj = re.match(self._VALID_URL, url)
1006                 if mobj is None:
1007                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1008                         return
1009
1010                 video_id = mobj.group(1)
1011
1012                 video_extension = 'mp4'
1013
1014                 # Retrieve video webpage to extract further information
1015                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1016                 try:
1017                         self.report_download_webpage(video_id)
1018                         webpage = urllib2.urlopen(request).read()
1019                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1020                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1021                         return
1022
1023                 # Extract URL, uploader, and title from webpage
1024                 self.report_extraction(video_id)
1025                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1026                 if mobj is None:
1027                         video_extension = 'flv'
1028                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1029                 if mobj is None:
1030                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1031                         return
1032                 mediaURL = urllib.unquote(mobj.group(1))
1033                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1034                 mediaURL = mediaURL.replace('\\x26', '\x26')
1035
1036                 video_url = mediaURL
1037
1038                 mobj = re.search(r'<title>(.*)</title>', webpage)
1039                 if mobj is None:
1040                         self._downloader.trouble(u'ERROR: unable to extract title')
1041                         return
1042                 video_title = mobj.group(1).decode('utf-8')
1043                 video_title = sanitize_title(video_title)
1044                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1045
1046                 # Google Video doesn't show uploader nicknames?
1047                 video_uploader = 'NA'
1048
1049                 try:
1050                         # Process video information
1051                         self._downloader.process_info({
1052                                 'id':           video_id.decode('utf-8'),
1053                                 'url':          video_url.decode('utf-8'),
1054                                 'uploader':     video_uploader.decode('utf-8'),
1055                                 'title':        video_title,
1056                                 'stitle':       simple_title,
1057                                 'ext':          video_extension.decode('utf-8'),
1058                         })
1059                 except UnavailableFormatError:
1060                         self._downloader.trouble(u'ERROR: format not available for video')
1061
1062
1063 class PhotobucketIE(InfoExtractor):
1064         """Information extractor for photobucket.com."""
1065
1066         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1067
1068         def __init__(self, downloader=None):
1069                 InfoExtractor.__init__(self, downloader)
1070
1071         @staticmethod
1072         def suitable(url):
1073                 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1074
1075         def report_download_webpage(self, video_id):
1076                 """Report webpage download."""
1077                 self._downloader.to_stdout(u'[photobucket] %s: Downloading webpage' % video_id)
1078
1079         def report_extraction(self, video_id):
1080                 """Report information extraction."""
1081                 self._downloader.to_stdout(u'[photobucket] %s: Extracting information' % video_id)
1082
1083         def _real_initialize(self):
1084                 return
1085
1086         def _real_extract(self, url):
1087                 # Extract id from URL
1088                 mobj = re.match(self._VALID_URL, url)
1089                 if mobj is None:
1090                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1091                         return
1092
1093                 video_id = mobj.group(1)
1094
1095                 video_extension = 'flv'
1096
1097                 # Retrieve video webpage to extract further information
1098                 request = urllib2.Request(url)
1099                 try:
1100                         self.report_download_webpage(video_id)
1101                         webpage = urllib2.urlopen(request).read()
1102                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1103                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1104                         return
1105
1106                 # Extract URL, uploader, and title from webpage
1107                 self.report_extraction(video_id)
1108                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1109                 if mobj is None:
1110                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1111                         return
1112                 mediaURL = urllib.unquote(mobj.group(1))
1113
1114                 video_url = mediaURL
1115
1116                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1117                 if mobj is None:
1118                         self._downloader.trouble(u'ERROR: unable to extract title')
1119                         return
1120                 video_title = mobj.group(1).decode('utf-8')
1121                 video_title = sanitize_title(video_title)
1122                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1123
1124                 video_uploader = mobj.group(2).decode('utf-8')
1125
1126                 try:
1127                         # Process video information
1128                         self._downloader.process_info({
1129                                 'id':           video_id.decode('utf-8'),
1130                                 'url':          video_url.decode('utf-8'),
1131                                 'uploader':     video_uploader,
1132                                 'title':        video_title,
1133                                 'stitle':       simple_title,
1134                                 'ext':          video_extension.decode('utf-8'),
1135                         })
1136                 except UnavailableFormatError:
1137                         self._downloader.trouble(u'ERROR: format not available for video')
1138
1139
1140 class GenericIE(InfoExtractor):
1141         """Generic last-resort information extractor."""
1142
1143         def __init__(self, downloader=None):
1144                 InfoExtractor.__init__(self, downloader)
1145
1146         @staticmethod
1147         def suitable(url):
1148                 return True
1149
1150         def report_download_webpage(self, video_id):
1151                 """Report webpage download."""
1152                 self._downloader.to_stdout(u'WARNING: Falling back on generic information extractor.')
1153                 self._downloader.to_stdout(u'[generic] %s: Downloading webpage' % video_id)
1154
1155         def report_extraction(self, video_id):
1156                 """Report information extraction."""
1157                 self._downloader.to_stdout(u'[generic] %s: Extracting information' % video_id)
1158
1159         def _real_initialize(self):
1160                 return
1161
1162         def _real_extract(self, url):
1163                 video_id = url.split('/')[-1]
1164                 request = urllib2.Request(url)
1165                 try:
1166                         self.report_download_webpage(video_id)
1167                         webpage = urllib2.urlopen(request).read()
1168                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1169                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1170                         return
1171                 except ValueError, err:
1172                         # since this is the last-resort InfoExtractor, if
1173                         # this error is thrown, it'll be thrown here
1174                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1175                         return
1176
1177                 # Start with something easy: JW Player in SWFObject
1178                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1179                 if mobj is None:
1180                         # Broaden the search a little bit
1181                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1182                 if mobj is None:
1183                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1184                         return
1185
1186                 # It's possible that one of the regexes
1187                 # matched, but returned an empty group:
1188                 if mobj.group(1) is None:
1189                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1190                         return
1191
1192                 video_url = urllib.unquote(mobj.group(1))
1193                 video_id  = os.path.basename(video_url)
1194
1195                 # here's a fun little line of code for you:
1196                 video_extension = os.path.splitext(video_id)[1][1:]
1197                 video_id        = os.path.splitext(video_id)[0]
1198
1199                 # it's tempting to parse this further, but you would
1200                 # have to take into account all the variations like
1201                 #   Video Title - Site Name
1202                 #   Site Name | Video Title
1203                 #   Video Title - Tagline | Site Name
1204                 # and so on and so forth; it's just not practical
1205                 mobj = re.search(r'<title>(.*)</title>', webpage)
1206                 if mobj is None:
1207                         self._downloader.trouble(u'ERROR: unable to extract title')
1208                         return
1209                 video_title = mobj.group(1).decode('utf-8')
1210                 video_title = sanitize_title(video_title)
1211                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1212
1213                 # video uploader is domain name
1214                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1215                 if mobj is None:
1216                         self._downloader.trouble(u'ERROR: unable to extract title')
1217                         return
1218                 video_uploader = mobj.group(1).decode('utf-8')
1219
1220                 try:
1221                         # Process video information
1222                         self._downloader.process_info({
1223                                 'id':           video_id.decode('utf-8'),
1224                                 'url':          video_url.decode('utf-8'),
1225                                 'uploader':     video_uploader,
1226                                 'title':        video_title,
1227                                 'stitle':       simple_title,
1228                                 'ext':          video_extension.decode('utf-8'),
1229                         })
1230                 except UnavailableFormatError:
1231                         self._downloader.trouble(u'ERROR: format not available for video')
1232
1233
1234 class YoutubeSearchIE(InfoExtractor):
1235         """Information Extractor for YouTube search queries."""
	# Query syntax: ytsearch:<terms>, ytsearchall:<terms> or ytsearch<N>:<terms>.
	_VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
	_TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
	# Matches the individual video links in a results page.
	_VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
	# A "Next" link in the page means more result pages exist.
	_MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
	_youtube_ie = None
	# Hard cap on results for 'ytsearchall' and oversized <N> requests.
	_max_youtube_results = 1000
1242
	def __init__(self, youtube_ie, downloader=None):
		# Each search result is downloaded through this YoutubeIE instance.
		InfoExtractor.__init__(self, downloader)
		self._youtube_ie = youtube_ie
1246         
1247         @staticmethod
1248         def suitable(url):
1249                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1250
1251         def report_download_page(self, query, pagenum):
1252                 """Report attempt to download playlist page with given number."""
1253                 query = query.decode(preferredencoding())
1254                 self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1255
	def _real_initialize(self):
		# The search itself needs no setup; just prime the YouTube extractor.
		self._youtube_ie.initialize()
1258         
1259         def _real_extract(self, query):
1260                 mobj = re.match(self._VALID_QUERY, query)
1261                 if mobj is None:
1262                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1263                         return
1264
1265                 prefix, query = query.split(':')
1266                 prefix = prefix[8:]
1267                 query  = query.encode('utf-8')
1268                 if prefix == '':
1269                         self._download_n_results(query, 1)
1270                         return
1271                 elif prefix == 'all':
1272                         self._download_n_results(query, self._max_youtube_results)
1273                         return
1274                 else:
1275                         try:
1276                                 n = long(prefix)
1277                                 if n <= 0:
1278                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1279                                         return
1280                                 elif n > self._max_youtube_results:
1281                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
1282                                         n = self._max_youtube_results
1283                                 self._download_n_results(query, n)
1284                                 return
1285                         except ValueError: # parsing prefix as integer fails
1286                                 self._download_n_results(query, 1)
1287                                 return
1288
1289         def _download_n_results(self, query, n):
1290                 """Downloads a specified number of results for a query"""
1291
1292                 video_ids = []
1293                 already_seen = set()
1294                 pagenum = 1
1295
1296                 while True:
1297                         self.report_download_page(query, pagenum)
1298                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1299                         request = urllib2.Request(result_url, None, std_headers)
1300                         try:
1301                                 page = urllib2.urlopen(request).read()
1302                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1303                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1304                                 return
1305
1306                         # Extract video identifiers
1307                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1308                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1309                                 if video_id not in already_seen:
1310                                         video_ids.append(video_id)
1311                                         already_seen.add(video_id)
1312                                         if len(video_ids) == n:
1313                                                 # Specified n videos reached
1314                                                 for id in video_ids:
1315                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1316                                                 return
1317
1318                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1319                                 for id in video_ids:
1320                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1321                                 return
1322
1323                         pagenum = pagenum + 1
1324
1325 class YoutubePlaylistIE(InfoExtractor):
1326         """Information Extractor for YouTube playlists."""
1327
1328         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:view_play_list|my_playlists)\?.*?p=([^&]+).*'
1329         _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
1330         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
1331         _MORE_PAGES_INDICATOR = r'/view_play_list?p=%s&page=%s'
1332         _youtube_ie = None
1333
1334         def __init__(self, youtube_ie, downloader=None):
1335                 InfoExtractor.__init__(self, downloader)
1336                 self._youtube_ie = youtube_ie
1337         
1338         @staticmethod
1339         def suitable(url):
1340                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
1341
1342         def report_download_page(self, playlist_id, pagenum):
1343                 """Report attempt to download playlist page with given number."""
1344                 self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1345
1346         def _real_initialize(self):
1347                 self._youtube_ie.initialize()
1348         
1349         def _real_extract(self, url):
1350                 # Extract playlist id
1351                 mobj = re.match(self._VALID_URL, url)
1352                 if mobj is None:
1353                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1354                         return
1355
1356                 # Download playlist pages
1357                 playlist_id = mobj.group(1)
1358                 video_ids = []
1359                 pagenum = 1
1360
1361                 while True:
1362                         self.report_download_page(playlist_id, pagenum)
1363                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
1364                         try:
1365                                 page = urllib2.urlopen(request).read()
1366                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1367                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1368                                 return
1369
1370                         # Extract video identifiers
1371                         ids_in_page = []
1372                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1373                                 if mobj.group(1) not in ids_in_page:
1374                                         ids_in_page.append(mobj.group(1))
1375                         video_ids.extend(ids_in_page)
1376
1377                         if (self._MORE_PAGES_INDICATOR % (playlist_id.upper(), pagenum + 1)) not in page:
1378                                 break
1379                         pagenum = pagenum + 1
1380
1381                 for id in video_ids:
1382                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1383                 return
1384
1385 class YoutubeUserIE(InfoExtractor):
1386         """Information Extractor for YouTube users."""
1387
1388         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
1389         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1390         _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
1391         _youtube_ie = None
1392
1393         def __init__(self, youtube_ie, downloader=None):
1394                 InfoExtractor.__init__(self, downloader)
1395                 self._youtube_ie = youtube_ie
1396         
1397         @staticmethod
1398         def suitable(url):
1399                 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
1400
1401         def report_download_page(self, username):
1402                 """Report attempt to download user page."""
1403                 self._downloader.to_stdout(u'[youtube] user %s: Downloading page ' % (username))
1404
1405         def _real_initialize(self):
1406                 self._youtube_ie.initialize()
1407         
1408         def _real_extract(self, url):
1409                 # Extract username
1410                 mobj = re.match(self._VALID_URL, url)
1411                 if mobj is None:
1412                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1413                         return
1414
1415                 # Download user page
1416                 username = mobj.group(1)
1417                 video_ids = []
1418                 pagenum = 1
1419
1420                 self.report_download_page(username)
1421                 request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
1422                 try:
1423                         page = urllib2.urlopen(request).read()
1424                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1425                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1426                         return
1427
1428                 # Extract video identifiers
1429                 ids_in_page = []
1430
1431                 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1432                         if mobj.group(1) not in ids_in_page:
1433                                 ids_in_page.append(mobj.group(1))
1434                 video_ids.extend(ids_in_page)
1435
1436                 for id in video_ids:
1437                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1438                 return
1439
class PostProcessor(object):
	"""Base class for post processors.

	Instances are registered on a downloader through its
	add_post_processor() method. After every successful download the
	downloader walks its chain of PostProcessors, calling run() first
	with an initial argument and afterwards with whatever the previous
	processor returned.

	A return value of None from run() stops the rest of the chain; the
	chain also ends when its last element has run.

	PostProcessor objects follow a "mutual registration" scheme similar
	to the one used by InfoExtractor objects.
	"""

	# Downloader this processor is attached to (set at construction or
	# via set_downloader()).
	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Sets the downloader for this PP."""
		self._downloader = downloader

	def run(self, information):
		"""Run the PostProcessor.

		"information" is a dictionary shaped like the ones composed by
		InfoExtractors, extended with a "filepath" entry naming the
		file that was just downloaded.

		Returning None stops the postprocessing chain. Returning an
		information dictionary — possibly the received one with some
		fields changed — passes it on to the next processor in the
		chain. The method may also raise a PostProcessingError
		exception, which the calling downloader takes into account.
		"""
		return information # by default, do nothing
1485         
### MAIN PROGRAM ###
if __name__ == '__main__':
	try:
		# Top-level exceptions raised anywhere below are translated into
		# clean exit codes by the except clauses at the end of this block.
		# Modules needed only when running the main program
		import getpass
		import optparse
1493                 # Function to update the program file with the latest version from bitbucket.org
1494                 def update_self(downloader, filename):
1495                         # Note: downloader only used for options
1496                         if not os.access (filename, os.W_OK):
1497                                 sys.exit('ERROR: no write permissions on %s' % filename)
1498
1499                         downloader.to_stdout('Updating to latest stable version...')
1500                         latest_url = 'http://bitbucket.org/rg3/youtube-dl/raw/tip/LATEST_VERSION'
1501                         latest_version = urllib.urlopen(latest_url).read().strip()
1502                         prog_url = 'http://bitbucket.org/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
1503                         newcontent = urllib.urlopen(prog_url).read()
1504                         stream = open(filename, 'w')
1505                         stream.write(newcontent)
1506                         stream.close()
1507                         downloader.to_stdout('Updated to version %s' % latest_version)
1508
		# General configuration
		# NOTE(review): each install_opener() call replaces the global opener,
		# so the ProxyHandler opener built on the first line is immediately
		# discarded by the second call — confirm whether both handlers were
		# meant to be combined into a single opener.
		urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
		urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
		socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

		# Parse command line
		parser = optparse.OptionParser(
			usage='Usage: %prog [options] url...',
			version='2010.02.13',
			conflict_handler='resolve',
		)

		parser.add_option('-h', '--help',
				action='help', help='print this help text and exit')
		parser.add_option('-v', '--version',
				action='version', help='print program version and exit')
		parser.add_option('-U', '--update',
				action='store_true', dest='update_self', help='update this program to latest stable version')
		parser.add_option('-i', '--ignore-errors',
				action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
		parser.add_option('-r', '--rate-limit',
				dest='ratelimit', metavar='L', help='download rate limit (e.g. 50k or 44.6m)')

		authentication = optparse.OptionGroup(parser, 'Authentication Options')
		authentication.add_option('-u', '--username',
				dest='username', metavar='UN', help='account username')
		authentication.add_option('-p', '--password',
				dest='password', metavar='PW', help='account password')
		authentication.add_option('-n', '--netrc',
				action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
		parser.add_option_group(authentication)

		video_format = optparse.OptionGroup(parser, 'Video Format Options')
		video_format.add_option('-f', '--format',
				action='store', dest='format', metavar='FMT', help='video format code')
		video_format.add_option('-b', '--best-quality',
				action='store_const', dest='format', help='download the best quality video possible', const='0')
		video_format.add_option('-m', '--mobile-version',
				action='store_const', dest='format', help='alias for -f 17', const='17')
		video_format.add_option('-d', '--high-def',
				action='store_const', dest='format', help='alias for -f 22', const='22')
		parser.add_option_group(video_format)

		verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
		verbosity.add_option('-q', '--quiet',
				action='store_true', dest='quiet', help='activates quiet mode', default=False)
		verbosity.add_option('-s', '--simulate',
				action='store_true', dest='simulate', help='do not download video', default=False)
		verbosity.add_option('-g', '--get-url',
				action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
		verbosity.add_option('-e', '--get-title',
				action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
		parser.add_option_group(verbosity)

		filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
		filesystem.add_option('-t', '--title',
				action='store_true', dest='usetitle', help='use title in file name', default=False)
		filesystem.add_option('-l', '--literal',
				action='store_true', dest='useliteral', help='use literal title in file name', default=False)
		filesystem.add_option('-o', '--output',
				dest='outtmpl', metavar='TPL', help='output filename template')
		filesystem.add_option('-a', '--batch-file',
				dest='batchfile', metavar='F', help='file containing URLs to download')
		filesystem.add_option('-w', '--no-overwrites',
				action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
		filesystem.add_option('-c', '--continue',
				action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
		parser.add_option_group(filesystem)

		(opts, args) = parser.parse_args()

		# Batch file verification
		batchurls = []
		if opts.batchfile is not None:
			try:
				batchurls = open(opts.batchfile, 'r').readlines()
				batchurls = [x.strip() for x in batchurls]
				# Drop lines that became empty after stripping whitespace
				batchurls = [x for x in batchurls if len(x) > 0]
			except IOError:
				sys.exit(u'ERROR: batch file could not be read')
		all_urls = batchurls + args

		# Conflicting, missing and erroneous options
		if opts.usenetrc and (opts.username is not None or opts.password is not None):
			parser.error(u'using .netrc conflicts with giving username/password')
		if opts.password is not None and opts.username is None:
			parser.error(u'account username missing')
		if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
			parser.error(u'using output template conflicts with using title or literal title')
		if opts.usetitle and opts.useliteral:
			parser.error(u'using title conflicts with using literal title')
		if opts.username is not None and opts.password is None:
			# Username given but no password: prompt for it interactively
			opts.password = getpass.getpass(u'Type account password and press return:')
		if opts.ratelimit is not None:
			numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
			if numeric_limit is None:
				parser.error(u'invalid rate limit specified')
			opts.ratelimit = numeric_limit

		# Information extractors
		youtube_ie = YoutubeIE()
		metacafe_ie = MetacafeIE(youtube_ie)
		youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
		youtube_user_ie = YoutubeUserIE(youtube_ie)
		youtube_search_ie = YoutubeSearchIE(youtube_ie)
		google_ie = GoogleIE()
		photobucket_ie = PhotobucketIE()
		generic_ie = GenericIE()

		# File downloader
		fd = FileDownloader({
			'usenetrc': opts.usenetrc,
			'username': opts.username,
			'password': opts.password,
			# --get-url/--get-title imply both quiet and simulate mode
			'quiet': (opts.quiet or opts.geturl or opts.gettitle),
			'forceurl': opts.geturl,
			'forcetitle': opts.gettitle,
			'simulate': (opts.simulate or opts.geturl or opts.gettitle),
			'format': opts.format,
			# Output template: explicit -o wins, then the -t/-l title
			# templates, falling back to a plain id-based name
			'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
				or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
				or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
				or u'%(id)s.%(ext)s'),
			'ignoreerrors': opts.ignoreerrors,
			'ratelimit': opts.ratelimit,
			'nooverwrites': opts.nooverwrites,
			'continuedl': opts.continue_dl,
			})
		# Registration order matters: the most specific extractors go first
		fd.add_info_extractor(youtube_search_ie)
		fd.add_info_extractor(youtube_pl_ie)
		fd.add_info_extractor(youtube_user_ie)
		fd.add_info_extractor(metacafe_ie)
		fd.add_info_extractor(youtube_ie)
		fd.add_info_extractor(google_ie)
		fd.add_info_extractor(photobucket_ie)

		# This must come last since it's the
		# fallback if none of the others work
		fd.add_info_extractor(generic_ie)

		# Update version
		if opts.update_self:
			update_self(fd, sys.argv[0])

		# Maybe do nothing
		if len(all_urls) < 1:
			if not opts.update_self:
				parser.error(u'you must provide at least one URL')
			else:
				# -U with no URLs: the update alone was the requested action
				sys.exit()
		retcode = fd.download(all_urls)
		sys.exit(retcode)

	except DownloadError:
		sys.exit(1)
	except SameFileError:
		# Several downloads would collide on one fixed output file name
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')