# Patched to add Google Video and Photobucket support
# [youtube-dl] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # License: Public domain code
7 import htmlentitydefs
8 import httplib
9 import locale
10 import math
11 import netrc
12 import os
13 import os.path
14 import re
15 import socket
16 import string
17 import subprocess
18 import sys
19 import time
20 import urllib
21 import urllib2
22
23 # parse_qs was moved from the cgi module to the urlparse module recently.
24 try:
25         from urlparse import parse_qs
26 except ImportError:
27         from cgi import parse_qs
28
# HTTP headers sent with every request. The fixed Firefox User-Agent keeps
# sites from serving different (or no) content to an unknown client.
std_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.1.2) Gecko/20090729 Firefox/3.5.2',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
        'Accept-Language': 'en-us,en;q=0.5',
}

# Characters allowed in "simplified" titles: ASCII letters and digits,
# decoded so the result is a unicode object under Python 2.
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
37
def preferredencoding():
        """Get preferred encoding.

        Returns the best encoding scheme for the system, based on
        locale.getpreferredencoding() and some further tweaks.
        """
        # The original wrapped this in a one-shot generator and called
        # .next() on it, which added nothing; a plain try/except is
        # equivalent. The bare "except:" is also narrowed to Exception so
        # KeyboardInterrupt/SystemExit are not swallowed.
        try:
                pref = locale.getpreferredencoding()
                # Verify the reported codec actually resolves; some platforms
                # return names the codecs machinery cannot handle.
                u'TEST'.encode(pref)
        except Exception:
                # Fall back to a codec that is always available.
                pref = 'UTF-8'
        return pref
53
class DownloadError(Exception):
        """Download Error exception.

        Raised by FileDownloader objects when a download problem occurs and
        the downloader is not configured to ignore errors ('ignoreerrors').
        Carries the corresponding error message.
        """
        pass
62
class SameFileError(Exception):
        """Same File exception.

        Raised by FileDownloader objects when several URLs would all be
        written to one and the same file on disk (fixed output template).
        """
        pass
70
class PostProcessingError(Exception):
        """Post Processing exception.

        A PostProcessor's .run() method may raise this to signal that the
        postprocessing task failed.
        """
        pass
78
class UnavailableFormatError(Exception):
        """Unavailable Format exception.

        Raised when the requested video format does not exist for the
        video in question.
        """
        pass
86
class ContentTooShortError(Exception):
        """Content Too Short exception.

        This exception may be raised by FileDownloader objects when a file they
        download is too small for what the server announced first, indicating
        the connection was probably interrupted.
        """
        # Both in bytes: what was actually received vs. the announced
        # Content-Length.
        downloaded = None
        expected = None

        def __init__(self, downloaded, expected):
                # Fix: the original never called the base constructor, leaving
                # err.args empty and str(err) blank. Forwarding the values
                # keeps str()/repr()/pickling meaningful; the attribute
                # interface used by callers (err.expected, err.downloaded) is
                # unchanged.
                Exception.__init__(self, downloaded, expected)
                self.downloaded = downloaded
                self.expected = expected
101
class FileDownloader(object):
        """File Downloader class.

        File downloader objects are the ones responsible of downloading the
        actual video file and writing it to disk if the user has requested
        it, among some other tasks. In most cases there should be one per
        program. As, given a video URL, the downloader doesn't know how to
        extract all the needed information, task that InfoExtractors do, it
        has to pass the URL to one of them.

        For this, file downloader objects have a method that allows
        InfoExtractors to be registered in a given order. When it is passed
        a URL, the file downloader handles it to the first InfoExtractor it
        finds that reports being able to handle it. The InfoExtractor extracts
        all the information about the video or videos the URL refers to, and
        asks the FileDownloader to process the video information, possibly
        downloading the video.

        File downloaders accept a lot of parameters. In order not to saturate
        the object constructor with arguments, it receives a dictionary of
        options instead. These options are available through the params
        attribute for the InfoExtractors to use. The FileDownloader also
        registers itself as the downloader in charge for the InfoExtractors
        that are added to it, so this is a "mutual registration".

        Available options:

        username:       Username for authentication purposes.
        password:       Password for authentication purposes.
        usenetrc:       Use netrc for authentication instead.
        quiet:          Do not print messages to stdout.
        forceurl:       Force printing final URL.
        forcetitle:     Force printing title.
        simulate:       Do not download the video files.
        format:         Video format code.
        outtmpl:        Template for output names.
        ignoreerrors:   Do not stop on download errors.
        ratelimit:      Download speed limit, in bytes/sec.
        nooverwrites:   Prevent overwriting files.
        continuedl:     Try to continue downloads if possible.
        """

        # Class-level placeholders; actual per-instance values are set in
        # __init__.
        params = None
        _ies = []
        _pps = []
        _download_retcode = None

        def __init__(self, params):
                """Create a FileDownloader object with the given options."""
                self._ies = []
                self._pps = []
                self._download_retcode = 0
                self.params = params

        @staticmethod
        def pmkdir(filename):
                """Create directory components in filename. Similar to Unix "mkdir -p"."""
                components = filename.split(os.sep)
                # List of ancestor directories, shortest path first.
                aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
                aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
                for dir in aggregate:
                        if not os.path.exists(dir):
                                os.mkdir(dir)

        @staticmethod
        def format_bytes(bytes):
                """Format a byte count as a short human-readable string, e.g. '1.25M'."""
                if bytes is None:
                        return 'N/A'
                if type(bytes) is str:
                        bytes = float(bytes)
                if bytes == 0.0:
                        exponent = 0
                else:
                        # Power of 1024 picks the suffix character below.
                        exponent = long(math.log(bytes, 1024.0))
                suffix = 'bkMGTPEZY'[exponent]
                converted = float(bytes) / float(1024**exponent)
                return '%.2f%s' % (converted, suffix)

        @staticmethod
        def calc_percent(byte_counter, data_len):
                """Return a fixed-width percentage string, or '---.-%' if the total is unknown."""
                if data_len is None:
                        return '---.-%'
                return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))

        @staticmethod
        def calc_eta(start, now, total, current):
                """Estimate remaining time as 'MM:SS' from progress so far."""
                if total is None:
                        return '--:--'
                dif = now - start
                if current == 0 or dif < 0.001: # One millisecond
                        return '--:--'
                rate = float(current) / dif
                eta = long((float(total) - float(current)) / rate)
                (eta_mins, eta_secs) = divmod(eta, 60)
                # The display field only has room for two minute digits.
                if eta_mins > 99:
                        return '--:--'
                return '%02d:%02d' % (eta_mins, eta_secs)

        @staticmethod
        def calc_speed(start, now, bytes):
                """Return the average download speed so far, right-padded to 10 chars."""
                dif = now - start
                if bytes == 0 or dif < 0.001: # One millisecond
                        return '%10s' % '---b/s'
                return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

        @staticmethod
        def best_block_size(elapsed_time, bytes):
                """Adapt the next read size to the measured throughput of the last read."""
                # Never shrink below half (or 1 byte) nor grow beyond double
                # the previous block size.
                new_min = max(bytes / 2.0, 1.0)
                new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
                if elapsed_time < 0.001:
                        return long(new_max)
                rate = bytes / elapsed_time
                if rate > new_max:
                        return long(new_max)
                if rate < new_min:
                        return long(new_min)
                return long(rate)

        @staticmethod
        def parse_bytes(bytestr):
                """Parse a string indicating a byte quantity into a long integer."""
                matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
                if matchobj is None:
                        return None
                number = float(matchobj.group(1))
                # An empty suffix finds '' at index 0 of 'bkmgtpezy', i.e.
                # multiplier 1024**0 == 1.
                multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
                return long(round(number * multiplier))

        @staticmethod
        def verify_url(url):
                """Verify a URL is valid and data could be downloaded. Return real data URL."""
                request = urllib2.Request(url, None, std_headers)
                data = urllib2.urlopen(request)
                # Read a single byte to force an actual data transfer, not
                # just the response headers.
                data.read(1)
                # geturl() may differ from the input URL after redirects.
                url = data.geturl()
                data.close()
                return url

        def add_info_extractor(self, ie):
                """Add an InfoExtractor object to the end of the list."""
                self._ies.append(ie)
                ie.set_downloader(self)

        def add_post_processor(self, pp):
                """Add a PostProcessor object to the end of the chain."""
                self._pps.append(pp)
                pp.set_downloader(self)

        def to_stdout(self, message, skip_eol=False):
                """Print message to stdout if not in quiet mode."""
                if not self.params.get('quiet', False):
                        # The trailing comma suppresses print's own newline;
                        # the list index picks u'\n' or u'' based on skip_eol.
                        print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(preferredencoding()),
                        sys.stdout.flush()

        def to_stderr(self, message):
                """Print message to stderr."""
                print >>sys.stderr, message.encode(preferredencoding())

        def fixed_template(self):
                """Checks if the output template is fixed (has no %(...)s substitutions)."""
                return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)

        def trouble(self, message=None):
                """Determine action to take when a download problem appears.

                Depending on if the downloader has been configured to ignore
                download errors or not, this method may throw an exception or
                not when errors are found, after printing the message.
                """
                if message is not None:
                        self.to_stderr(message)
                if not self.params.get('ignoreerrors', False):
                        raise DownloadError(message)
                self._download_retcode = 1

        def slow_down(self, start_time, byte_counter):
                """Sleep if the download speed is over the rate limit."""
                rate_limit = self.params.get('ratelimit', None)
                if rate_limit is None or byte_counter == 0:
                        return
                now = time.time()
                elapsed = now - start_time
                if elapsed <= 0.0:
                        return
                speed = float(byte_counter) / elapsed
                if speed > rate_limit:
                        # Sleep exactly long enough for the average speed to
                        # fall back to the configured limit.
                        time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)

        def report_destination(self, filename):
                """Report destination filename."""
                self.to_stdout(u'[download] Destination: %s' % filename)

        def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
                """Report download progress."""
                self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
                                (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)

        def report_resuming_byte(self, resume_len):
                """Report attempt to resume at given byte."""
                self.to_stdout(u'[download] Resuming download at byte %s' % resume_len)

        def report_file_already_downloaded(self, file_name):
                """Report file has already been fully downloaded."""
                self.to_stdout(u'[download] %s has already been downloaded' % file_name)

        def report_unable_to_resume(self):
                """Report it was impossible to resume download."""
                self.to_stdout(u'[download] Unable to resume')

        def report_finish(self):
                """Report download finished."""
                self.to_stdout(u'')

        def process_info(self, info_dict):
                """Process a single dictionary returned by an InfoExtractor."""
                # Do nothing else if in simulate mode
                if self.params.get('simulate', False):
                        try:
                                info_dict['url'] = self.verify_url(info_dict['url'].encode('utf-8')).decode('utf-8')
                        except (OSError, IOError, urllib2.URLError, httplib.HTTPException, socket.error), err:
                                raise UnavailableFormatError

                        # Forced printings
                        if self.params.get('forcetitle', False):
                                print info_dict['title'].encode(preferredencoding())
                        if self.params.get('forceurl', False):
                                print info_dict['url'].encode(preferredencoding())

                        return

                try:
                        template_dict = dict(info_dict)
                        template_dict['epoch'] = unicode(long(time.time()))
                        filename = self.params['outtmpl'] % template_dict
                except (ValueError, KeyError), err:
                        self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
                # NOTE(review): if the template substitution above failed and
                # 'ignoreerrors' is set, 'filename' is unbound here and the
                # next line raises NameError -- confirm intended behavior.
                if self.params.get('nooverwrites', False) and os.path.exists(filename):
                        self.to_stderr(u'WARNING: file exists: %s; skipping' % filename)
                        return

                try:
                        self.pmkdir(filename)
                except (OSError, IOError), err:
                        self.trouble('ERROR: unable to create directories: %s' % str(err))
                        return

                try:
                        success = self._do_download(filename, info_dict['url'].encode('utf-8'))
                except (OSError, IOError), err:
                        # NOTE(review): local I/O errors are surfaced as a
                        # format problem -- presumably so callers retry with
                        # another format; verify against callers.
                        raise UnavailableFormatError
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self.trouble('ERROR: unable to download video data: %s' % str(err))
                        return
                except (ContentTooShortError, ), err:
                        self.trouble('ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
                        return

                if success:
                        try:
                                self.post_process(filename, info_dict)
                        except (PostProcessingError), err:
                                self.trouble('ERROR: postprocessing: %s' % str(err))
                                return

        def download(self, url_list):
                """Download a given list of URLs."""
                # A fixed template would make every URL overwrite the same
                # file on disk.
                if len(url_list) > 1 and self.fixed_template():
                        raise SameFileError(self.params['outtmpl'])

                for url in url_list:
                        suitable_found = False
                        for ie in self._ies:
                                # Go to next InfoExtractor if not suitable
                                if not ie.suitable(url):
                                        continue

                                # Suitable InfoExtractor found
                                suitable_found = True

                                # Extract information from URL and process it
                                ie.extract(url)

                                # Suitable InfoExtractor had been found; go to next URL
                                break

                        if not suitable_found:
                                self.trouble('ERROR: no suitable InfoExtractor: %s' % url)

                return self._download_retcode

        def post_process(self, filename, ie_info):
                """Run the postprocessing chain on the given file."""
                info = dict(ie_info)
                info['filepath'] = filename
                for pp in self._pps:
                        info = pp.run(info)
                        # A postprocessor returning None stops the chain.
                        if info is None:
                                break

        def _download_with_rtmpdump(self, filename, url):
                """Download an RTMP stream by delegating to the external rtmpdump tool."""
                self.report_destination(filename)

                # Check for rtmpdump first
                try:
                        subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
                except (OSError, IOError):
                        self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
                        return False

                # Download using rtmpdump. rtmpdump returns exit code 2 when
                # the connection was interrupted and resuming appears to be
                # possible. This is part of rtmpdump's normal usage, AFAIK.
                retval = subprocess.call(['rtmpdump', '-q', '-r', url, '-o', filename] + [[], ['-e']][self.params.get('continuedl', False)])
                while retval == 2:
                        self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename), skip_eol=True)
                        time.sleep(2.0) # This seems to be needed
                        # Retry with '-e' to resume from where it stopped.
                        retval = subprocess.call(['rtmpdump', '-q', '-e', '-r', url, '-o', filename])
                if retval == 0:
                        self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename))
                        return True
                else:
                        self.trouble('ERROR: rtmpdump exited with code %d' % retval)
                        return False

        def _do_download(self, filename, url):
                """Download url to filename over HTTP; RTMP URLs go through rtmpdump."""
                # Attempt to download using rtmpdump
                if url.startswith('rtmp'):
                        return self._download_with_rtmpdump(filename, url)

                stream = None
                open_mode = 'wb'
                # basic_request never gets a Range header; it is the fallback
                # when the server rejects the resume request below.
                basic_request = urllib2.Request(url, None, std_headers)
                request = urllib2.Request(url, None, std_headers)

                # Establish possible resume length
                if os.path.isfile(filename):
                        resume_len = os.path.getsize(filename)
                else:
                        resume_len = 0

                # Request parameters in case of being able to resume
                if self.params.get('continuedl', False) and resume_len != 0:
                        self.report_resuming_byte(resume_len)
                        request.add_header('Range','bytes=%d-' % resume_len)
                        open_mode = 'ab'

                # Establish connection
                try:
                        data = urllib2.urlopen(request)
                except (urllib2.HTTPError, ), err:
                        if err.code != 416: #  416 is 'Requested range not satisfiable'
                                raise
                        # Unable to resume; retry without the Range header.
                        data = urllib2.urlopen(basic_request)
                        content_length = data.info()['Content-Length']

                        if content_length is not None and long(content_length) == resume_len:
                                # Because the file had already been fully downloaded
                                self.report_file_already_downloaded(filename)
                                return True
                        else:
                                # Because the server didn't let us
                                self.report_unable_to_resume()
                                open_mode = 'wb'

                # data_len stays a string (or None) exactly as taken from the
                # response headers.
                data_len = data.info().get('Content-length', None)
                data_len_str = self.format_bytes(data_len)
                byte_counter = 0
                block_size = 1024
                start = time.time()
                while True:
                        # Download and write
                        before = time.time()
                        data_block = data.read(block_size)
                        after = time.time()
                        data_block_len = len(data_block)
                        if data_block_len == 0:
                                break
                        byte_counter += data_block_len

                        # Open file just in time
                        if stream is None:
                                try:
                                        stream = open(filename, open_mode)
                                        self.report_destination(filename)
                                except (OSError, IOError), err:
                                        self.trouble('ERROR: unable to open for writing: %s' % str(err))
                                        return False
                        stream.write(data_block)
                        # Adapt the next read size to the measured throughput.
                        block_size = self.best_block_size(after - before, data_block_len)

                        # Progress message
                        percent_str = self.calc_percent(byte_counter, data_len)
                        eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
                        speed_str = self.calc_speed(start, time.time(), byte_counter)
                        self.report_progress(percent_str, data_len_str, speed_str, eta_str)

                        # Apply rate limit
                        self.slow_down(start, byte_counter)

                self.report_finish()
                # data_len is still a header string, so the counter is
                # stringified for the comparison.
                if data_len is not None and str(byte_counter) != data_len:
                        raise ContentTooShortError(byte_counter, long(data_len))
                return True
506
class InfoExtractor(object):
        """Information Extractor class.

        An information extractor takes a URL and produces, for the video (or
        videos) it refers to, a dictionary of metadata that the
        FileDownloader then processes (possibly downloading the video to
        disk). Each dictionary must provide the following fields:

        id:             Video identifier.
        url:            Final video URL.
        uploader:       Nickname of the video uploader.
        title:          Literal title.
        stitle:         Simplified title.
        ext:            Video filename extension.

        Concrete extractors override _real_initialize() and _real_extract(),
        plus the suitable() static method, and are normally instantiated and
        registered with the main downloader.
        """

        # Class-level defaults; instances overwrite them in __init__ and
        # set_downloader().
        _ready = False
        _downloader = None

        def __init__(self, downloader=None):
                """Constructor. Receives an optional downloader."""
                self._ready = False
                self.set_downloader(downloader)

        @staticmethod
        def suitable(url):
                """Receives a URL and returns True if suitable for this IE."""
                return False

        def initialize(self):
                """Initializes an instance (authentication, etc), at most once."""
                if self._ready:
                        return
                self._real_initialize()
                self._ready = True

        def extract(self, url):
                """Extracts URL information and returns it in list of dicts."""
                self.initialize()
                return self._real_extract(url)

        def set_downloader(self, downloader):
                """Sets the downloader for this IE."""
                self._downloader = downloader

        def _real_initialize(self):
                """Real initialization process. Redefine in subclasses."""
                pass

        def _real_extract(self, url):
                """Real extraction process. Redefine in subclasses."""
                pass
567
568 class YoutubeIE(InfoExtractor):
569         """Information extractor for youtube.com."""
570
571         _VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
572         _LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
573         _LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
574         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
575         _NETRC_MACHINE = 'youtube'
576         _available_formats = ['37', '22', '35', '18', '5', '17', '13', None] # listed in order of priority for -b flag
577         _video_extensions = {
578                 '13': '3gp',
579                 '17': 'mp4',
580                 '18': 'mp4',
581                 '22': 'mp4',
582                 '37': 'mp4',
583         }
584
585         @staticmethod
586         def suitable(url):
587                 return (re.match(YoutubeIE._VALID_URL, url) is not None)
588
589         @staticmethod
590         def htmlentity_transform(matchobj):
591                 """Transforms an HTML entity to a Unicode character."""
592                 entity = matchobj.group(1)
593
594                 # Known non-numeric HTML entity
595                 if entity in htmlentitydefs.name2codepoint:
596                         return unichr(htmlentitydefs.name2codepoint[entity])
597
598                 # Unicode character
599                 mobj = re.match(ur'(?u)#(x?\d+)', entity)
600                 if mobj is not None:
601                         numstr = mobj.group(1)
602                         if numstr.startswith(u'x'):
603                                 base = 16
604                                 numstr = u'0%s' % numstr
605                         else:
606                                 base = 10
607                         return unichr(long(numstr, base))
608
609                 # Unknown entity in name, return its literal representation
610                 return (u'&%s;' % entity)
611
612         def report_lang(self):
613                 """Report attempt to set language."""
614                 self._downloader.to_stdout(u'[youtube] Setting language')
615
616         def report_login(self):
617                 """Report attempt to log in."""
618                 self._downloader.to_stdout(u'[youtube] Logging in')
619         
620         def report_age_confirmation(self):
621                 """Report attempt to confirm age."""
622                 self._downloader.to_stdout(u'[youtube] Confirming age')
623         
624         def report_video_info_webpage_download(self, video_id):
625                 """Report attempt to download video info webpage."""
626                 self._downloader.to_stdout(u'[youtube] %s: Downloading video info webpage' % video_id)
627         
628         def report_information_extraction(self, video_id):
629                 """Report attempt to extract video information."""
630                 self._downloader.to_stdout(u'[youtube] %s: Extracting video information' % video_id)
631         
632         def report_unavailable_format(self, video_id, format):
633                 """Report extracted video URL."""
634                 self._downloader.to_stdout(u'[youtube] %s: Format %s not available' % (video_id, format))
635         
636         def report_rtmp_download(self):
637                 """Indicate the download will use the RTMP protocol."""
638                 self._downloader.to_stdout(u'[youtube] RTMP download detected')
639         
	def _real_initialize(self):
		"""Prepare the YouTube session: set language, optionally log in, confirm age.

		Credentials come from explicit downloader params ('username'/'password')
		or, with 'usenetrc', from the user's .netrc. Login and language failures
		are warnings only (extraction proceeds unauthenticated); a failed age
		confirmation is reported as an error.
		"""
		if self._downloader is None:
			return

		username = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					username = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		# Set language so later page scrapes see predictable (English) text
		request = urllib2.Request(self._LANG_URL, None, std_headers)
		try:
			self.report_lang()
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
			return

		# No authentication to be performed
		if username is None:
			return

		# Log in
		login_form = {
				'current_form': 'loginForm',
				'next':		'/',
				'action_login':	'Log In',
				'username':	username,
				'password':	password,
				}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# If the login form is still present in the reply, the credentials were rejected
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return

		# Confirm age
		age_form = {
				'next_url':		'/',
				'action_confirm':	'Confirm',
				}
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
		try:
			self.report_age_confirmation()
			age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return
708
	def _real_extract(self, url):
		"""Extract download information for one YouTube video and hand it to the downloader.

		If the user requested format '0' ("best quality"), retries down
		self._available_formats whenever the downloader rejects the current
		format with UnavailableFormatError.
		"""
		# Extract video id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = mobj.group(2)

		# Downloader parameters
		best_quality = False
		format_param = None
		quality_index = 0
		if self._downloader is not None:
			params = self._downloader.params
			format_param = params.get('format', None)
			if format_param == '0':
				# '0' means best available: start at the top of the preference list
				format_param = self._available_formats[quality_index]
				best_quality = True

		while True:
			# Extension
			video_extension = self._video_extensions.get(format_param, 'flv')

			# Get video info
			video_info_url = 'http://www.youtube.com/get_video_info?&video_id=%s&el=detailpage&ps=default&eurl=&gl=US&hl=en' % video_id
			request = urllib2.Request(video_info_url, None, std_headers)
			try:
				self.report_video_info_webpage_download(video_id)
				video_info_webpage = urllib2.urlopen(request).read()
				video_info = parse_qs(video_info_webpage)
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
				return
			self.report_information_extraction(video_id)

			# "t" param
			if 'token' not in video_info:
				# Attempt to see if YouTube has issued an error message
				if 'reason' not in video_info:
					self._downloader.trouble(u'ERROR: unable to extract "t" parameter for unknown reason')
					# Dump the raw response to disk so the user can attach it to a bug report
					stream = open('reportme-ydl-%s.dat' % time.time(), 'wb')
					stream.write(video_info_webpage)
					stream.close()
				else:
					reason = urllib.unquote_plus(video_info['reason'][0])
					self._downloader.trouble(u'ERROR: YouTube said: %s' % reason.decode('utf-8'))
				return
			token = urllib.unquote_plus(video_info['token'][0])
			video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=detailpage&ps=default&gl=US&hl=en' % (video_id, token)
			if format_param is not None:
				video_real_url = '%s&fmt=%s' % (video_real_url, format_param)

			# Check possible RTMP download
			if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
				self.report_rtmp_download()
				video_real_url = video_info['conn'][0]

			# uploader
			if 'author' not in video_info:
				self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
				return
			video_uploader = urllib.unquote_plus(video_info['author'][0])

			# title
			if 'title' not in video_info:
				self._downloader.trouble(u'ERROR: unable to extract video title')
				return
			video_title = urllib.unquote_plus(video_info['title'][0])
			video_title = video_title.decode('utf-8')
			video_title = re.sub(ur'(?u)&(.+?);', self.htmlentity_transform, video_title)
			# os.sep cannot appear in a filename component; replace it
			video_title = video_title.replace(os.sep, u'%')

			# simplified title: collapse anything outside [A-Za-z0-9] runs to '_'
			simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
			simple_title = simple_title.strip(ur'_')

			try:
				# Process video information
				self._downloader.process_info({
					'id':		video_id.decode('utf-8'),
					'url':		video_real_url.decode('utf-8'),
					'uploader':	video_uploader.decode('utf-8'),
					'title':	video_title,
					'stitle':	simple_title,
					'ext':		video_extension.decode('utf-8'),
				})

				return

			except UnavailableFormatError, err:
				if best_quality:
					if quality_index == len(self._available_formats) - 1:
						# I don't ever expect this to happen
						self._downloader.trouble(u'ERROR: no known formats available for video')
						return
					else:
						# Fall back to the next-best format and retry
						self.report_unavailable_format(video_id, format_param)
						quality_index += 1
						format_param = self._available_formats[quality_index]
						continue
				else:
					self._downloader.trouble('ERROR: format not available for video')
					return
812
813
814 class MetacafeIE(InfoExtractor):
815         """Information Extractor for metacafe.com."""
816
817         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
818         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
819         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
820         _youtube_ie = None
821
822         def __init__(self, youtube_ie, downloader=None):
823                 InfoExtractor.__init__(self, downloader)
824                 self._youtube_ie = youtube_ie
825
826         @staticmethod
827         def suitable(url):
828                 return (re.match(MetacafeIE._VALID_URL, url) is not None)
829
830         def report_disclaimer(self):
831                 """Report disclaimer retrieval."""
832                 self._downloader.to_stdout(u'[metacafe] Retrieving disclaimer')
833
834         def report_age_confirmation(self):
835                 """Report attempt to confirm age."""
836                 self._downloader.to_stdout(u'[metacafe] Confirming age')
837         
838         def report_download_webpage(self, video_id):
839                 """Report webpage download."""
840                 self._downloader.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)
841         
842         def report_extraction(self, video_id):
843                 """Report information extraction."""
844                 self._downloader.to_stdout(u'[metacafe] %s: Extracting information' % video_id)
845
846         def _real_initialize(self):
847                 # Retrieve disclaimer
848                 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
849                 try:
850                         self.report_disclaimer()
851                         disclaimer = urllib2.urlopen(request).read()
852                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
853                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
854                         return
855
856                 # Confirm age
857                 disclaimer_form = {
858                         'filters': '0',
859                         'submit': "Continue - I'm over 18",
860                         }
861                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
862                 try:
863                         self.report_age_confirmation()
864                         disclaimer = urllib2.urlopen(request).read()
865                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
866                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
867                         return
868         
869         def _real_extract(self, url):
870                 # Extract id and simplified title from URL
871                 mobj = re.match(self._VALID_URL, url)
872                 if mobj is None:
873                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
874                         return
875
876                 video_id = mobj.group(1)
877
878                 # Check if video comes from YouTube
879                 mobj2 = re.match(r'^yt-(.*)$', video_id)
880                 if mobj2 is not None:
881                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
882                         return
883
884                 simple_title = mobj.group(2).decode('utf-8')
885                 video_extension = 'flv'
886
887                 # Retrieve video webpage to extract further information
888                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
889                 try:
890                         self.report_download_webpage(video_id)
891                         webpage = urllib2.urlopen(request).read()
892                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
893                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
894                         return
895
896                 # Extract URL, uploader and title from webpage
897                 self.report_extraction(video_id)
898                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
899                 if mobj is None:
900                         self._downloader.trouble(u'ERROR: unable to extract media URL')
901                         return
902                 mediaURL = urllib.unquote(mobj.group(1))
903
904                 #mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
905                 #if mobj is None:
906                 #       self._downloader.trouble(u'ERROR: unable to extract gdaKey')
907                 #       return
908                 #gdaKey = mobj.group(1)
909                 #
910                 #video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
911
912                 video_url = mediaURL
913
914                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
915                 if mobj is None:
916                         self._downloader.trouble(u'ERROR: unable to extract title')
917                         return
918                 video_title = mobj.group(1).decode('utf-8')
919
920                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
921                 if mobj is None:
922                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
923                         return
924                 video_uploader = mobj.group(1)
925
926                 try:
927                         # Process video information
928                         self._downloader.process_info({
929                                 'id':           video_id.decode('utf-8'),
930                                 'url':          video_url.decode('utf-8'),
931                                 'uploader':     video_uploader.decode('utf-8'),
932                                 'title':        video_title,
933                                 'stitle':       simple_title,
934                                 'ext':          video_extension.decode('utf-8'),
935                         })
936                 except UnavailableFormatError:
937                         self._downloader.trouble(u'ERROR: format not available for video')
938
939
940 class GoogleIE(InfoExtractor):
941         """Information extractor for video.google.com."""
942
943         _VALID_URL = r'(?:http://)?video\.google\.com/videoplay\?docid=([^\&]+).*'
944
945         def __init__(self, downloader=None):
946                 InfoExtractor.__init__(self, downloader)
947
948         @staticmethod
949         def suitable(url):
950                 return (re.match(GoogleIE._VALID_URL, url) is not None)
951
952         def report_download_webpage(self, video_id):
953                 """Report webpage download."""
954                 self._downloader.to_stdout(u'[video.google] %s: Downloading webpage' % video_id)
955
956         def report_extraction(self, video_id):
957                 """Report information extraction."""
958                 self._downloader.to_stdout(u'[video.google] %s: Extracting information' % video_id)
959
960         def _real_initialize(self):
961                 return
962
963         def _real_extract(self, url):
964                 # Extract id from URL
965                 mobj = re.match(self._VALID_URL, url)
966                 if mobj is None:
967                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
968                         return
969
970                 video_id = mobj.group(1)
971
972                 video_extension = 'mp4'
973
974                 # Retrieve video webpage to extract further information
975                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s' % video_id)
976                 try:
977                         self.report_download_webpage(video_id)
978                         webpage = urllib2.urlopen(request).read()
979                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
980                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
981                         return
982
983                 # Extract URL, uploader, and title from webpage
984                 self.report_extraction(video_id)
985                 mobj = re.search(r"download_url:'(.*)'", webpage)
986                 if mobj is None:
987                         self._downloader.trouble(u'ERROR: unable to extract media URL')
988                         return
989                 mediaURL = urllib.unquote(mobj.group(1))
990                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
991                 mediaURL = mediaURL.replace('\\x26', '\x26')
992
993                 video_url = mediaURL
994
995                 mobj = re.search(r'<title>(.*)</title>', webpage)
996                 if mobj is None:
997                         self._downloader.trouble(u'ERROR: unable to extract title')
998                         return
999                 video_title = mobj.group(1).decode('utf-8')
1000
1001                 # Google Video doesn't show uploader nicknames?
1002                 video_uploader = 'uploader'
1003
1004                 try:
1005                         # Process video information
1006                         self._downloader.process_info({
1007                                 'id':           video_id.decode('utf-8'),
1008                                 'url':          video_url.decode('utf-8'),
1009                                 'uploader':     video_uploader.decode('utf-8'),
1010                                 'title':        video_title.decode('utf-8'),
1011                                 'stitle':       video_title.decode('utf-8'),
1012                                 'ext':          video_extension.decode('utf-8'),
1013                         })
1014                 except UnavailableFormatError:
1015                         self._downloader.trouble(u'ERROR: format not available for video')
1016
1017
1018 class PhotobucketIE(InfoExtractor):
1019         """Information extractor for photobucket.com."""
1020
1021         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1022
1023         def __init__(self, downloader=None):
1024                 InfoExtractor.__init__(self, downloader)
1025
1026         @staticmethod
1027         def suitable(url):
1028                 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1029
1030         def report_download_webpage(self, video_id):
1031                 """Report webpage download."""
1032                 self._downloader.to_stdout(u'[photobucket] %s: Downloading webpage' % video_id)
1033
1034         def report_extraction(self, video_id):
1035                 """Report information extraction."""
1036                 self._downloader.to_stdout(u'[photobucket] %s: Extracting information' % video_id)
1037
1038         def _real_initialize(self):
1039                 return
1040
1041         def _real_extract(self, url):
1042                 # Extract id from URL
1043                 mobj = re.match(self._VALID_URL, url)
1044                 if mobj is None:
1045                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1046                         return
1047
1048                 video_id = mobj.group(1)
1049
1050                 video_extension = 'flv'
1051
1052                 # Retrieve video webpage to extract further information
1053                 request = urllib2.Request(url)
1054                 try:
1055                         self.report_download_webpage(video_id)
1056                         webpage = urllib2.urlopen(request).read()
1057                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1058                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1059                         return
1060
1061                 # Extract URL, uploader, and title from webpage
1062                 self.report_extraction(video_id)
1063                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1064                 if mobj is None:
1065                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1066                         return
1067                 mediaURL = urllib.unquote(mobj.group(1))
1068
1069                 video_url = mediaURL
1070
1071                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1072                 if mobj is None:
1073                         self._downloader.trouble(u'ERROR: unable to extract title')
1074                         return
1075                 video_title = mobj.group(1).decode('utf-8')
1076
1077                 video_uploader = mobj.group(2).decode('utf-8')
1078
1079                 try:
1080                         # Process video information
1081                         self._downloader.process_info({
1082                                 'id':           video_id.decode('utf-8'),
1083                                 'url':          video_url.decode('utf-8'),
1084                                 'uploader':     video_uploader.decode('utf-8'),
1085                                 'title':        video_title.decode('utf-8'),
1086                                 'stitle':       video_title.decode('utf-8'),
1087                                 'ext':          video_extension.decode('utf-8'),
1088                         })
1089                 except UnavailableFormatError:
1090                         self._downloader.trouble(u'ERROR: format not available for video')
1091
1092
1093 class YoutubeSearchIE(InfoExtractor):
1094         """Information Extractor for YouTube search queries."""
1095         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1096         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1097         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1098         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1099         _youtube_ie = None
1100         _max_youtube_results = 1000
1101
1102         def __init__(self, youtube_ie, downloader=None):
1103                 InfoExtractor.__init__(self, downloader)
1104                 self._youtube_ie = youtube_ie
1105         
1106         @staticmethod
1107         def suitable(url):
1108                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1109
1110         def report_download_page(self, query, pagenum):
1111                 """Report attempt to download playlist page with given number."""
1112                 self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1113
1114         def _real_initialize(self):
1115                 self._youtube_ie.initialize()
1116         
1117         def _real_extract(self, query):
1118                 mobj = re.match(self._VALID_QUERY, query)
1119                 if mobj is None:
1120                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1121                         return
1122
1123                 prefix, query = query.split(':')
1124                 prefix = prefix[8:]
1125                 if prefix == '':
1126                         self._download_n_results(query, 1)
1127                         return
1128                 elif prefix == 'all':
1129                         self._download_n_results(query, self._max_youtube_results)
1130                         return
1131                 else:
1132                         try:
1133                                 n = long(prefix)
1134                                 if n <= 0:
1135                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1136                                         return
1137                                 elif n > self._max_youtube_results:
1138                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
1139                                         n = self._max_youtube_results
1140                                 self._download_n_results(query, n)
1141                                 return
1142                         except ValueError: # parsing prefix as integer fails
1143                                 self._download_n_results(query, 1)
1144                                 return
1145
1146         def _download_n_results(self, query, n):
1147                 """Downloads a specified number of results for a query"""
1148
1149                 video_ids = []
1150                 already_seen = set()
1151                 pagenum = 1
1152
1153                 while True:
1154                         self.report_download_page(query, pagenum)
1155                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1156                         request = urllib2.Request(result_url, None, std_headers)
1157                         try:
1158                                 page = urllib2.urlopen(request).read()
1159                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1160                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1161                                 return
1162
1163                         # Extract video identifiers
1164                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1165                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1166                                 if video_id not in already_seen:
1167                                         video_ids.append(video_id)
1168                                         already_seen.add(video_id)
1169                                         if len(video_ids) == n:
1170                                                 # Specified n videos reached
1171                                                 for id in video_ids:
1172                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1173                                                 return
1174
1175                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1176                                 for id in video_ids:
1177                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1178                                 return
1179
1180                         pagenum = pagenum + 1
1181
1182 class YoutubePlaylistIE(InfoExtractor):
1183         """Information Extractor for YouTube playlists."""
1184
1185         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:view_play_list|my_playlists)\?.*?p=([^&]+).*'
1186         _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
1187         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
1188         _MORE_PAGES_INDICATOR = r'/view_play_list?p=%s&page=%s'
1189         _youtube_ie = None
1190
1191         def __init__(self, youtube_ie, downloader=None):
1192                 InfoExtractor.__init__(self, downloader)
1193                 self._youtube_ie = youtube_ie
1194         
1195         @staticmethod
1196         def suitable(url):
1197                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
1198
1199         def report_download_page(self, playlist_id, pagenum):
1200                 """Report attempt to download playlist page with given number."""
1201                 self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1202
1203         def _real_initialize(self):
1204                 self._youtube_ie.initialize()
1205         
1206         def _real_extract(self, url):
1207                 # Extract playlist id
1208                 mobj = re.match(self._VALID_URL, url)
1209                 if mobj is None:
1210                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1211                         return
1212
1213                 # Download playlist pages
1214                 playlist_id = mobj.group(1)
1215                 video_ids = []
1216                 pagenum = 1
1217
1218                 while True:
1219                         self.report_download_page(playlist_id, pagenum)
1220                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
1221                         try:
1222                                 page = urllib2.urlopen(request).read()
1223                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1224                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1225                                 return
1226
1227                         # Extract video identifiers
1228                         ids_in_page = []
1229                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1230                                 if mobj.group(1) not in ids_in_page:
1231                                         ids_in_page.append(mobj.group(1))
1232                         video_ids.extend(ids_in_page)
1233
1234                         if (self._MORE_PAGES_INDICATOR % (playlist_id.upper(), pagenum + 1)) not in page:
1235                                 break
1236                         pagenum = pagenum + 1
1237
1238                 for id in video_ids:
1239                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1240                 return
1241
1242 class YoutubeUserIE(InfoExtractor):
1243         """Information Extractor for YouTube users."""
1244
1245         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
1246         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1247         _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
1248         _youtube_ie = None
1249
1250         def __init__(self, youtube_ie, downloader=None):
1251                 InfoExtractor.__init__(self, downloader)
1252                 self._youtube_ie = youtube_ie
1253         
1254         @staticmethod
1255         def suitable(url):
1256                 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
1257
1258         def report_download_page(self, username):
1259                 """Report attempt to download user page."""
1260                 self._downloader.to_stdout(u'[youtube] user %s: Downloading page ' % (username))
1261
1262         def _real_initialize(self):
1263                 self._youtube_ie.initialize()
1264         
1265         def _real_extract(self, url):
1266                 # Extract username
1267                 mobj = re.match(self._VALID_URL, url)
1268                 if mobj is None:
1269                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1270                         return
1271
1272                 # Download user page
1273                 username = mobj.group(1)
1274                 video_ids = []
1275                 pagenum = 1
1276
1277                 self.report_download_page(username)
1278                 request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
1279                 try:
1280                         page = urllib2.urlopen(request).read()
1281                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1282                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1283                         return
1284
1285                 # Extract video identifiers
1286                 ids_in_page = []
1287
1288                 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1289                         if mobj.group(1) not in ids_in_page:
1290                                 ids_in_page.append(mobj.group(1))
1291                 video_ids.extend(ids_in_page)
1292
1293                 for id in video_ids:
1294                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1295                 return
1296
class PostProcessor(object):
	"""Base class for post-download processing steps.

	Instances are registered on a downloader through its
	add_post_processor() method. After a successful download, the
	downloader walks its chain of PostProcessors, calling run() on
	each one — first with an initial argument, then with whatever the
	previous processor returned — and stops as soon as one of them
	returns None or the chain is exhausted.

	Like InfoExtractor objects, PostProcessors take part in a "mutual
	registration" scheme with their downloader.
	"""

	# Downloader this processor is attached to, set via the
	# constructor or set_downloader().
	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Attach this post processor to the given downloader."""
		self._downloader = downloader

	def run(self, information):
		"""Process one downloaded file.

		"information" is a dictionary like the ones composed by
		InfoExtractors, extended with a "filepath" entry pointing at
		the downloaded file.

		Returning None halts the postprocessing chain; returning an
		information dictionary (possibly the received one, with some
		fields changed) passes it along to the next processor in the
		chain. A PostProcessingError exception may also be raised and
		will be handled by the downloader that invoked this processor.
		"""
		# Default behavior: hand the information through unchanged.
		return information
1342         
### MAIN PROGRAM ###
# Command-line entry point: parses options, wires up the info extractors and
# the FileDownloader, then downloads every URL given on the command line
# and/or read from a batch file.
if __name__ == '__main__':
	try:
		# Modules needed only when running the main program
		import getpass
		import optparse

		# Function to update the program file with the latest version from bitbucket.org
		def update_self(downloader, filename):
			# Note: downloader only used for options
			# NOTE(review): the new program text is fetched over plain HTTP
			# with no integrity/signature check — the replacement code is
			# trusted blindly.
			if not os.access (filename, os.W_OK):
				sys.exit('ERROR: no write permissions on %s' % filename)

			downloader.to_stdout('Updating to latest stable version...')
			# LATEST_VERSION names the changeset tag used to build the raw URL below.
			latest_url = 'http://bitbucket.org/rg3/youtube-dl/raw/tip/LATEST_VERSION'
			latest_version = urllib.urlopen(latest_url).read().strip()
			prog_url = 'http://bitbucket.org/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
			newcontent = urllib.urlopen(prog_url).read()
			# NOTE(review): text-mode 'w' — on Windows this would translate
			# line endings in the downloaded script; confirm intended.
			stream = open(filename, 'w')
			stream.write(newcontent)
			stream.close()
			downloader.to_stdout('Updated to version %s' % latest_version)

		# General configuration
		# NOTE: each install_opener() call replaces the previously installed
		# opener, so only the second one (with the cookie processor) is in
		# effect; build_opener() adds default handlers, including proxy
		# support, on its own.
		urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
		urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
		socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

		# Parse command line
		parser = optparse.OptionParser(
			usage='Usage: %prog [options] url...',
			version='2010.01.06',
			conflict_handler='resolve',
		)

		parser.add_option('-h', '--help',
				action='help', help='print this help text and exit')
		parser.add_option('-v', '--version',
				action='version', help='print program version and exit')
		parser.add_option('-U', '--update',
				action='store_true', dest='update_self', help='update this program to latest stable version')
		parser.add_option('-i', '--ignore-errors',
				action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
		parser.add_option('-r', '--rate-limit',
				dest='ratelimit', metavar='L', help='download rate limit (e.g. 50k or 44.6m)')

		authentication = optparse.OptionGroup(parser, 'Authentication Options')
		authentication.add_option('-u', '--username',
				dest='username', metavar='UN', help='account username')
		authentication.add_option('-p', '--password',
				dest='password', metavar='PW', help='account password')
		authentication.add_option('-n', '--netrc',
				action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
		parser.add_option_group(authentication)

		video_format = optparse.OptionGroup(parser, 'Video Format Options')
		video_format.add_option('-f', '--format',
				action='store', dest='format', metavar='FMT', help='video format code')
		# -b/-m/-d are aliases that store fixed format codes into the same dest.
		video_format.add_option('-b', '--best-quality',
				action='store_const', dest='format', help='download the best quality video possible', const='0')
		video_format.add_option('-m', '--mobile-version',
				action='store_const', dest='format', help='alias for -f 17', const='17')
		video_format.add_option('-d', '--high-def',
				action='store_const', dest='format', help='alias for -f 22', const='22')
		parser.add_option_group(video_format)

		verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
		verbosity.add_option('-q', '--quiet',
				action='store_true', dest='quiet', help='activates quiet mode', default=False)
		verbosity.add_option('-s', '--simulate',
				action='store_true', dest='simulate', help='do not download video', default=False)
		verbosity.add_option('-g', '--get-url',
				action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
		verbosity.add_option('-e', '--get-title',
				action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
		parser.add_option_group(verbosity)

		filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
		filesystem.add_option('-t', '--title',
				action='store_true', dest='usetitle', help='use title in file name', default=False)
		filesystem.add_option('-l', '--literal',
				action='store_true', dest='useliteral', help='use literal title in file name', default=False)
		filesystem.add_option('-o', '--output',
				dest='outtmpl', metavar='TPL', help='output filename template')
		filesystem.add_option('-a', '--batch-file',
				dest='batchfile', metavar='F', help='file containing URLs to download')
		filesystem.add_option('-w', '--no-overwrites',
				action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
		filesystem.add_option('-c', '--continue',
				action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
		parser.add_option_group(filesystem)

		(opts, args) = parser.parse_args()
	
		# Batch file verification
		# URLs from the batch file (one per line, blanks dropped) are
		# prepended to the positional arguments.
		batchurls = []
		if opts.batchfile is not None:
			try:
				batchurls = open(opts.batchfile, 'r').readlines()
				batchurls = [x.strip() for x in batchurls]
				batchurls = [x for x in batchurls if len(x) > 0]
			except IOError:
				sys.exit(u'ERROR: batch file could not be read')
		all_urls = batchurls + args

		# Conflicting, missing and erroneous options
		if opts.usenetrc and (opts.username is not None or opts.password is not None):
			parser.error(u'using .netrc conflicts with giving username/password')
		if opts.password is not None and opts.username is None:
			parser.error(u'account username missing')
		if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
			parser.error(u'using output template conflicts with using title or literal title')
		if opts.usetitle and opts.useliteral:
			parser.error(u'using title conflicts with using literal title')
		# Prompt interactively only when a username was supplied without a password.
		if opts.username is not None and opts.password is None:
			opts.password = getpass.getpass(u'Type account password and press return:')
		if opts.ratelimit is not None:
			# Convert "50k"/"44.6m"-style limits to a byte count; None means malformed.
			numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
			if numeric_limit is None:
				parser.error(u'invalid rate limit specified')
			opts.ratelimit = numeric_limit

		# Information extractors
		# The playlist/user/search extractors wrap youtube_ie and delegate
		# individual videos to it; metacafe_ie also delegates YouTube links.
		youtube_ie = YoutubeIE()
		metacafe_ie = MetacafeIE(youtube_ie)
		youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
		youtube_user_ie = YoutubeUserIE(youtube_ie)
		youtube_search_ie = YoutubeSearchIE(youtube_ie)
		google_ie = GoogleIE()
		photobucket_ie = PhotobucketIE()

		# File downloader
		fd = FileDownloader({
			'usenetrc': opts.usenetrc,
			'username': opts.username,
			'password': opts.password,
			# -g and -e imply both quiet and simulate.
			'quiet': (opts.quiet or opts.geturl or opts.gettitle),
			'forceurl': opts.geturl,
			'forcetitle': opts.gettitle,
			'simulate': (opts.simulate or opts.geturl or opts.gettitle),
			'format': opts.format,
			# Output template precedence: explicit -o, then -t (sanitized
			# title), then -l (literal title), then the bare video id.
			'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
				or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
				or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
				or u'%(id)s.%(ext)s'),
			'ignoreerrors': opts.ignoreerrors,
			'ratelimit': opts.ratelimit,
			'nooverwrites': opts.nooverwrites,
			'continuedl': opts.continue_dl,
			})
		# Registration order matters: more specific extractors (search,
		# playlist, user) must be consulted before the generic YouTube one.
		fd.add_info_extractor(youtube_search_ie)
		fd.add_info_extractor(youtube_pl_ie)
		fd.add_info_extractor(youtube_user_ie)
		fd.add_info_extractor(metacafe_ie)
		fd.add_info_extractor(youtube_ie)
		fd.add_info_extractor(google_ie)
		fd.add_info_extractor(photobucket_ie)

		# Update version
		if opts.update_self:
			update_self(fd, sys.argv[0])

		# Maybe do nothing
		# -U with no URLs is a valid invocation; otherwise at least one URL
		# is required.
		if len(all_urls) < 1:
			if not opts.update_self:
				parser.error(u'you must provide at least one URL')
			else:
				sys.exit()
		# download() returns the process exit code (nonzero on any failure).
		retcode = fd.download(all_urls)
		sys.exit(retcode)

	except DownloadError:
		sys.exit(1)
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')