git.bitcoin.ninja Git - youtube-dl/blob - youtube-dl

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3 # Author: Ricardo Garcia Gonzalez
   4 # Author: Danny Colligan
   5 # Author: Benjamin Johnson
   6 # License: Public domain code
   7 import htmlentitydefs
   8 import httplib
   9 import locale
  10 import math
  11 import netrc
  12 import os
  13 import os.path
  14 import re
  15 import socket
  16 import string
  17 import subprocess
  18 import sys
  19 import time
  20 import urllib
  21 import urllib2
  22
  23 # parse_qs was moved from the cgi module to the urlparse module recently.
  24 try:
  25         from urlparse import parse_qs
  26 except ImportError:
  27         from cgi import parse_qs
  28
  29 std_headers = {
  30         'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2) Gecko/20100115 Firefox/3.6',
  31         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
  32         'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
  33         'Accept-Language': 'en-us,en;q=0.5',
  34 }
  35
  36 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
  37
  38 def preferredencoding():
  39         """Get preferred encoding.
  40
  41         Returns the best encoding scheme for the system, based on
  42         locale.getpreferredencoding() and some further tweaks.
  43         """
  44         def yield_preferredencoding():
  45                 try:
  46                         pref = locale.getpreferredencoding()
  47                         u'TEST'.encode(pref)
  48                 except:
  49                         pref = 'UTF-8'
  50                 while True:
  51                         yield pref
  52         return yield_preferredencoding().next()
  53
  54 def htmlentity_transform(matchobj):
  55         """Transforms an HTML entity to a Unicode character.
  56
  57         This function receives a match object and is intended to be used with
  58         the re.sub() function.
  59         """
  60         entity = matchobj.group(1)
  61
  62         # Known non-numeric HTML entity
  63         if entity in htmlentitydefs.name2codepoint:
  64                 return unichr(htmlentitydefs.name2codepoint[entity])
  65
  66         # Unicode character
  67         mobj = re.match(ur'(?u)#(x?\d+)', entity)
  68         if mobj is not None:
  69                 numstr = mobj.group(1)
  70                 if numstr.startswith(u'x'):
  71                         base = 16
  72                         numstr = u'0%s' % numstr
  73                 else:
  74                         base = 10
  75                 return unichr(long(numstr, base))
  76
  77         # Unknown entity in name, return its literal representation
  78         return (u'&%s;' % entity)
  79
  80 def sanitize_title(utitle):
  81         """Sanitizes a video title so it could be used as part of a filename."""
  82         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
  83         return utitle.replace(unicode(os.sep), u'%')
  84
  85 def sanitize_open(filename, open_mode):
  86         """Try to open the given filename, and slightly tweak it if this fails.
  87
  88         Attempts to open the given filename. If this fails, it tries to change
  89         the filename slightly, step by step, until it's either able to open it
  90         or it fails and raises a final exception, like the standard open()
  91         function.
  92
  93         It returns the tuple (stream, definitive_file_name).
  94         """
  95         try:
  96                 if filename == u'-':
  97                         return (sys.stdout, filename)
  98                 stream = open(filename, open_mode)
  99                 return (stream, filename)
 100         except (IOError, OSError), err:
 101                 # In case of error, try to remove win32 forbidden chars
 102                 filename = re.sub(ur'[<>:"\|\?\*]', u'#', filename)
 103
 104                 # An exception here should be caught in the caller
 105                 stream = open(filename, open_mode)
 106                 return (stream, filename)
 107
 108
 109 class DownloadError(Exception):
 110         """Download Error exception.
 111
 112         This exception may be thrown by FileDownloader objects if they are not
 113         configured to continue on errors. They will contain the appropriate
 114         error message.
 115         """
 116         pass
 117
 118 class SameFileError(Exception):
 119         """Same File exception.
 120
 121         This exception will be thrown by FileDownloader objects if they detect
 122         multiple files would have to be downloaded to the same file on disk.
 123         """
 124         pass
 125
 126 class PostProcessingError(Exception):
 127         """Post Processing exception.
 128
 129         This exception may be raised by PostProcessor's .run() method to
 130         indicate an error in the postprocessing task.
 131         """
 132         pass
 133
 134 class UnavailableFormatError(Exception):
 135         """Unavailable Format exception.
 136
 137         This exception will be thrown when a video is requested
 138         in a format that is not available for that video.
 139         """
 140         pass
 141
 142 class ContentTooShortError(Exception):
 143         """Content Too Short exception.
 144
 145         This exception may be raised by FileDownloader objects when a file they
 146         download is too small for what the server announced first, indicating
 147         the connection was probably interrupted.
 148         """
 149         # Both in bytes
 150         downloaded = None
 151         expected = None
 152
 153         def __init__(self, downloaded, expected):
 154                 self.downloaded = downloaded
 155                 self.expected = expected
 156
 157 class FileDownloader(object):
 158         """File Downloader class.
 159
 160         File downloader objects are the ones responsible of downloading the
 161         actual video file and writing it to disk if the user has requested
 162         it, among some other tasks. In most cases there should be one per
 163         program. As, given a video URL, the downloader doesn't know how to
 164         extract all the needed information, task that InfoExtractors do, it
 165         has to pass the URL to one of them.
 166
 167         For this, file downloader objects have a method that allows
 168         InfoExtractors to be registered in a given order. When it is passed
 169         a URL, the file downloader handles it to the first InfoExtractor it
 170         finds that reports being able to handle it. The InfoExtractor extracts
 171         all the information about the video or videos the URL refers to, and
 172         asks the FileDownloader to process the video information, possibly
 173         downloading the video.
 174
 175         File downloaders accept a lot of parameters. In order not to saturate
 176         the object constructor with arguments, it receives a dictionary of
 177         options instead. These options are available through the params
 178         attribute for the InfoExtractors to use. The FileDownloader also
 179         registers itself as the downloader in charge for the InfoExtractors
 180         that are added to it, so this is a "mutual registration".
 181
 182         Available options:
 183
 184         username:       Username for authentication purposes.
 185         password:       Password for authentication purposes.
 186         usenetrc:       Use netrc for authentication instead.
 187         quiet:          Do not print messages to stdout.
 188         forceurl:       Force printing final URL.
 189         forcetitle:     Force printing title.
 190         simulate:       Do not download the video files.
 191         format:         Video format code.
 192         outtmpl:        Template for output names.
 193         ignoreerrors:   Do not stop on download errors.
 194         ratelimit:      Download speed limit, in bytes/sec.
 195         nooverwrites:   Prevent overwriting files.
 196         continuedl:     Try to continue downloads if possible.
 197         noprogress:     Do not print the progress bar.
 198         """
 199
 200         params = None
 201         _ies = []
 202         _pps = []
 203         _download_retcode = None
 204         _num_downloads = None
 205
 206         def __init__(self, params):
 207                 """Create a FileDownloader object with the given options."""
 208                 self._ies = []
 209                 self._pps = []
 210                 self._download_retcode = 0
 211                 self._num_downloads = 0
 212                 self.params = params
 213
 214         @staticmethod
 215         def pmkdir(filename):
 216                 """Create directory components in filename. Similar to Unix "mkdir -p"."""
 217                 components = filename.split(os.sep)
 218                 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
 219                 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
 220                 for dir in aggregate:
 221                         if not os.path.exists(dir):
 222                                 os.mkdir(dir)
 223
 224         @staticmethod
 225         def format_bytes(bytes):
 226                 if bytes is None:
 227                         return 'N/A'
 228                 if type(bytes) is str:
 229                         bytes = float(bytes)
 230                 if bytes == 0.0:
 231                         exponent = 0
 232                 else:
 233                         exponent = long(math.log(bytes, 1024.0))
 234                 suffix = 'bkMGTPEZY'[exponent]
 235                 converted = float(bytes) / float(1024**exponent)
 236                 return '%.2f%s' % (converted, suffix)
 237
 238         @staticmethod
 239         def calc_percent(byte_counter, data_len):
 240                 if data_len is None:
 241                         return '---.-%'
 242                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
 243
 244         @staticmethod
 245         def calc_eta(start, now, total, current):
 246                 if total is None:
 247                         return '--:--'
 248                 dif = now - start
 249                 if current == 0 or dif < 0.001: # One millisecond
 250                         return '--:--'
 251                 rate = float(current) / dif
 252                 eta = long((float(total) - float(current)) / rate)
 253                 (eta_mins, eta_secs) = divmod(eta, 60)
 254                 if eta_mins > 99:
 255                         return '--:--'
 256                 return '%02d:%02d' % (eta_mins, eta_secs)
 257
 258         @staticmethod
 259         def calc_speed(start, now, bytes):
 260                 dif = now - start
 261                 if bytes == 0 or dif < 0.001: # One millisecond
 262                         return '%10s' % '---b/s'
 263                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
 264
 265         @staticmethod
 266         def best_block_size(elapsed_time, bytes):
 267                 new_min = max(bytes / 2.0, 1.0)
 268                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
 269                 if elapsed_time < 0.001:
 270                         return long(new_max)
 271                 rate = bytes / elapsed_time
 272                 if rate > new_max:
 273                         return long(new_max)
 274                 if rate < new_min:
 275                         return long(new_min)
 276                 return long(rate)
 277
 278         @staticmethod
 279         def parse_bytes(bytestr):
 280                 """Parse a string indicating a byte quantity into a long integer."""
 281                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
 282                 if matchobj is None:
 283                         return None
 284                 number = float(matchobj.group(1))
 285                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
 286                 return long(round(number * multiplier))
 287
 288         @staticmethod
 289         def verify_url(url):
 290                 """Verify a URL is valid and data could be downloaded. Return real data URL."""
 291                 request = urllib2.Request(url, None, std_headers)
 292                 data = urllib2.urlopen(request)
 293                 data.read(1)
 294                 url = data.geturl()
 295                 data.close()
 296                 return url
 297
 298         def add_info_extractor(self, ie):
 299                 """Add an InfoExtractor object to the end of the list."""
 300                 self._ies.append(ie)
 301                 ie.set_downloader(self)
 302
 303         def add_post_processor(self, pp):
 304                 """Add a PostProcessor object to the end of the chain."""
 305                 self._pps.append(pp)
 306                 pp.set_downloader(self)
 307
 308         def to_stdout(self, message, skip_eol=False, ignore_encoding_errors=False):
 309                 """Print message to stdout if not in quiet mode."""
 310                 try:
 311                         if not self.params.get('quiet', False):
 312                                 print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(preferredencoding()),
 313                         sys.stdout.flush()
 314                 except (UnicodeEncodeError), err:
 315                         if not ignore_encoding_errors:
 316                                 raise
 317
 318         def to_stderr(self, message):
 319                 """Print message to stderr."""
 320                 print >>sys.stderr, message.encode(preferredencoding())
 321
 322         def fixed_template(self):
 323                 """Checks if the output template is fixed."""
 324                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
 325
 326         def trouble(self, message=None):
 327                 """Determine action to take when a download problem appears.
 328
 329                 Depending on if the downloader has been configured to ignore
 330                 download errors or not, this method may throw an exception or
 331                 not when errors are found, after printing the message.
 332                 """
 333                 if message is not None:
 334                         self.to_stderr(message)
 335                 if not self.params.get('ignoreerrors', False):
 336                         raise DownloadError(message)
 337                 self._download_retcode = 1
 338
 339         def slow_down(self, start_time, byte_counter):
 340                 """Sleep if the download speed is over the rate limit."""
 341                 rate_limit = self.params.get('ratelimit', None)
 342                 if rate_limit is None or byte_counter == 0:
 343                         return
 344                 now = time.time()
 345                 elapsed = now - start_time
 346                 if elapsed <= 0.0:
 347                         return
 348                 speed = float(byte_counter) / elapsed
 349                 if speed > rate_limit:
 350                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
 351
 352         def report_destination(self, filename):
 353                 """Report destination filename."""
 354                 self.to_stdout(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
 355
 356         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
 357                 """Report download progress."""
 358                 if self.params.get('noprogress', False):
 359                         return
 360                 self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
 361                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
 362
 363         def report_resuming_byte(self, resume_len):
 364                 """Report attemtp to resume at given byte."""
 365                 self.to_stdout(u'[download] Resuming download at byte %s' % resume_len)
 366
 367         def report_file_already_downloaded(self, file_name):
 368                 """Report file has already been fully downloaded."""
 369                 try:
 370                         self.to_stdout(u'[download] %s has already been downloaded' % file_name)
 371                 except (UnicodeEncodeError), err:
 372                         self.to_stdout(u'[download] The file has already been downloaded')
 373
 374         def report_unable_to_resume(self):
 375                 """Report it was impossible to resume download."""
 376                 self.to_stdout(u'[download] Unable to resume')
 377
 378         def report_finish(self):
 379                 """Report download finished."""
 380                 if self.params.get('noprogress', False):
 381                         self.to_stdout(u'[download] Download completed')
 382                 else:
 383                         self.to_stdout(u'')
 384
 385         def process_info(self, info_dict):
 386                 """Process a single dictionary returned by an InfoExtractor."""
 387                 # Do nothing else if in simulate mode
 388                 if self.params.get('simulate', False):
 389                         # Verify URL if it's an HTTP one
 390                         if info_dict['url'].startswith('http'):
 391                                 try:
 392                                         self.verify_url(info_dict['url'].encode('utf-8')).decode('utf-8')
 393                                 except (OSError, IOError, urllib2.URLError, httplib.HTTPException, socket.error), err:
 394                                         raise UnavailableFormatError
 395
 396                         # Forced printings
 397                         if self.params.get('forcetitle', False):
 398                                 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
 399                         if self.params.get('forceurl', False):
 400                                 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
 401                         if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
 402                                 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
 403                         if self.params.get('forcedescription', False) and 'description' in info_dict:
 404                                 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
 405
 406                         return
 407
 408                 try:
 409                         template_dict = dict(info_dict)
 410                         template_dict['epoch'] = unicode(long(time.time()))
 411                         template_dict['ord'] = unicode('%05d' % self._num_downloads)
 412                         filename = self.params['outtmpl'] % template_dict
 413                 except (ValueError, KeyError), err:
 414                         self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
 415                 if self.params.get('nooverwrites', False) and os.path.exists(filename):
 416                         self.to_stderr(u'WARNING: file exists: %s; skipping' % filename)
 417                         return
 418
 419                 try:
 420                         self.pmkdir(filename)
 421                 except (OSError, IOError), err:
 422                         self.trouble('ERROR: unable to create directories: %s' % str(err))
 423                         return
 424
 425                 try:
 426                         success = self._do_download(filename, info_dict['url'].encode('utf-8'))
 427                 except (OSError, IOError), err:
 428                         raise UnavailableFormatError
 429                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 430                         self.trouble('ERROR: unable to download video data: %s' % str(err))
 431                         return
 432                 except (ContentTooShortError, ), err:
 433                         self.trouble('ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
 434                         return
 435
 436                 if success:
 437                         try:
 438                                 self.post_process(filename, info_dict)
 439                         except (PostProcessingError), err:
 440                                 self.trouble('ERROR: postprocessing: %s' % str(err))
 441                                 return
 442
 443         def download(self, url_list):
 444                 """Download a given list of URLs."""
 445                 if len(url_list) > 1 and self.fixed_template():
 446                         raise SameFileError(self.params['outtmpl'])
 447
 448                 for url in url_list:
 449                         suitable_found = False
 450                         for ie in self._ies:
 451                                 # Go to next InfoExtractor if not suitable
 452                                 if not ie.suitable(url):
 453                                         continue
 454
 455                                 # Suitable InfoExtractor found
 456                                 suitable_found = True
 457
 458                                 # Extract information from URL and process it
 459                                 ie.extract(url)
 460
 461                                 # Suitable InfoExtractor had been found; go to next URL
 462                                 break
 463
 464                         if not suitable_found:
 465                                 self.trouble('ERROR: no suitable InfoExtractor: %s' % url)
 466
 467                 return self._download_retcode
 468
 469         def post_process(self, filename, ie_info):
 470                 """Run the postprocessing chain on the given file."""
 471                 info = dict(ie_info)
 472                 info['filepath'] = filename
 473                 for pp in self._pps:
 474                         info = pp.run(info)
 475                         if info is None:
 476                                 break
 477
 478         def _download_with_rtmpdump(self, filename, url):
 479                 self.report_destination(filename)
 480
 481                 # Check for rtmpdump first
 482                 try:
 483                         subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
 484                 except (OSError, IOError):
 485                         self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
 486                         return False
 487
 488                 # Download using rtmpdump. rtmpdump returns exit code 2 when
 489                 # the connection was interrumpted and resuming appears to be
 490                 # possible. This is part of rtmpdump's normal usage, AFAIK.
 491                 basic_args = ['rtmpdump', '-q', '-r', url, '-o', filename]
 492                 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
 493                 while retval == 2 or retval == 1:
 494                         self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename), skip_eol=True)
 495                         time.sleep(2.0) # This seems to be needed
 496                         retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
 497                 if retval == 0:
 498                         self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename))
 499                         return True
 500                 else:
 501                         self.trouble('\nERROR: rtmpdump exited with code %d' % retval)
 502                         return False
 503
 504         def _do_download(self, filename, url):
 505                 # Attempt to download using rtmpdump
 506                 if url.startswith('rtmp'):
 507                         return self._download_with_rtmpdump(filename, url)
 508
 509                 stream = None
 510                 open_mode = 'wb'
 511                 basic_request = urllib2.Request(url, None, std_headers)
 512                 request = urllib2.Request(url, None, std_headers)
 513
 514                 # Establish possible resume length
 515                 if os.path.isfile(filename):
 516                         resume_len = os.path.getsize(filename)
 517                 else:
 518                         resume_len = 0
 519
 520                 # Request parameters in case of being able to resume
 521                 if self.params.get('continuedl', False) and resume_len != 0:
 522                         self.report_resuming_byte(resume_len)
 523                         request.add_header('Range','bytes=%d-' % resume_len)
 524                         open_mode = 'ab'
 525
 526                 # Establish connection
 527                 try:
 528                         data = urllib2.urlopen(request)
 529                 except (urllib2.HTTPError, ), err:
 530                         if err.code != 416: #  416 is 'Requested range not satisfiable'
 531                                 raise
 532                         # Unable to resume
 533                         data = urllib2.urlopen(basic_request)
 534                         content_length = data.info()['Content-Length']
 535
 536                         if content_length is not None and long(content_length) == resume_len:
 537                                 # Because the file had already been fully downloaded
 538                                 self.report_file_already_downloaded(filename)
 539                                 return True
 540                         else:
 541                                 # Because the server didn't let us
 542                                 self.report_unable_to_resume()
 543                                 open_mode = 'wb'
 544
 545                 data_len = data.info().get('Content-length', None)
 546                 data_len_str = self.format_bytes(data_len)
 547                 byte_counter = 0
 548                 block_size = 1024
 549                 start = time.time()
 550                 while True:
 551                         # Download and write
 552                         before = time.time()
 553                         data_block = data.read(block_size)
 554                         after = time.time()
 555                         data_block_len = len(data_block)
 556                         if data_block_len == 0:
 557                                 break
 558                         byte_counter += data_block_len
 559
 560                         # Open file just in time
 561                         if stream is None:
 562                                 try:
 563                                         (stream, filename) = sanitize_open(filename, open_mode)
 564                                         self.report_destination(filename)
 565                                         self._num_downloads += 1
 566                                 except (OSError, IOError), err:
 567                                         self.trouble('ERROR: unable to open for writing: %s' % str(err))
 568                                         return False
 569                         try:
 570                                 stream.write(data_block)
 571                         except (IOError, OSError), err:
 572                                 self.trouble('\nERROR: unable to write data: %s' % str(err))
 573                         block_size = self.best_block_size(after - before, data_block_len)
 574
 575                         # Progress message
 576                         percent_str = self.calc_percent(byte_counter, data_len)
 577                         eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
 578                         speed_str = self.calc_speed(start, time.time(), byte_counter)
 579                         self.report_progress(percent_str, data_len_str, speed_str, eta_str)
 580
 581                         # Apply rate limit
 582                         self.slow_down(start, byte_counter)
 583
 584                 self.report_finish()
 585                 if data_len is not None and str(byte_counter) != data_len:
 586                         raise ContentTooShortError(byte_counter, long(data_len))
 587                 return True
 588
 589 class InfoExtractor(object):
 590         """Information Extractor class.
 591
 592         Information extractors are the classes that, given a URL, extract
 593         information from the video (or videos) the URL refers to. This
 594         information includes the real video URL, the video title and simplified
 595         title, author and others. The information is stored in a dictionary
 596         which is then passed to the FileDownloader. The FileDownloader
 597         processes this information possibly downloading the video to the file
 598         system, among other possible outcomes. The dictionaries must include
 599         the following fields:
 600
 601         id:             Video identifier.
 602         url:            Final video URL.
 603         uploader:       Nickname of the video uploader.
 604         title:          Literal title.
 605         stitle:         Simplified title.
 606         ext:            Video filename extension.
 607         format:         Video format.
 608
 609         The following fields are optional. Their primary purpose is to allow
 610         youtube-dl to serve as the backend for a video search function, such
 611         as the one in youtube2mp3.  They are only used when their respective
 612         forced printing functions are called:
 613
 614         thumbnail:      Full URL to a video thumbnail image.
 615         description:    One-line video description.
 616
 617         Subclasses of this one should re-define the _real_initialize() and
 618         _real_extract() methods, as well as the suitable() static method.
 619         Probably, they should also be instantiated and added to the main
 620         downloader.
 621         """
 622
 623         _ready = False
 624         _downloader = None
 625
 626         def __init__(self, downloader=None):
 627                 """Constructor. Receives an optional downloader."""
 628                 self._ready = False
 629                 self.set_downloader(downloader)
 630
 631         @staticmethod
 632         def suitable(url):
 633                 """Receives a URL and returns True if suitable for this IE."""
 634                 return False
 635
 636         def initialize(self):
 637                 """Initializes an instance (authentication, etc)."""
 638                 if not self._ready:
 639                         self._real_initialize()
 640                         self._ready = True
 641
 642         def extract(self, url):
 643                 """Extracts URL information and returns it in list of dicts."""
 644                 self.initialize()
 645                 return self._real_extract(url)
 646
 647         def set_downloader(self, downloader):
 648                 """Sets the downloader for this IE."""
 649                 self._downloader = downloader
 650
 651         def _real_initialize(self):
 652                 """Real initialization process. Redefine in subclasses."""
 653                 pass
 654
 655         def _real_extract(self, url):
 656                 """Real extraction process. Redefine in subclasses."""
 657                 pass
 658
 659 class YoutubeIE(InfoExtractor):
 660         """Information extractor for youtube.com."""
 661
 662         _VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?[\?#](?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
 663         _LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
 664         _LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
 665         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
 666         _NETRC_MACHINE = 'youtube'
 667         _available_formats = ['37', '22', '35', '18', '34', '5', '17', '13', None] # listed in order of priority for -b flag
 668         _video_extensions = {
 669                 '13': '3gp',
 670                 '17': 'mp4',
 671                 '18': 'mp4',
 672                 '22': 'mp4',
 673                 '37': 'mp4',
 674         }
 675
 676         @staticmethod
 677         def suitable(url):
 678                 return (re.match(YoutubeIE._VALID_URL, url) is not None)
 679
 680         def report_lang(self):
 681                 """Report attempt to set language."""
 682                 self._downloader.to_stdout(u'[youtube] Setting language')
 683
 684         def report_login(self):
 685                 """Report attempt to log in."""
 686                 self._downloader.to_stdout(u'[youtube] Logging in')
 687
 688         def report_age_confirmation(self):
 689                 """Report attempt to confirm age."""
 690                 self._downloader.to_stdout(u'[youtube] Confirming age')
 691
 692         def report_video_info_webpage_download(self, video_id):
 693                 """Report attempt to download video info webpage."""
 694                 self._downloader.to_stdout(u'[youtube] %s: Downloading video info webpage' % video_id)
 695
 696         def report_information_extraction(self, video_id):
 697                 """Report attempt to extract video information."""
 698                 self._downloader.to_stdout(u'[youtube] %s: Extracting video information' % video_id)
 699
 700         def report_unavailable_format(self, video_id, format):
 701                 """Report extracted video URL."""
 702                 self._downloader.to_stdout(u'[youtube] %s: Format %s not available' % (video_id, format))
 703
 704         def report_rtmp_download(self):
 705                 """Indicate the download will use the RTMP protocol."""
 706                 self._downloader.to_stdout(u'[youtube] RTMP download detected')
 707
 708         def _real_initialize(self):
 709                 if self._downloader is None:
 710                         return
 711
 712                 username = None
 713                 password = None
 714                 downloader_params = self._downloader.params
 715
 716                 # Attempt to use provided username and password or .netrc data
 717                 if downloader_params.get('username', None) is not None:
 718                         username = downloader_params['username']
 719                         password = downloader_params['password']
 720                 elif downloader_params.get('usenetrc', False):
 721                         try:
 722                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 723                                 if info is not None:
 724                                         username = info[0]
 725                                         password = info[2]
 726                                 else:
 727                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 728                         except (IOError, netrc.NetrcParseError), err:
 729                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
 730                                 return
 731
 732                 # Set language
 733                 request = urllib2.Request(self._LANG_URL, None, std_headers)
 734                 try:
 735                         self.report_lang()
 736                         urllib2.urlopen(request).read()
 737                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 738                         self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
 739                         return
 740
 741                 # No authentication to be performed
 742                 if username is None:
 743                         return
 744
 745                 # Log in
 746                 login_form = {
 747                                 'current_form': 'loginForm',
 748                                 'next':         '/',
 749                                 'action_login': 'Log In',
 750                                 'username':     username,
 751                                 'password':     password,
 752                                 }
 753                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
 754                 try:
 755                         self.report_login()
 756                         login_results = urllib2.urlopen(request).read()
 757                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
 758                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
 759                                 return
 760                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 761                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
 762                         return
 763
 764                 # Confirm age
 765                 age_form = {
 766                                 'next_url':             '/',
 767                                 'action_confirm':       'Confirm',
 768                                 }
 769                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
 770                 try:
 771                         self.report_age_confirmation()
 772                         age_results = urllib2.urlopen(request).read()
 773                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 774                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
 775                         return
 776
 777         def _real_extract(self, url):
 778                 # Extract video id from URL
 779                 mobj = re.match(self._VALID_URL, url)
 780                 if mobj is None:
 781                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 782                         return
 783                 video_id = mobj.group(2)
 784
 785                 # Downloader parameters
 786                 best_quality = False
 787                 all_formats = False
 788                 format_param = None
 789                 quality_index = 0
 790                 if self._downloader is not None:
 791                         params = self._downloader.params
 792                         format_param = params.get('format', None)
 793                         if format_param == '0':
 794                                 format_param = self._available_formats[quality_index]
 795                                 best_quality = True
 796                         elif format_param == '-1':
 797                                 format_param = self._available_formats[quality_index]
 798                                 all_formats = True
 799
 800                 while True:
 801                         # Extension
 802                         video_extension = self._video_extensions.get(format_param, 'flv')
 803
 804                         # Get video info
 805                         self.report_video_info_webpage_download(video_id)
 806                         for el_type in ['embedded', 'detailpage', 'vevo']:
 807                                 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s&el=%s&ps=default&eurl=&gl=US&hl=en'
 808                                                    % (video_id, el_type))
 809                                 request = urllib2.Request(video_info_url, None, std_headers)
 810                                 try:
 811                                         video_info_webpage = urllib2.urlopen(request).read()
 812                                         video_info = parse_qs(video_info_webpage)
 813                                         if 'token' in video_info:
 814                                                 break
 815                                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 816                                         self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
 817                                         return
 818                         self.report_information_extraction(video_id)
 819
 820                         # "t" param
 821                         if 'token' not in video_info:
 822                                 # Attempt to see if YouTube has issued an error message
 823                                 if 'reason' not in video_info:
 824                                         self._downloader.trouble(u'ERROR: unable to extract "t" parameter for unknown reason')
 825                                         stream = open('reportme-ydl-%s.dat' % time.time(), 'wb')
 826                                         stream.write(video_info_webpage)
 827                                         stream.close()
 828                                 else:
 829                                         reason = urllib.unquote_plus(video_info['reason'][0])
 830                                         self._downloader.trouble(u'ERROR: YouTube said: %s' % reason.decode('utf-8'))
 831                                 return
 832                         token = urllib.unquote_plus(video_info['token'][0])
 833                         video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=detailpage&ps=default&gl=US&hl=en' % (video_id, token)
 834                         if format_param is not None:
 835                                 video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
 836
 837                         # Check possible RTMP download
 838                         if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
 839                                 self.report_rtmp_download()
 840                                 video_real_url = video_info['conn'][0]
 841
 842                         # uploader
 843                         if 'author' not in video_info:
 844                                 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
 845                                 return
 846                         video_uploader = urllib.unquote_plus(video_info['author'][0])
 847
 848                         # title
 849                         if 'title' not in video_info:
 850                                 self._downloader.trouble(u'ERROR: unable to extract video title')
 851                                 return
 852                         video_title = urllib.unquote_plus(video_info['title'][0])
 853                         video_title = video_title.decode('utf-8')
 854                         video_title = sanitize_title(video_title)
 855
 856                         # simplified title
 857                         simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
 858                         simple_title = simple_title.strip(ur'_')
 859
 860                         # thumbnail image
 861                         if 'thumbnail_url' not in video_info:
 862                                 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
 863                                 video_thumbnail = ''
 864                         else:   # don't panic if we can't find it
 865                                 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
 866
 867                         # get video description
 868                         video_description = 'No description available.'    # we need something to pass to self._downloader
 869                         # this requires an additional HTTP request and a little
 870                         # more time, so don't do it unless absolutely necessary
 871                         if self._downloader.params.get('forcedescription', False):
 872                                 video_page_url = 'http://www.youtube.com/watch?v=' + video_id
 873                                 request = urllib2.Request(video_page_url, None, std_headers)
 874                                 try:
 875                                         video_page_webpage = urllib2.urlopen(request).read()
 876                                         mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_page_webpage)
 877                                         if mobj is not None:
 878                                                 video_description = mobj.group(1)
 879                                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 880                                         pass    # don't panic if we can't find it
 881
 882                         try:
 883                                 # Process video information
 884                                 self._downloader.process_info({
 885                                         'id':           video_id.decode('utf-8'),
 886                                         'url':          video_real_url.decode('utf-8'),
 887                                         'uploader':     video_uploader.decode('utf-8'),
 888                                         'title':        video_title,
 889                                         'stitle':       simple_title,
 890                                         'ext':          video_extension.decode('utf-8'),
 891                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
 892                                         'thumbnail':    video_thumbnail.decode('utf-8'),
 893                                         'description':  video_description.decode('utf-8'),
 894                                 })
 895
 896                                 if all_formats:
 897                                         if quality_index == len(self._available_formats):
 898                                                 # None left to get
 899                                                 return
 900                                         else:
 901                                                 quality_index += 1
 902                                                 format_param = self._available_formats[quality_index]
 903                                                 continue
 904                                 return
 905
 906                         except UnavailableFormatError, err:
 907                                 if best_quality or all_formats:
 908                                         if quality_index == len(self._available_formats):
 909                                                 # I don't ever expect this to happen
 910                                                 if not all_formats:
 911                                                         self._downloader.trouble(u'ERROR: no known formats available for video')
 912                                                 return
 913                                         else:
 914                                                 self.report_unavailable_format(video_id, format_param)
 915                                                 quality_index += 1
 916                                                 format_param = self._available_formats[quality_index]
 917                                                 continue
 918                                 else:
 919                                         self._downloader.trouble('ERROR: format not available for video')
 920                                         return
 921
 922
 923 class MetacafeIE(InfoExtractor):
 924         """Information Extractor for metacafe.com."""
 925
 926         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
 927         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
 928         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
 929         _youtube_ie = None
 930
 931         def __init__(self, youtube_ie, downloader=None):
 932                 InfoExtractor.__init__(self, downloader)
 933                 self._youtube_ie = youtube_ie
 934
 935         @staticmethod
 936         def suitable(url):
 937                 return (re.match(MetacafeIE._VALID_URL, url) is not None)
 938
 939         def report_disclaimer(self):
 940                 """Report disclaimer retrieval."""
 941                 self._downloader.to_stdout(u'[metacafe] Retrieving disclaimer')
 942
 943         def report_age_confirmation(self):
 944                 """Report attempt to confirm age."""
 945                 self._downloader.to_stdout(u'[metacafe] Confirming age')
 946
 947         def report_download_webpage(self, video_id):
 948                 """Report webpage download."""
 949                 self._downloader.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)
 950
 951         def report_extraction(self, video_id):
 952                 """Report information extraction."""
 953                 self._downloader.to_stdout(u'[metacafe] %s: Extracting information' % video_id)
 954
 955         def _real_initialize(self):
 956                 # Retrieve disclaimer
 957                 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
 958                 try:
 959                         self.report_disclaimer()
 960                         disclaimer = urllib2.urlopen(request).read()
 961                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 962                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
 963                         return
 964
 965                 # Confirm age
 966                 disclaimer_form = {
 967                         'filters': '0',
 968                         'submit': "Continue - I'm over 18",
 969                         }
 970                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
 971                 try:
 972                         self.report_age_confirmation()
 973                         disclaimer = urllib2.urlopen(request).read()
 974                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 975                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
 976                         return
 977
 978         def _real_extract(self, url):
 979                 # Extract id and simplified title from URL
 980                 mobj = re.match(self._VALID_URL, url)
 981                 if mobj is None:
 982                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 983                         return
 984
 985                 video_id = mobj.group(1)
 986
 987                 # Check if video comes from YouTube
 988                 mobj2 = re.match(r'^yt-(.*)$', video_id)
 989                 if mobj2 is not None:
 990                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
 991                         return
 992
 993                 simple_title = mobj.group(2).decode('utf-8')
 994                 video_extension = 'flv'
 995
 996                 # Retrieve video webpage to extract further information
 997                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
 998                 try:
 999                         self.report_download_webpage(video_id)
1000                         webpage = urllib2.urlopen(request).read()
1001                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1002                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1003                         return
1004
1005                 # Extract URL, uploader and title from webpage
1006                 self.report_extraction(video_id)
1007                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1008                 if mobj is None:
1009                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1010                         return
1011                 mediaURL = urllib.unquote(mobj.group(1))
1012
1013                 #mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1014                 #if mobj is None:
1015                 #       self._downloader.trouble(u'ERROR: unable to extract gdaKey')
1016                 #       return
1017                 #gdaKey = mobj.group(1)
1018                 #
1019                 #video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1020
1021                 video_url = mediaURL
1022
1023                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1024                 if mobj is None:
1025                         self._downloader.trouble(u'ERROR: unable to extract title')
1026                         return
1027                 video_title = mobj.group(1).decode('utf-8')
1028                 video_title = sanitize_title(video_title)
1029
1030                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1031                 if mobj is None:
1032                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1033                         return
1034                 video_uploader = mobj.group(1)
1035
1036                 try:
1037                         # Process video information
1038                         self._downloader.process_info({
1039                                 'id':           video_id.decode('utf-8'),
1040                                 'url':          video_url.decode('utf-8'),
1041                                 'uploader':     video_uploader.decode('utf-8'),
1042                                 'title':        video_title,
1043                                 'stitle':       simple_title,
1044                                 'ext':          video_extension.decode('utf-8'),
1045                                 'format':       u'NA',
1046                         })
1047                 except UnavailableFormatError:
1048                         self._downloader.trouble(u'ERROR: format not available for video')
1049
1050
1051 class GoogleIE(InfoExtractor):
1052         """Information extractor for video.google.com."""
1053
1054         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1055
1056         def __init__(self, downloader=None):
1057                 InfoExtractor.__init__(self, downloader)
1058
1059         @staticmethod
1060         def suitable(url):
1061                 return (re.match(GoogleIE._VALID_URL, url) is not None)
1062
1063         def report_download_webpage(self, video_id):
1064                 """Report webpage download."""
1065                 self._downloader.to_stdout(u'[video.google] %s: Downloading webpage' % video_id)
1066
1067         def report_extraction(self, video_id):
1068                 """Report information extraction."""
1069                 self._downloader.to_stdout(u'[video.google] %s: Extracting information' % video_id)
1070
1071         def _real_initialize(self):
1072                 return
1073
1074         def _real_extract(self, url):
1075                 # Extract id from URL
1076                 mobj = re.match(self._VALID_URL, url)
1077                 if mobj is None:
1078                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1079                         return
1080
1081                 video_id = mobj.group(1)
1082
1083                 video_extension = 'mp4'
1084
1085                 # Retrieve video webpage to extract further information
1086                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1087                 try:
1088                         self.report_download_webpage(video_id)
1089                         webpage = urllib2.urlopen(request).read()
1090                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1091                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1092                         return
1093
1094                 # Extract URL, uploader, and title from webpage
1095                 self.report_extraction(video_id)
1096                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1097                 if mobj is None:
1098                         video_extension = 'flv'
1099                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1100                 if mobj is None:
1101                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1102                         return
1103                 mediaURL = urllib.unquote(mobj.group(1))
1104                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1105                 mediaURL = mediaURL.replace('\\x26', '\x26')
1106
1107                 video_url = mediaURL
1108
1109                 mobj = re.search(r'<title>(.*)</title>', webpage)
1110                 if mobj is None:
1111                         self._downloader.trouble(u'ERROR: unable to extract title')
1112                         return
1113                 video_title = mobj.group(1).decode('utf-8')
1114                 video_title = sanitize_title(video_title)
1115                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1116
1117                 # Extract video description
1118                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1119                 if mobj is None:
1120                         self._downloader.trouble(u'ERROR: unable to extract video description')
1121                         return
1122                 video_description = mobj.group(1).decode('utf-8')
1123                 if not video_description:
1124                         video_description = 'No description available.'
1125
1126                 # Extract video thumbnail
1127                 if self._downloader.params.get('forcethumbnail', False):
1128                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1129                         try:
1130                                 webpage = urllib2.urlopen(request).read()
1131                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1132                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1133                                 return
1134                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1135                         if mobj is None:
1136                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1137                                 return
1138                         video_thumbnail = mobj.group(1)
1139                 else:   # we need something to pass to process_info
1140                         video_thumbnail = ''
1141
1142
1143                 try:
1144                         # Process video information
1145                         self._downloader.process_info({
1146                                 'id':           video_id.decode('utf-8'),
1147                                 'url':          video_url.decode('utf-8'),
1148                                 'uploader':     u'NA',
1149                                 'title':        video_title,
1150                                 'stitle':       simple_title,
1151                                 'ext':          video_extension.decode('utf-8'),
1152                                 'format':       u'NA',
1153                         })
1154                 except UnavailableFormatError:
1155                         self._downloader.trouble(u'ERROR: format not available for video')
1156
1157
1158 class PhotobucketIE(InfoExtractor):
1159         """Information extractor for photobucket.com."""
1160
1161         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1162
1163         def __init__(self, downloader=None):
1164                 InfoExtractor.__init__(self, downloader)
1165
1166         @staticmethod
1167         def suitable(url):
1168                 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1169
1170         def report_download_webpage(self, video_id):
1171                 """Report webpage download."""
1172                 self._downloader.to_stdout(u'[photobucket] %s: Downloading webpage' % video_id)
1173
1174         def report_extraction(self, video_id):
1175                 """Report information extraction."""
1176                 self._downloader.to_stdout(u'[photobucket] %s: Extracting information' % video_id)
1177
1178         def _real_initialize(self):
1179                 return
1180
1181         def _real_extract(self, url):
1182                 # Extract id from URL
1183                 mobj = re.match(self._VALID_URL, url)
1184                 if mobj is None:
1185                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1186                         return
1187
1188                 video_id = mobj.group(1)
1189
1190                 video_extension = 'flv'
1191
1192                 # Retrieve video webpage to extract further information
1193                 request = urllib2.Request(url)
1194                 try:
1195                         self.report_download_webpage(video_id)
1196                         webpage = urllib2.urlopen(request).read()
1197                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1198                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1199                         return
1200
1201                 # Extract URL, uploader, and title from webpage
1202                 self.report_extraction(video_id)
1203                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1204                 if mobj is None:
1205                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1206                         return
1207                 mediaURL = urllib.unquote(mobj.group(1))
1208
1209                 video_url = mediaURL
1210
1211                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1212                 if mobj is None:
1213                         self._downloader.trouble(u'ERROR: unable to extract title')
1214                         return
1215                 video_title = mobj.group(1).decode('utf-8')
1216                 video_title = sanitize_title(video_title)
1217                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1218
1219                 video_uploader = mobj.group(2).decode('utf-8')
1220
1221                 try:
1222                         # Process video information
1223                         self._downloader.process_info({
1224                                 'id':           video_id.decode('utf-8'),
1225                                 'url':          video_url.decode('utf-8'),
1226                                 'uploader':     video_uploader,
1227                                 'title':        video_title,
1228                                 'stitle':       simple_title,
1229                                 'ext':          video_extension.decode('utf-8'),
1230                                 'format':       u'NA',
1231                         })
1232                 except UnavailableFormatError:
1233                         self._downloader.trouble(u'ERROR: format not available for video')
1234
1235
1236 class YahooIE(InfoExtractor):
1237         """Information extractor for video.yahoo.com."""
1238
1239         # _VALID_URL matches all Yahoo! Video URLs
1240         # _VPAGE_URL matches only the extractable '/watch/' URLs
1241         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1242         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1243
1244         def __init__(self, downloader=None):
1245                 InfoExtractor.__init__(self, downloader)
1246
1247         @staticmethod
1248         def suitable(url):
1249                 return (re.match(YahooIE._VALID_URL, url) is not None)
1250
1251         def report_download_webpage(self, video_id):
1252                 """Report webpage download."""
1253                 self._downloader.to_stdout(u'[video.yahoo] %s: Downloading webpage' % video_id)
1254
1255         def report_extraction(self, video_id):
1256                 """Report information extraction."""
1257                 self._downloader.to_stdout(u'[video.yahoo] %s: Extracting information' % video_id)
1258
1259         def _real_initialize(self):
1260                 return
1261
1262         def _real_extract(self, url):
1263                 # Extract ID from URL
1264                 mobj = re.match(self._VALID_URL, url)
1265                 if mobj is None:
1266                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1267                         return
1268
1269                 video_id = mobj.group(2)
1270                 video_extension = 'flv'
1271
1272                 # Rewrite valid but non-extractable URLs as
1273                 # extractable English language /watch/ URLs
1274                 if re.match(self._VPAGE_URL, url) is None:
1275                         request = urllib2.Request(url)
1276                         try:
1277                                 webpage = urllib2.urlopen(request).read()
1278                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1279                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1280                                 return
1281
1282                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1283                         if mobj is None:
1284                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1285                                 return
1286                         yahoo_id = mobj.group(1)
1287
1288                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1289                         if mobj is None:
1290                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1291                                 return
1292                         yahoo_vid = mobj.group(1)
1293
1294                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1295                         return self._real_extract(url)
1296
1297                 # Retrieve video webpage to extract further information
1298                 request = urllib2.Request(url)
1299                 try:
1300                         self.report_download_webpage(video_id)
1301                         webpage = urllib2.urlopen(request).read()
1302                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1303                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1304                         return
1305
1306                 # Extract uploader and title from webpage
1307                 self.report_extraction(video_id)
1308                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1309                 if mobj is None:
1310                         self._downloader.trouble(u'ERROR: unable to extract video title')
1311                         return
1312                 video_title = mobj.group(1).decode('utf-8')
1313                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1314
1315                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1316                 if mobj is None:
1317                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1318                         return
1319                 video_uploader = mobj.group(1).decode('utf-8')
1320
1321                 # Extract video thumbnail
1322                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1323                 if mobj is None:
1324                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1325                         return
1326                 video_thumbnail = mobj.group(1).decode('utf-8')
1327
1328                 # Extract video description
1329                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1330                 if mobj is None:
1331                         self._downloader.trouble(u'ERROR: unable to extract video description')
1332                         return
1333                 video_description = mobj.group(1).decode('utf-8')
1334                 if not video_description: video_description = 'No description available.'
1335
1336                 # Extract video height and width
1337                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1338                 if mobj is None:
1339                         self._downloader.trouble(u'ERROR: unable to extract video height')
1340                         return
1341                 yv_video_height = mobj.group(1)
1342
1343                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1344                 if mobj is None:
1345                         self._downloader.trouble(u'ERROR: unable to extract video width')
1346                         return
1347                 yv_video_width = mobj.group(1)
1348
1349                 # Retrieve video playlist to extract media URL
1350                 # I'm not completely sure what all these options are, but we
1351                 # seem to need most of them, otherwise the server sends a 401.
1352                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1353                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1354                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1355                                           '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1356                                           '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1357                 try:
1358                         self.report_download_webpage(video_id)
1359                         webpage = urllib2.urlopen(request).read()
1360                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1361                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1362                         return
1363
1364                 # Extract media URL from playlist XML
1365                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1366                 if mobj is None:
1367                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1368                         return
1369                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1370                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1371
1372                 try:
1373                         # Process video information
1374                         self._downloader.process_info({
1375                                 'id':           video_id.decode('utf-8'),
1376                                 'url':          video_url,
1377                                 'uploader':     video_uploader,
1378                                 'title':        video_title,
1379                                 'stitle':       simple_title,
1380                                 'ext':          video_extension.decode('utf-8'),
1381                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1382                                 'description':  video_description,
1383                                 'thumbnail':    video_thumbnail,
1384                                 'description':  video_description,
1385                         })
1386                 except UnavailableFormatError:
1387                         self._downloader.trouble(u'ERROR: format not available for video')
1388
1389
1390 class GenericIE(InfoExtractor):
1391         """Generic last-resort information extractor."""
1392
1393         def __init__(self, downloader=None):
1394                 InfoExtractor.__init__(self, downloader)
1395
1396         @staticmethod
1397         def suitable(url):
1398                 return True
1399
1400         def report_download_webpage(self, video_id):
1401                 """Report webpage download."""
1402                 self._downloader.to_stdout(u'WARNING: Falling back on generic information extractor.')
1403                 self._downloader.to_stdout(u'[generic] %s: Downloading webpage' % video_id)
1404
1405         def report_extraction(self, video_id):
1406                 """Report information extraction."""
1407                 self._downloader.to_stdout(u'[generic] %s: Extracting information' % video_id)
1408
1409         def _real_initialize(self):
1410                 return
1411
1412         def _real_extract(self, url):
1413                 video_id = url.split('/')[-1]
1414                 request = urllib2.Request(url)
1415                 try:
1416                         self.report_download_webpage(video_id)
1417                         webpage = urllib2.urlopen(request).read()
1418                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1419                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1420                         return
1421                 except ValueError, err:
1422                         # since this is the last-resort InfoExtractor, if
1423                         # this error is thrown, it'll be thrown here
1424                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1425                         return
1426
1427                 # Start with something easy: JW Player in SWFObject
1428                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1429                 if mobj is None:
1430                         # Broaden the search a little bit
1431                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1432                 if mobj is None:
1433                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1434                         return
1435
1436                 # It's possible that one of the regexes
1437                 # matched, but returned an empty group:
1438                 if mobj.group(1) is None:
1439                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1440                         return
1441
1442                 video_url = urllib.unquote(mobj.group(1))
1443                 video_id  = os.path.basename(video_url)
1444
1445                 # here's a fun little line of code for you:
1446                 video_extension = os.path.splitext(video_id)[1][1:]
1447                 video_id        = os.path.splitext(video_id)[0]
1448
1449                 # it's tempting to parse this further, but you would
1450                 # have to take into account all the variations like
1451                 #   Video Title - Site Name
1452                 #   Site Name | Video Title
1453                 #   Video Title - Tagline | Site Name
1454                 # and so on and so forth; it's just not practical
1455                 mobj = re.search(r'<title>(.*)</title>', webpage)
1456                 if mobj is None:
1457                         self._downloader.trouble(u'ERROR: unable to extract title')
1458                         return
1459                 video_title = mobj.group(1).decode('utf-8')
1460                 video_title = sanitize_title(video_title)
1461                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1462
1463                 # video uploader is domain name
1464                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1465                 if mobj is None:
1466                         self._downloader.trouble(u'ERROR: unable to extract title')
1467                         return
1468                 video_uploader = mobj.group(1).decode('utf-8')
1469
1470                 try:
1471                         # Process video information
1472                         self._downloader.process_info({
1473                                 'id':           video_id.decode('utf-8'),
1474                                 'url':          video_url.decode('utf-8'),
1475                                 'uploader':     video_uploader,
1476                                 'title':        video_title,
1477                                 'stitle':       simple_title,
1478                                 'ext':          video_extension.decode('utf-8'),
1479                                 'format':       u'NA',
1480                         })
1481                 except UnavailableFormatError:
1482                         self._downloader.trouble(u'ERROR: format not available for video')
1483
1484
1485 class YoutubeSearchIE(InfoExtractor):
1486         """Information Extractor for YouTube search queries."""
1487         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1488         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1489         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1490         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1491         _youtube_ie = None
1492         _max_youtube_results = 1000
1493
1494         def __init__(self, youtube_ie, downloader=None):
1495                 InfoExtractor.__init__(self, downloader)
1496                 self._youtube_ie = youtube_ie
1497
1498         @staticmethod
1499         def suitable(url):
1500                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1501
1502         def report_download_page(self, query, pagenum):
1503                 """Report attempt to download playlist page with given number."""
1504                 query = query.decode(preferredencoding())
1505                 self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1506
1507         def _real_initialize(self):
1508                 self._youtube_ie.initialize()
1509
1510         def _real_extract(self, query):
1511                 mobj = re.match(self._VALID_QUERY, query)
1512                 if mobj is None:
1513                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1514                         return
1515
1516                 prefix, query = query.split(':')
1517                 prefix = prefix[8:]
1518                 query  = query.encode('utf-8')
1519                 if prefix == '':
1520                         self._download_n_results(query, 1)
1521                         return
1522                 elif prefix == 'all':
1523                         self._download_n_results(query, self._max_youtube_results)
1524                         return
1525                 else:
1526                         try:
1527                                 n = long(prefix)
1528                                 if n <= 0:
1529                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1530                                         return
1531                                 elif n > self._max_youtube_results:
1532                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
1533                                         n = self._max_youtube_results
1534                                 self._download_n_results(query, n)
1535                                 return
1536                         except ValueError: # parsing prefix as integer fails
1537                                 self._download_n_results(query, 1)
1538                                 return
1539
1540         def _download_n_results(self, query, n):
1541                 """Downloads a specified number of results for a query"""
1542
1543                 video_ids = []
1544                 already_seen = set()
1545                 pagenum = 1
1546
1547                 while True:
1548                         self.report_download_page(query, pagenum)
1549                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1550                         request = urllib2.Request(result_url, None, std_headers)
1551                         try:
1552                                 page = urllib2.urlopen(request).read()
1553                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1554                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1555                                 return
1556
1557                         # Extract video identifiers
1558                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1559                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1560                                 if video_id not in already_seen:
1561                                         video_ids.append(video_id)
1562                                         already_seen.add(video_id)
1563                                         if len(video_ids) == n:
1564                                                 # Specified n videos reached
1565                                                 for id in video_ids:
1566                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1567                                                 return
1568
1569                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1570                                 for id in video_ids:
1571                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1572                                 return
1573
1574                         pagenum = pagenum + 1
1575
1576 class GoogleSearchIE(InfoExtractor):
1577         """Information Extractor for Google Video search queries."""
1578         _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
1579         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1580         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
1581         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
1582         _google_ie = None
1583         _max_google_results = 1000
1584
1585         def __init__(self, google_ie, downloader=None):
1586                 InfoExtractor.__init__(self, downloader)
1587                 self._google_ie = google_ie
1588
1589         @staticmethod
1590         def suitable(url):
1591                 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
1592
1593         def report_download_page(self, query, pagenum):
1594                 """Report attempt to download playlist page with given number."""
1595                 query = query.decode(preferredencoding())
1596                 self._downloader.to_stdout(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1597
1598         def _real_initialize(self):
1599                 self._google_ie.initialize()
1600
1601         def _real_extract(self, query):
1602                 mobj = re.match(self._VALID_QUERY, query)
1603                 if mobj is None:
1604                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1605                         return
1606
1607                 prefix, query = query.split(':')
1608                 prefix = prefix[8:]
1609                 query  = query.encode('utf-8')
1610                 if prefix == '':
1611                         self._download_n_results(query, 1)
1612                         return
1613                 elif prefix == 'all':
1614                         self._download_n_results(query, self._max_google_results)
1615                         return
1616                 else:
1617                         try:
1618                                 n = long(prefix)
1619                                 if n <= 0:
1620                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1621                                         return
1622                                 elif n > self._max_google_results:
1623                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)'  % (self._max_google_results, n))
1624                                         n = self._max_google_results
1625                                 self._download_n_results(query, n)
1626                                 return
1627                         except ValueError: # parsing prefix as integer fails
1628                                 self._download_n_results(query, 1)
1629                                 return
1630
1631         def _download_n_results(self, query, n):
1632                 """Downloads a specified number of results for a query"""
1633
1634                 video_ids = []
1635                 already_seen = set()
1636                 pagenum = 1
1637
1638                 while True:
1639                         self.report_download_page(query, pagenum)
1640                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1641                         request = urllib2.Request(result_url, None, std_headers)
1642                         try:
1643                                 page = urllib2.urlopen(request).read()
1644                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1645                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1646                                 return
1647
1648                         # Extract video identifiers
1649                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1650                                 video_id = mobj.group(1)
1651                                 if video_id not in already_seen:
1652                                         video_ids.append(video_id)
1653                                         already_seen.add(video_id)
1654                                         if len(video_ids) == n:
1655                                                 # Specified n videos reached
1656                                                 for id in video_ids:
1657                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1658                                                 return
1659
1660                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1661                                 for id in video_ids:
1662                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1663                                 return
1664
1665                         pagenum = pagenum + 1
1666
1667 class YahooSearchIE(InfoExtractor):
1668         """Information Extractor for Yahoo! Video search queries."""
1669         _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
1670         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1671         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1672         _MORE_PAGES_INDICATOR = r'\s*Next'
1673         _yahoo_ie = None
1674         _max_yahoo_results = 1000
1675
1676         def __init__(self, yahoo_ie, downloader=None):
1677                 InfoExtractor.__init__(self, downloader)
1678                 self._yahoo_ie = yahoo_ie
1679
1680         @staticmethod
1681         def suitable(url):
1682                 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
1683
1684         def report_download_page(self, query, pagenum):
1685                 """Report attempt to download playlist page with given number."""
1686                 query = query.decode(preferredencoding())
1687                 self._downloader.to_stdout(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1688
1689         def _real_initialize(self):
1690                 self._yahoo_ie.initialize()
1691
1692         def _real_extract(self, query):
1693                 mobj = re.match(self._VALID_QUERY, query)
1694                 if mobj is None:
1695                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1696                         return
1697
1698                 prefix, query = query.split(':')
1699                 prefix = prefix[8:]
1700                 query  = query.encode('utf-8')
1701                 if prefix == '':
1702                         self._download_n_results(query, 1)
1703                         return
1704                 elif prefix == 'all':
1705                         self._download_n_results(query, self._max_yahoo_results)
1706                         return
1707                 else:
1708                         try:
1709                                 n = long(prefix)
1710                                 if n <= 0:
1711                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1712                                         return
1713                                 elif n > self._max_yahoo_results:
1714                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)'  % (self._max_yahoo_results, n))
1715                                         n = self._max_yahoo_results
1716                                 self._download_n_results(query, n)
1717                                 return
1718                         except ValueError: # parsing prefix as integer fails
1719                                 self._download_n_results(query, 1)
1720                                 return
1721
1722         def _download_n_results(self, query, n):
1723                 """Downloads a specified number of results for a query"""
1724
1725                 video_ids = []
1726                 already_seen = set()
1727                 pagenum = 1
1728
1729                 while True:
1730                         self.report_download_page(query, pagenum)
1731                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1732                         request = urllib2.Request(result_url, None, std_headers)
1733                         try:
1734                                 page = urllib2.urlopen(request).read()
1735                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1736                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1737                                 return
1738
1739                         # Extract video identifiers
1740                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1741                                 video_id = mobj.group(1)
1742                                 if video_id not in already_seen:
1743                                         video_ids.append(video_id)
1744                                         already_seen.add(video_id)
1745                                         if len(video_ids) == n:
1746                                                 # Specified n videos reached
1747                                                 for id in video_ids:
1748                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1749                                                 return
1750
1751                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1752                                 for id in video_ids:
1753                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1754                                 return
1755
1756                         pagenum = pagenum + 1
1757
1758 class YoutubePlaylistIE(InfoExtractor):
1759         """Information Extractor for YouTube playlists."""
1760
1761         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/)([^&]+).*'
1762         _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
1763         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
1764         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1765         _youtube_ie = None
1766
1767         def __init__(self, youtube_ie, downloader=None):
1768                 InfoExtractor.__init__(self, downloader)
1769                 self._youtube_ie = youtube_ie
1770
1771         @staticmethod
1772         def suitable(url):
1773                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
1774
1775         def report_download_page(self, playlist_id, pagenum):
1776                 """Report attempt to download playlist page with given number."""
1777                 self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1778
1779         def _real_initialize(self):
1780                 self._youtube_ie.initialize()
1781
1782         def _real_extract(self, url):
1783                 # Extract playlist id
1784                 mobj = re.match(self._VALID_URL, url)
1785                 if mobj is None:
1786                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1787                         return
1788
1789                 # Download playlist pages
1790                 playlist_id = mobj.group(1)
1791                 video_ids = []
1792                 pagenum = 1
1793
1794                 while True:
1795                         self.report_download_page(playlist_id, pagenum)
1796                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
1797                         try:
1798                                 page = urllib2.urlopen(request).read()
1799                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1800                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1801                                 return
1802
1803                         # Extract video identifiers
1804                         ids_in_page = []
1805                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1806                                 if mobj.group(1) not in ids_in_page:
1807                                         ids_in_page.append(mobj.group(1))
1808                         video_ids.extend(ids_in_page)
1809
1810                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1811                                 break
1812                         pagenum = pagenum + 1
1813
1814                 for id in video_ids:
1815                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1816                 return
1817
1818 class YoutubeUserIE(InfoExtractor):
1819         """Information Extractor for YouTube users."""
1820
1821         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
1822         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1823         _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
1824         _youtube_ie = None
1825
1826         def __init__(self, youtube_ie, downloader=None):
1827                 InfoExtractor.__init__(self, downloader)
1828                 self._youtube_ie = youtube_ie
1829
1830         @staticmethod
1831         def suitable(url):
1832                 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
1833
1834         def report_download_page(self, username):
1835                 """Report attempt to download user page."""
1836                 self._downloader.to_stdout(u'[youtube] user %s: Downloading page ' % (username))
1837
1838         def _real_initialize(self):
1839                 self._youtube_ie.initialize()
1840
1841         def _real_extract(self, url):
1842                 # Extract username
1843                 mobj = re.match(self._VALID_URL, url)
1844                 if mobj is None:
1845                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1846                         return
1847
1848                 # Download user page
1849                 username = mobj.group(1)
1850                 video_ids = []
1851                 pagenum = 1
1852
1853                 self.report_download_page(username)
1854                 request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
1855                 try:
1856                         page = urllib2.urlopen(request).read()
1857                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1858                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1859                         return
1860
1861                 # Extract video identifiers
1862                 ids_in_page = []
1863
1864                 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1865                         if mobj.group(1) not in ids_in_page:
1866                                 ids_in_page.append(mobj.group(1))
1867                 video_ids.extend(ids_in_page)
1868
1869                 for id in video_ids:
1870                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1871                 return
1872
1873 class PostProcessor(object):
1874         """Post Processor class.
1875
1876         PostProcessor objects can be added to downloaders with their
1877         add_post_processor() method. When the downloader has finished a
1878         successful download, it will take its internal chain of PostProcessors
1879         and start calling the run() method on each one of them, first with
1880         an initial argument and then with the returned value of the previous
1881         PostProcessor.
1882
1883         The chain will be stopped if one of them ever returns None or the end
1884         of the chain is reached.
1885
1886         PostProcessor objects follow a "mutual registration" process similar
1887         to InfoExtractor objects.
1888         """
1889
1890         _downloader = None
1891
1892         def __init__(self, downloader=None):
1893                 self._downloader = downloader
1894
1895         def set_downloader(self, downloader):
1896                 """Sets the downloader for this PP."""
1897                 self._downloader = downloader
1898
1899         def run(self, information):
1900                 """Run the PostProcessor.
1901
1902                 The "information" argument is a dictionary like the ones
1903                 composed by InfoExtractors. The only difference is that this
1904                 one has an extra field called "filepath" that points to the
1905                 downloaded file.
1906
1907                 When this method returns None, the postprocessing chain is
1908                 stopped. However, this method may return an information
1909                 dictionary that will be passed to the next postprocessing
1910                 object in the chain. It can be the one it received after
1911                 changing some fields.
1912
1913                 In addition, this method may raise a PostProcessingError
1914                 exception that will be taken into account by the downloader
1915                 it was called from.
1916                 """
1917                 return information # by default, do nothing
1918
1919 ### MAIN PROGRAM ###
1920 if __name__ == '__main__':
1921         try:
1922                 # Modules needed only when running the main program
1923                 import getpass
1924                 import optparse
1925
1926                 # Function to update the program file with the latest version from bitbucket.org
1927                 def update_self(downloader, filename):
1928                         # Note: downloader only used for options
1929                         if not os.access (filename, os.W_OK):
1930                                 sys.exit('ERROR: no write permissions on %s' % filename)
1931
1932                         downloader.to_stdout('Updating to latest stable version...')
1933                         latest_url = 'http://bitbucket.org/rg3/youtube-dl/raw/tip/LATEST_VERSION'
1934                         latest_version = urllib.urlopen(latest_url).read().strip()
1935                         prog_url = 'http://bitbucket.org/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
1936                         newcontent = urllib.urlopen(prog_url).read()
1937                         stream = open(filename, 'w')
1938                         stream.write(newcontent)
1939                         stream.close()
1940                         downloader.to_stdout('Updated to version %s' % latest_version)
1941
1942                 # General configuration
1943                 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
1944                 urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
1945                 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
1946
1947                 # Parse command line
1948                 parser = optparse.OptionParser(
1949                         usage='Usage: %prog [options] url...',
1950                         version='2010.04.04',
1951                         conflict_handler='resolve',
1952                 )
1953
1954                 parser.add_option('-h', '--help',
1955                                 action='help', help='print this help text and exit')
1956                 parser.add_option('-v', '--version',
1957                                 action='version', help='print program version and exit')
1958                 parser.add_option('-U', '--update',
1959                                 action='store_true', dest='update_self', help='update this program to latest stable version')
1960                 parser.add_option('-i', '--ignore-errors',
1961                                 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
1962                 parser.add_option('-r', '--rate-limit',
1963                                 dest='ratelimit', metavar='L', help='download rate limit (e.g. 50k or 44.6m)')
1964
1965                 authentication = optparse.OptionGroup(parser, 'Authentication Options')
1966                 authentication.add_option('-u', '--username',
1967                                 dest='username', metavar='UN', help='account username')
1968                 authentication.add_option('-p', '--password',
1969                                 dest='password', metavar='PW', help='account password')
1970                 authentication.add_option('-n', '--netrc',
1971                                 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
1972                 parser.add_option_group(authentication)
1973
1974                 video_format = optparse.OptionGroup(parser, 'Video Format Options')
1975                 video_format.add_option('-f', '--format',
1976                                 action='store', dest='format', metavar='FMT', help='video format code')
1977                 video_format.add_option('-b', '--best-quality',
1978                                 action='store_const', dest='format', help='download the best quality video possible', const='0')
1979                 video_format.add_option('-m', '--mobile-version',
1980                                 action='store_const', dest='format', help='alias for -f 17', const='17')
1981                 video_format.add_option('-d', '--high-def',
1982                                 action='store_const', dest='format', help='alias for -f 22', const='22')
1983                 video_format.add_option('--all-formats',
1984                                 action='store_const', dest='format', help='download all available video formats', const='-1')
1985                 parser.add_option_group(video_format)
1986
1987                 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
1988                 verbosity.add_option('-q', '--quiet',
1989                                 action='store_true', dest='quiet', help='activates quiet mode', default=False)
1990                 verbosity.add_option('-s', '--simulate',
1991                                 action='store_true', dest='simulate', help='do not download video', default=False)
1992                 verbosity.add_option('-g', '--get-url',
1993                                 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
1994                 verbosity.add_option('-e', '--get-title',
1995                                 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
1996                 verbosity.add_option('--get-thumbnail',
1997                                 action='store_true', dest='getthumbnail', help='simulate, quiet but print thumbnail URL', default=False)
1998                 verbosity.add_option('--get-description',
1999                                 action='store_true', dest='getdescription', help='simulate, quiet but print video description', default=False)
2000                 verbosity.add_option('--no-progress',
2001                                 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
2002                 parser.add_option_group(verbosity)
2003
2004                 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
2005                 filesystem.add_option('-t', '--title',
2006                                 action='store_true', dest='usetitle', help='use title in file name', default=False)
2007                 filesystem.add_option('-l', '--literal',
2008                                 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
2009                 filesystem.add_option('-o', '--output',
2010                                 dest='outtmpl', metavar='TPL', help='output filename template')
2011                 filesystem.add_option('-a', '--batch-file',
2012                                 dest='batchfile', metavar='F', help='file containing URLs to download')
2013                 filesystem.add_option('-w', '--no-overwrites',
2014                                 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
2015                 filesystem.add_option('-c', '--continue',
2016                                 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
2017                 parser.add_option_group(filesystem)
2018
2019                 (opts, args) = parser.parse_args()
2020
2021                 # Batch file verification
2022                 batchurls = []
2023                 if opts.batchfile is not None:
2024                         try:
2025                                 batchurls = open(opts.batchfile, 'r').readlines()
2026                                 batchurls = [x.strip() for x in batchurls]
2027                                 batchurls = [x for x in batchurls if len(x) > 0]
2028                         except IOError:
2029                                 sys.exit(u'ERROR: batch file could not be read')
2030                 all_urls = batchurls + args
2031
2032                 # Conflicting, missing and erroneous options
2033                 if opts.usenetrc and (opts.username is not None or opts.password is not None):
2034                         parser.error(u'using .netrc conflicts with giving username/password')
2035                 if opts.password is not None and opts.username is None:
2036                         parser.error(u'account username missing')
2037                 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
2038                         parser.error(u'using output template conflicts with using title or literal title')
2039                 if opts.usetitle and opts.useliteral:
2040                         parser.error(u'using title conflicts with using literal title')
2041                 if opts.username is not None and opts.password is None:
2042                         opts.password = getpass.getpass(u'Type account password and press return:')
2043                 if opts.ratelimit is not None:
2044                         numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
2045                         if numeric_limit is None:
2046                                 parser.error(u'invalid rate limit specified')
2047                         opts.ratelimit = numeric_limit
2048
2049                 # Information extractors
2050                 youtube_ie = YoutubeIE()
2051                 metacafe_ie = MetacafeIE(youtube_ie)
2052                 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
2053                 youtube_user_ie = YoutubeUserIE(youtube_ie)
2054                 youtube_search_ie = YoutubeSearchIE(youtube_ie)
2055                 google_ie = GoogleIE()
2056                 google_search_ie = GoogleSearchIE(google_ie)
2057                 photobucket_ie = PhotobucketIE()
2058                 yahoo_ie = YahooIE()
2059                 yahoo_search_ie = YahooSearchIE(yahoo_ie)
2060                 generic_ie = GenericIE()
2061
2062                 # File downloader
2063                 fd = FileDownloader({
2064                         'usenetrc': opts.usenetrc,
2065                         'username': opts.username,
2066                         'password': opts.password,
2067                         'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
2068                         'forceurl': opts.geturl,
2069                         'forcetitle': opts.gettitle,
2070                         'forcethumbnail': opts.getthumbnail,
2071                         'forcedescription': opts.getdescription,
2072                         'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
2073                         'format': opts.format,
2074                         'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
2075                                 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
2076                                 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
2077                                 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
2078                                 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
2079                                 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
2080                                 or u'%(id)s.%(ext)s'),
2081                         'ignoreerrors': opts.ignoreerrors,
2082                         'ratelimit': opts.ratelimit,
2083                         'nooverwrites': opts.nooverwrites,
2084                         'continuedl': opts.continue_dl,
2085                         'noprogress': opts.noprogress,
2086                         })
2087                 fd.add_info_extractor(youtube_search_ie)
2088                 fd.add_info_extractor(youtube_pl_ie)
2089                 fd.add_info_extractor(youtube_user_ie)
2090                 fd.add_info_extractor(metacafe_ie)
2091                 fd.add_info_extractor(youtube_ie)
2092                 fd.add_info_extractor(google_ie)
2093                 fd.add_info_extractor(google_search_ie)
2094                 fd.add_info_extractor(photobucket_ie)
2095                 fd.add_info_extractor(yahoo_ie)
2096                 fd.add_info_extractor(yahoo_search_ie)
2097
2098                 # This must come last since it's the
2099                 # fallback if none of the others work
2100                 fd.add_info_extractor(generic_ie)
2101
2102                 # Update version
2103                 if opts.update_self:
2104                         update_self(fd, sys.argv[0])
2105
2106                 # Maybe do nothing
2107                 if len(all_urls) < 1:
2108                         if not opts.update_self:
2109                                 parser.error(u'you must provide at least one URL')
2110                         else:
2111                                 sys.exit()
2112                 retcode = fd.download(all_urls)
2113                 sys.exit(retcode)
2114
2115         except DownloadError:
2116                 sys.exit(1)
2117         except SameFileError:
2118                 sys.exit(u'ERROR: fixed output name but more than one file to download')
2119         except KeyboardInterrupt:
2120                 sys.exit(u'\nERROR: Interrupted by user')