# Source: git.bitcoin.ninja git blob view of youtube-dl
# Commit: "Add player signature verification to rtmpdump support (fixes issue #63)"
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # License: Public domain code
7 import htmlentitydefs
8 import httplib
9 import locale
10 import math
11 import netrc
12 import os
13 import os.path
14 import re
15 import socket
16 import string
17 import subprocess
18 import sys
19 import time
20 import urllib
21 import urllib2
22
23 # parse_qs was moved from the cgi module to the urlparse module recently.
24 try:
25         from urlparse import parse_qs
26 except ImportError:
27         from cgi import parse_qs
28
# HTTP headers sent with every request; the User-Agent imitates a regular
# desktop Firefox browser so servers return their ordinary web pages.
std_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2) Gecko/20100115 Firefox/3.6',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
        'Accept-Language': 'en-us,en;q=0.5',
}

# Characters kept verbatim when building "simplified" titles: ASCII letters
# and digits, decoded to unicode strings.
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
37
def preferredencoding():
        """Get preferred encoding.

        Returns the best encoding scheme for the system, based on
        locale.getpreferredencoding() and some further tweaks. Falls back
        to UTF-8 when the locale reports an unknown or unusable encoding.
        """
        try:
                pref = locale.getpreferredencoding()
                # Make sure the reported encoding actually exists and works;
                # some systems are misconfigured.
                u'TEST'.encode(pref)
        except Exception:
                # Was a bare except, which would also have swallowed
                # KeyboardInterrupt/SystemExit; Exception keeps the same
                # fallback behaviour for real errors only.
                pref = 'UTF-8'
        return pref
53
def htmlentity_transform(matchobj):
        """Transforms an HTML entity to a Unicode character.

        This function receives a match object and is intended to be used with
        the re.sub() function.
        """
        entity = matchobj.group(1)

        # Known non-numeric HTML entity
        if entity in htmlentitydefs.name2codepoint:
                return unichr(htmlentitydefs.name2codepoint[entity])

        # Numeric character reference. Hexadecimal references (e.g. &#x3c;)
        # may contain the digits a-f, which the previous pattern x?\d+ did
        # not allow, so they fell through to the literal branch below.
        mobj = re.match(u'(?u)#(x[0-9a-fA-F]+|[0-9]+)', entity)
        if mobj is not None:
                numstr = mobj.group(1)
                if numstr.startswith(u'x'):
                        base = 16
                        # long() accepts the '0x...' prefix form for base 16.
                        numstr = u'0%s' % numstr
                else:
                        base = 10
                return unichr(long(numstr, base))

        # Unknown entity in name, return its literal representation
        return (u'&%s;' % entity)
79
def sanitize_title(utitle):
        """Sanitizes a video title so it could be used as part of a filename."""
        # Decode HTML entities first, then keep path separators out of the name.
        decoded = re.sub(u'(?u)&(.+?);', htmlentity_transform, utitle)
        return decoded.replace(unicode(os.sep), u'%')
84
85 def sanitize_open(filename, open_mode):
86         """Try to open the given filename, and slightly tweak it if this fails.
87
88         Attempts to open the given filename. If this fails, it tries to change
89         the filename slightly, step by step, until it's either able to open it
90         or it fails and raises a final exception, like the standard open()
91         function.
92
93         It returns the tuple (stream, definitive_file_name).
94         """
95         try:
96                 if filename == u'-':
97                         return (sys.stdout, filename)
98                 stream = open(filename, open_mode)
99                 return (stream, filename)
100         except (IOError, OSError), err:
101                 # In case of error, try to remove win32 forbidden chars
102                 filename = re.sub(ur'[<>:"\|\?\*]', u'#', filename)
103
104                 # An exception here should be caught in the caller
105                 stream = open(filename, open_mode)
106                 return (stream, filename)
107
108
class DownloadError(Exception):
        """Download Error exception.
        
        This exception may be thrown by FileDownloader objects if they are not
        configured to continue on errors. They will contain the appropriate
        error message.
        """
        # Raised by FileDownloader.trouble() when 'ignoreerrors' is not set.
        pass
117
class SameFileError(Exception):
        """Same File exception.

        This exception will be thrown by FileDownloader objects if they detect
        multiple files would have to be downloaded to the same file on disk.
        """
        # Raised by FileDownloader.download() when several URLs are given but
        # the output template contains no per-video fields.
        pass
125
class PostProcessingError(Exception):
        """Post Processing exception.

        This exception may be raised by PostProcessor's .run() method to
        indicate an error in the postprocessing task.
        """
        # Caught in FileDownloader.process_info() after a download succeeds.
        pass
133
class UnavailableFormatError(Exception):
        """Unavailable Format exception.

        This exception will be thrown when a video is requested
        in a format that is not available for that video.
        """
        # Also raised by FileDownloader.process_info() when the video URL
        # cannot be verified or the download raises an OS-level error.
        pass
141
class ContentTooShortError(Exception):
        """Content Too Short exception.

        This exception may be raised by FileDownloader objects when a file they
        download is too small for what the server announced first, indicating
        the connection was probably interrupted.
        """
        # Both in bytes
        downloaded = None
        expected = None

        def __init__(self, downloaded, expected):
                # downloaded: bytes actually received; expected: bytes the
                # server announced in its Content-Length header.
                self.downloaded = downloaded
                self.expected = expected
156
157 class FileDownloader(object):
158         """File Downloader class.
159
160         File downloader objects are the ones responsible of downloading the
161         actual video file and writing it to disk if the user has requested
162         it, among some other tasks. In most cases there should be one per
163         program. As, given a video URL, the downloader doesn't know how to
164         extract all the needed information, task that InfoExtractors do, it
165         has to pass the URL to one of them.
166
167         For this, file downloader objects have a method that allows
168         InfoExtractors to be registered in a given order. When it is passed
169         a URL, the file downloader handles it to the first InfoExtractor it
170         finds that reports being able to handle it. The InfoExtractor extracts
171         all the information about the video or videos the URL refers to, and
172         asks the FileDownloader to process the video information, possibly
173         downloading the video.
174
175         File downloaders accept a lot of parameters. In order not to saturate
176         the object constructor with arguments, it receives a dictionary of
177         options instead. These options are available through the params
178         attribute for the InfoExtractors to use. The FileDownloader also
179         registers itself as the downloader in charge for the InfoExtractors
180         that are added to it, so this is a "mutual registration".
181
182         Available options:
183
184         username:       Username for authentication purposes.
185         password:       Password for authentication purposes.
186         usenetrc:       Use netrc for authentication instead.
187         quiet:          Do not print messages to stdout.
188         forceurl:       Force printing final URL.
189         forcetitle:     Force printing title.
190         simulate:       Do not download the video files.
191         format:         Video format code.
192         outtmpl:        Template for output names.
193         ignoreerrors:   Do not stop on download errors.
194         ratelimit:      Download speed limit, in bytes/sec.
195         nooverwrites:   Prevent overwriting files.
196         continuedl:     Try to continue downloads if possible.
197         noprogress:     Do not print the progress bar.
198         """
199
200         params = None
201         _ies = []
202         _pps = []
203         _download_retcode = None
204         _num_downloads = None
205
206         def __init__(self, params):
207                 """Create a FileDownloader object with the given options."""
208                 self._ies = []
209                 self._pps = []
210                 self._download_retcode = 0
211                 self._num_downloads = 0
212                 self.params = params
213         
214         @staticmethod
215         def pmkdir(filename):
216                 """Create directory components in filename. Similar to Unix "mkdir -p"."""
217                 components = filename.split(os.sep)
218                 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
219                 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
220                 for dir in aggregate:
221                         if not os.path.exists(dir):
222                                 os.mkdir(dir)
223         
224         @staticmethod
225         def format_bytes(bytes):
226                 if bytes is None:
227                         return 'N/A'
228                 if type(bytes) is str:
229                         bytes = float(bytes)
230                 if bytes == 0.0:
231                         exponent = 0
232                 else:
233                         exponent = long(math.log(bytes, 1024.0))
234                 suffix = 'bkMGTPEZY'[exponent]
235                 converted = float(bytes) / float(1024**exponent)
236                 return '%.2f%s' % (converted, suffix)
237
238         @staticmethod
239         def calc_percent(byte_counter, data_len):
240                 if data_len is None:
241                         return '---.-%'
242                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
243
244         @staticmethod
245         def calc_eta(start, now, total, current):
246                 if total is None:
247                         return '--:--'
248                 dif = now - start
249                 if current == 0 or dif < 0.001: # One millisecond
250                         return '--:--'
251                 rate = float(current) / dif
252                 eta = long((float(total) - float(current)) / rate)
253                 (eta_mins, eta_secs) = divmod(eta, 60)
254                 if eta_mins > 99:
255                         return '--:--'
256                 return '%02d:%02d' % (eta_mins, eta_secs)
257
258         @staticmethod
259         def calc_speed(start, now, bytes):
260                 dif = now - start
261                 if bytes == 0 or dif < 0.001: # One millisecond
262                         return '%10s' % '---b/s'
263                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
264
265         @staticmethod
266         def best_block_size(elapsed_time, bytes):
267                 new_min = max(bytes / 2.0, 1.0)
268                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
269                 if elapsed_time < 0.001:
270                         return long(new_max)
271                 rate = bytes / elapsed_time
272                 if rate > new_max:
273                         return long(new_max)
274                 if rate < new_min:
275                         return long(new_min)
276                 return long(rate)
277
278         @staticmethod
279         def parse_bytes(bytestr):
280                 """Parse a string indicating a byte quantity into a long integer."""
281                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
282                 if matchobj is None:
283                         return None
284                 number = float(matchobj.group(1))
285                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
286                 return long(round(number * multiplier))
287
288         @staticmethod
289         def verify_url(url):
290                 """Verify a URL is valid and data could be downloaded. Return real data URL."""
291                 request = urllib2.Request(url, None, std_headers)
292                 data = urllib2.urlopen(request)
293                 data.read(1)
294                 url = data.geturl()
295                 data.close()
296                 return url
297
298         def add_info_extractor(self, ie):
299                 """Add an InfoExtractor object to the end of the list."""
300                 self._ies.append(ie)
301                 ie.set_downloader(self)
302         
303         def add_post_processor(self, pp):
304                 """Add a PostProcessor object to the end of the chain."""
305                 self._pps.append(pp)
306                 pp.set_downloader(self)
307         
308         def to_stdout(self, message, skip_eol=False, ignore_encoding_errors=False):
309                 """Print message to stdout if not in quiet mode."""
310                 try:
311                         if not self.params.get('quiet', False):
312                                 print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(preferredencoding()),
313                         sys.stdout.flush()
314                 except (UnicodeEncodeError), err:
315                         if not ignore_encoding_errors:
316                                 raise
317         
318         def to_stderr(self, message):
319                 """Print message to stderr."""
320                 print >>sys.stderr, message.encode(preferredencoding())
321         
322         def fixed_template(self):
323                 """Checks if the output template is fixed."""
324                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
325
326         def trouble(self, message=None):
327                 """Determine action to take when a download problem appears.
328
329                 Depending on if the downloader has been configured to ignore
330                 download errors or not, this method may throw an exception or
331                 not when errors are found, after printing the message.
332                 """
333                 if message is not None:
334                         self.to_stderr(message)
335                 if not self.params.get('ignoreerrors', False):
336                         raise DownloadError(message)
337                 self._download_retcode = 1
338
339         def slow_down(self, start_time, byte_counter):
340                 """Sleep if the download speed is over the rate limit."""
341                 rate_limit = self.params.get('ratelimit', None)
342                 if rate_limit is None or byte_counter == 0:
343                         return
344                 now = time.time()
345                 elapsed = now - start_time
346                 if elapsed <= 0.0:
347                         return
348                 speed = float(byte_counter) / elapsed
349                 if speed > rate_limit:
350                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
351
352         def report_destination(self, filename):
353                 """Report destination filename."""
354                 self.to_stdout(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
355         
356         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
357                 """Report download progress."""
358                 if self.params.get('noprogress', False):
359                         return
360                 self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
361                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
362
363         def report_resuming_byte(self, resume_len):
364                 """Report attemtp to resume at given byte."""
365                 self.to_stdout(u'[download] Resuming download at byte %s' % resume_len)
366         
367         def report_file_already_downloaded(self, file_name):
368                 """Report file has already been fully downloaded."""
369                 try:
370                         self.to_stdout(u'[download] %s has already been downloaded' % file_name)
371                 except (UnicodeEncodeError), err:
372                         self.to_stdout(u'[download] The file has already been downloaded')
373         
374         def report_unable_to_resume(self):
375                 """Report it was impossible to resume download."""
376                 self.to_stdout(u'[download] Unable to resume')
377         
378         def report_finish(self):
379                 """Report download finished."""
380                 if self.params.get('noprogress', False):
381                         self.to_stdout(u'[download] Download completed')
382                 else:
383                         self.to_stdout(u'')
384
385         def process_info(self, info_dict):
386                 """Process a single dictionary returned by an InfoExtractor."""
387                 # Do nothing else if in simulate mode
388                 if self.params.get('simulate', False):
389                         # Verify URL if it's an HTTP one
390                         if info_dict['url'].startswith('http'):
391                                 try:
392                                         self.verify_url(info_dict['url'].encode('utf-8')).decode('utf-8')
393                                 except (OSError, IOError, urllib2.URLError, httplib.HTTPException, socket.error), err:
394                                         raise UnavailableFormatError
395
396                         # Forced printings
397                         if self.params.get('forcetitle', False):
398                                 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
399                         if self.params.get('forceurl', False):
400                                 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
401                         if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
402                                 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
403                         if self.params.get('forcedescription', False) and 'description' in info_dict:
404                                 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
405
406                         return
407                         
408                 try:
409                         template_dict = dict(info_dict)
410                         template_dict['epoch'] = unicode(long(time.time()))
411                         template_dict['ord'] = unicode('%05d' % self._num_downloads)
412                         filename = self.params['outtmpl'] % template_dict
413                 except (ValueError, KeyError), err:
414                         self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
415                 if self.params.get('nooverwrites', False) and os.path.exists(filename):
416                         self.to_stderr(u'WARNING: file exists: %s; skipping' % filename)
417                         return
418
419                 try:
420                         self.pmkdir(filename)
421                 except (OSError, IOError), err:
422                         self.trouble('ERROR: unable to create directories: %s' % str(err))
423                         return
424
425                 try:
426                         success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
427                 except (OSError, IOError), err:
428                         raise UnavailableFormatError
429                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
430                         self.trouble('ERROR: unable to download video data: %s' % str(err))
431                         return
432                 except (ContentTooShortError, ), err:
433                         self.trouble('ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
434                         return
435
436                 if success:
437                         try:
438                                 self.post_process(filename, info_dict)
439                         except (PostProcessingError), err:
440                                 self.trouble('ERROR: postprocessing: %s' % str(err))
441                                 return
442
443         def download(self, url_list):
444                 """Download a given list of URLs."""
445                 if len(url_list) > 1 and self.fixed_template():
446                         raise SameFileError(self.params['outtmpl'])
447
448                 for url in url_list:
449                         suitable_found = False
450                         for ie in self._ies:
451                                 # Go to next InfoExtractor if not suitable
452                                 if not ie.suitable(url):
453                                         continue
454
455                                 # Suitable InfoExtractor found
456                                 suitable_found = True
457
458                                 # Extract information from URL and process it
459                                 ie.extract(url)
460
461                                 # Suitable InfoExtractor had been found; go to next URL
462                                 break
463
464                         if not suitable_found:
465                                 self.trouble('ERROR: no suitable InfoExtractor: %s' % url)
466
467                 return self._download_retcode
468
469         def post_process(self, filename, ie_info):
470                 """Run the postprocessing chain on the given file."""
471                 info = dict(ie_info)
472                 info['filepath'] = filename
473                 for pp in self._pps:
474                         info = pp.run(info)
475                         if info is None:
476                                 break
477         
478         def _download_with_rtmpdump(self, filename, url, player_url):
479                 self.report_destination(filename)
480
481                 # Check for rtmpdump first
482                 try:
483                         subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
484                 except (OSError, IOError):
485                         self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
486                         return False
487
488                 # Download using rtmpdump. rtmpdump returns exit code 2 when
489                 # the connection was interrumpted and resuming appears to be
490                 # possible. This is part of rtmpdump's normal usage, AFAIK.
491                 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', filename]
492                 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
493                 while retval == 2 or retval == 1:
494                         prevsize = os.path.getsize(filename)
495                         self.to_stdout(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
496                         time.sleep(5.0) # This seems to be needed
497                         retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
498                         cursize = os.path.getsize(filename)
499                         if prevsize == cursize and retval == 1:
500                                 break
501                 if retval == 0:
502                         self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename))
503                         return True
504                 else:
505                         self.trouble('\nERROR: rtmpdump exited with code %d' % retval)
506                         return False
507
508         def _do_download(self, filename, url, player_url):
509                 # Attempt to download using rtmpdump
510                 if url.startswith('rtmp'):
511                         return self._download_with_rtmpdump(filename, url, player_url)
512
513                 stream = None
514                 open_mode = 'wb'
515                 basic_request = urllib2.Request(url, None, std_headers)
516                 request = urllib2.Request(url, None, std_headers)
517
518                 # Establish possible resume length
519                 if os.path.isfile(filename):
520                         resume_len = os.path.getsize(filename)
521                 else:
522                         resume_len = 0
523
524                 # Request parameters in case of being able to resume
525                 if self.params.get('continuedl', False) and resume_len != 0:
526                         self.report_resuming_byte(resume_len)
527                         request.add_header('Range','bytes=%d-' % resume_len)
528                         open_mode = 'ab'
529
530                 # Establish connection
531                 try:
532                         data = urllib2.urlopen(request)
533                 except (urllib2.HTTPError, ), err:
534                         if err.code != 416: #  416 is 'Requested range not satisfiable'
535                                 raise
536                         # Unable to resume
537                         data = urllib2.urlopen(basic_request)
538                         content_length = data.info()['Content-Length']
539
540                         if content_length is not None and long(content_length) == resume_len:
541                                 # Because the file had already been fully downloaded
542                                 self.report_file_already_downloaded(filename)
543                                 self._num_downloads += 1
544                                 return True
545                         else:
546                                 # Because the server didn't let us
547                                 self.report_unable_to_resume()
548                                 open_mode = 'wb'
549
550                 data_len = data.info().get('Content-length', None)
551                 data_len_str = self.format_bytes(data_len)
552                 byte_counter = 0
553                 block_size = 1024
554                 start = time.time()
555                 while True:
556                         # Download and write
557                         before = time.time()
558                         data_block = data.read(block_size)
559                         after = time.time()
560                         data_block_len = len(data_block)
561                         if data_block_len == 0:
562                                 break
563                         byte_counter += data_block_len
564
565                         # Open file just in time
566                         if stream is None:
567                                 try:
568                                         (stream, filename) = sanitize_open(filename, open_mode)
569                                         self.report_destination(filename)
570                                         self._num_downloads += 1
571                                 except (OSError, IOError), err:
572                                         self.trouble('ERROR: unable to open for writing: %s' % str(err))
573                                         return False
574                         try:
575                                 stream.write(data_block)
576                         except (IOError, OSError), err:
577                                 self.trouble('\nERROR: unable to write data: %s' % str(err))
578                         block_size = self.best_block_size(after - before, data_block_len)
579
580                         # Progress message
581                         percent_str = self.calc_percent(byte_counter, data_len)
582                         eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
583                         speed_str = self.calc_speed(start, time.time(), byte_counter)
584                         self.report_progress(percent_str, data_len_str, speed_str, eta_str)
585
586                         # Apply rate limit
587                         self.slow_down(start, byte_counter)
588
589                 self.report_finish()
590                 if data_len is not None and str(byte_counter) != data_len:
591                         raise ContentTooShortError(byte_counter, long(data_len))
592                 return True
593
class InfoExtractor(object):
        """Information Extractor class.

        An information extractor takes a URL and pulls out everything needed
        to download the video (or videos) it points at: the real media URL,
        the title, a simplified title, the uploader and so on. The result is
        a dictionary handed over to the FileDownloader, which then processes
        it — possibly downloading the video to the file system, among other
        outcomes. Every dictionary must carry the following fields:

        id:             Video identifier.
        url:            Final video URL.
        uploader:       Nickname of the video uploader.
        title:          Literal title.
        stitle:         Simplified title.
        ext:            Video filename extension.
        format:         Video format.
        player_url:     SWF Player URL (may be None).

        The following fields are optional. Their primary purpose is to allow
        youtube-dl to serve as the backend for a video search function, such
        as the one in youtube2mp3.  They are only used when their respective
        forced printing functions are called:

        thumbnail:      Full URL to a video thumbnail image.
        description:    One-line video description.

        Subclasses should redefine _real_initialize(), _real_extract() and
        the static suitable() method, and will usually be instantiated and
        registered with the main downloader.
        """

        # True once _real_initialize() has run for this instance.
        _ready = False
        # FileDownloader in charge of this extractor (see set_downloader).
        _downloader = None

        def __init__(self, downloader=None):
                """Constructor. Receives an optional downloader."""
                self._ready = False
                self.set_downloader(downloader)

        @staticmethod
        def suitable(url):
                """Receives a URL and returns True if suitable for this IE."""
                return False

        def initialize(self):
                """Initializes an instance (authentication, etc), at most once."""
                if self._ready:
                        return
                self._real_initialize()
                self._ready = True

        def extract(self, url):
                """Extracts URL information and returns it in list of dicts."""
                self.initialize()
                return self._real_extract(url)

        def set_downloader(self, downloader):
                """Sets the downloader for this IE."""
                self._downloader = downloader

        def _real_initialize(self):
                """Real initialization process. Redefine in subclasses."""
                pass

        def _real_extract(self, url):
                """Real extraction process. Redefine in subclasses."""
                pass
664
665 class YoutubeIE(InfoExtractor):
666         """Information extractor for youtube.com."""
667
668         _VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?[\?#](?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
669         _LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
670         _LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
671         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
672         _NETRC_MACHINE = 'youtube'
673         _available_formats = ['37', '22', '35', '18', '34', '5', '17', '13', None] # listed in order of priority for -b flag
674         _video_extensions = {
675                 '13': '3gp',
676                 '17': 'mp4',
677                 '18': 'mp4',
678                 '22': 'mp4',
679                 '37': 'mp4',
680         }
681
682         @staticmethod
683         def suitable(url):
684                 return (re.match(YoutubeIE._VALID_URL, url) is not None)
685
686         def report_lang(self):
687                 """Report attempt to set language."""
688                 self._downloader.to_stdout(u'[youtube] Setting language')
689
690         def report_login(self):
691                 """Report attempt to log in."""
692                 self._downloader.to_stdout(u'[youtube] Logging in')
693         
694         def report_age_confirmation(self):
695                 """Report attempt to confirm age."""
696                 self._downloader.to_stdout(u'[youtube] Confirming age')
697         
698         def report_video_webpage_download(self, video_id):
699                 """Report attempt to download video webpage."""
700                 self._downloader.to_stdout(u'[youtube] %s: Downloading video webpage' % video_id)
701         
702         def report_video_info_webpage_download(self, video_id):
703                 """Report attempt to download video info webpage."""
704                 self._downloader.to_stdout(u'[youtube] %s: Downloading video info webpage' % video_id)
705         
706         def report_information_extraction(self, video_id):
707                 """Report attempt to extract video information."""
708                 self._downloader.to_stdout(u'[youtube] %s: Extracting video information' % video_id)
709         
710         def report_unavailable_format(self, video_id, format):
711                 """Report extracted video URL."""
712                 self._downloader.to_stdout(u'[youtube] %s: Format %s not available' % (video_id, format))
713         
714         def report_rtmp_download(self):
715                 """Indicate the download will use the RTMP protocol."""
716                 self._downloader.to_stdout(u'[youtube] RTMP download detected')
717         
718         def _real_initialize(self):
719                 if self._downloader is None:
720                         return
721
722                 username = None
723                 password = None
724                 downloader_params = self._downloader.params
725
726                 # Attempt to use provided username and password or .netrc data
727                 if downloader_params.get('username', None) is not None:
728                         username = downloader_params['username']
729                         password = downloader_params['password']
730                 elif downloader_params.get('usenetrc', False):
731                         try:
732                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
733                                 if info is not None:
734                                         username = info[0]
735                                         password = info[2]
736                                 else:
737                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
738                         except (IOError, netrc.NetrcParseError), err:
739                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
740                                 return
741
742                 # Set language
743                 request = urllib2.Request(self._LANG_URL, None, std_headers)
744                 try:
745                         self.report_lang()
746                         urllib2.urlopen(request).read()
747                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
748                         self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
749                         return
750
751                 # No authentication to be performed
752                 if username is None:
753                         return
754
755                 # Log in
756                 login_form = {
757                                 'current_form': 'loginForm',
758                                 'next':         '/',
759                                 'action_login': 'Log In',
760                                 'username':     username,
761                                 'password':     password,
762                                 }
763                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
764                 try:
765                         self.report_login()
766                         login_results = urllib2.urlopen(request).read()
767                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
768                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
769                                 return
770                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
771                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
772                         return
773         
774                 # Confirm age
775                 age_form = {
776                                 'next_url':             '/',
777                                 'action_confirm':       'Confirm',
778                                 }
779                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
780                 try:
781                         self.report_age_confirmation()
782                         age_results = urllib2.urlopen(request).read()
783                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
784                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
785                         return
786
787         def _real_extract(self, url):
788                 # Extract video id from URL
789                 mobj = re.match(self._VALID_URL, url)
790                 if mobj is None:
791                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
792                         return
793                 video_id = mobj.group(2)
794
795                 # Downloader parameters
796                 best_quality = False
797                 all_formats = False
798                 format_param = None
799                 quality_index = 0
800                 if self._downloader is not None:
801                         params = self._downloader.params
802                         format_param = params.get('format', None)
803                         if format_param == '0':
804                                 format_param = self._available_formats[quality_index]
805                                 best_quality = True
806                         elif format_param == '-1':
807                                 format_param = self._available_formats[quality_index]
808                                 all_formats = True
809
810                 while True:
811                         # Extension
812                         video_extension = self._video_extensions.get(format_param, 'flv')
813
814                         # Get video webpage
815                         self.report_video_webpage_download(video_id)
816                         request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en' % video_id, None, std_headers)
817                         try:
818                                 video_webpage = urllib2.urlopen(request).read()
819                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
820                                 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
821                                 return
822
823                         # Attempt to extract SWF player URL
824                         mobj = re.search(r'swfConfig.*"(http://.*?watch-.*?\.swf)"', video_webpage)
825                         if mobj is not None:
826                                 player_url = mobj.group(1)
827                         else:
828                                 player_url = None
829
830                         # Get video info
831                         self.report_video_info_webpage_download(video_id)
832                         for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
833                                 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
834                                                    % (video_id, el_type))
835                                 request = urllib2.Request(video_info_url, None, std_headers)
836                                 try:
837                                         video_info_webpage = urllib2.urlopen(request).read()
838                                         video_info = parse_qs(video_info_webpage)
839                                         if 'token' in video_info:
840                                                 break
841                                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
842                                         self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
843                                         return
844                         self.report_information_extraction(video_id)
845
846                         # "t" param
847                         if 'token' not in video_info:
848                                 # Attempt to see if YouTube has issued an error message
849                                 if 'reason' not in video_info:
850                                         self._downloader.trouble(u'ERROR: unable to extract "t" parameter for unknown reason')
851                                         stream = open('reportme-ydl-%s.dat' % time.time(), 'wb')
852                                         stream.write(video_info_webpage)
853                                         stream.close()
854                                 else:
855                                         reason = urllib.unquote_plus(video_info['reason'][0])
856                                         self._downloader.trouble(u'ERROR: YouTube said: %s' % reason.decode('utf-8'))
857                                 return
858                         token = urllib.unquote_plus(video_info['token'][0])
859                         video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=detailpage&ps=default&gl=US&hl=en' % (video_id, token)
860                         if format_param is not None:
861                                 video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
862
863                         # Check possible RTMP download
864                         if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
865                                 self.report_rtmp_download()
866                                 video_real_url = video_info['conn'][0]
867
868                         # uploader
869                         if 'author' not in video_info:
870                                 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
871                                 return
872                         video_uploader = urllib.unquote_plus(video_info['author'][0])
873
874                         # title
875                         if 'title' not in video_info:
876                                 self._downloader.trouble(u'ERROR: unable to extract video title')
877                                 return
878                         video_title = urllib.unquote_plus(video_info['title'][0])
879                         video_title = video_title.decode('utf-8')
880                         video_title = sanitize_title(video_title)
881
882                         # simplified title
883                         simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
884                         simple_title = simple_title.strip(ur'_')
885
886                         # thumbnail image
887                         if 'thumbnail_url' not in video_info:
888                                 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
889                                 video_thumbnail = ''
890                         else:   # don't panic if we can't find it
891                                 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
892
893                         # description
894                         video_description = 'No description available.'
895                         if self._downloader.params.get('forcedescription', False):
896                                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
897                                 if mobj is not None:
898                                         video_description = mobj.group(1)
899
900                         try:
901                                 # Process video information
902                                 self._downloader.process_info({
903                                         'id':           video_id.decode('utf-8'),
904                                         'url':          video_real_url.decode('utf-8'),
905                                         'uploader':     video_uploader.decode('utf-8'),
906                                         'title':        video_title,
907                                         'stitle':       simple_title,
908                                         'ext':          video_extension.decode('utf-8'),
909                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
910                                         'thumbnail':    video_thumbnail.decode('utf-8'),
911                                         'description':  video_description.decode('utf-8'),
912                                         'player_url':   player_url,
913                                 })
914
915                                 if all_formats:
916                                         if quality_index == len(self._available_formats):
917                                                 # None left to get
918                                                 return
919                                         else:
920                                                 quality_index += 1
921                                                 format_param = self._available_formats[quality_index]
922                                                 continue
923                                 return
924
925                         except UnavailableFormatError, err:
926                                 if best_quality or all_formats:
927                                         if quality_index == len(self._available_formats):
928                                                 # I don't ever expect this to happen
929                                                 if not all_formats:
930                                                         self._downloader.trouble(u'ERROR: no known formats available for video')
931                                                 return
932                                         else:
933                                                 self.report_unavailable_format(video_id, format_param)
934                                                 quality_index += 1
935                                                 format_param = self._available_formats[quality_index]
936                                                 continue
937                                 else: 
938                                         self._downloader.trouble('ERROR: format not available for video')
939                                         return
940
941
942 class MetacafeIE(InfoExtractor):
943         """Information Extractor for metacafe.com."""
944
945         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
946         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
947         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
948         _youtube_ie = None
949
950         def __init__(self, youtube_ie, downloader=None):
951                 InfoExtractor.__init__(self, downloader)
952                 self._youtube_ie = youtube_ie
953
954         @staticmethod
955         def suitable(url):
956                 return (re.match(MetacafeIE._VALID_URL, url) is not None)
957
958         def report_disclaimer(self):
959                 """Report disclaimer retrieval."""
960                 self._downloader.to_stdout(u'[metacafe] Retrieving disclaimer')
961
962         def report_age_confirmation(self):
963                 """Report attempt to confirm age."""
964                 self._downloader.to_stdout(u'[metacafe] Confirming age')
965         
966         def report_download_webpage(self, video_id):
967                 """Report webpage download."""
968                 self._downloader.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)
969         
970         def report_extraction(self, video_id):
971                 """Report information extraction."""
972                 self._downloader.to_stdout(u'[metacafe] %s: Extracting information' % video_id)
973
974         def _real_initialize(self):
975                 # Retrieve disclaimer
976                 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
977                 try:
978                         self.report_disclaimer()
979                         disclaimer = urllib2.urlopen(request).read()
980                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
981                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
982                         return
983
984                 # Confirm age
985                 disclaimer_form = {
986                         'filters': '0',
987                         'submit': "Continue - I'm over 18",
988                         }
989                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
990                 try:
991                         self.report_age_confirmation()
992                         disclaimer = urllib2.urlopen(request).read()
993                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
994                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
995                         return
996         
997         def _real_extract(self, url):
998                 # Extract id and simplified title from URL
999                 mobj = re.match(self._VALID_URL, url)
1000                 if mobj is None:
1001                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1002                         return
1003
1004                 video_id = mobj.group(1)
1005
1006                 # Check if video comes from YouTube
1007                 mobj2 = re.match(r'^yt-(.*)$', video_id)
1008                 if mobj2 is not None:
1009                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1010                         return
1011
1012                 simple_title = mobj.group(2).decode('utf-8')
1013                 video_extension = 'flv'
1014
1015                 # Retrieve video webpage to extract further information
1016                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1017                 try:
1018                         self.report_download_webpage(video_id)
1019                         webpage = urllib2.urlopen(request).read()
1020                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1021                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1022                         return
1023
1024                 # Extract URL, uploader and title from webpage
1025                 self.report_extraction(video_id)
1026                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1027                 if mobj is None:
1028                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1029                         return
1030                 mediaURL = urllib.unquote(mobj.group(1))
1031
1032                 #mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1033                 #if mobj is None:
1034                 #       self._downloader.trouble(u'ERROR: unable to extract gdaKey')
1035                 #       return
1036                 #gdaKey = mobj.group(1)
1037                 #
1038                 #video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1039
1040                 video_url = mediaURL
1041
1042                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1043                 if mobj is None:
1044                         self._downloader.trouble(u'ERROR: unable to extract title')
1045                         return
1046                 video_title = mobj.group(1).decode('utf-8')
1047                 video_title = sanitize_title(video_title)
1048
1049                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1050                 if mobj is None:
1051                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1052                         return
1053                 video_uploader = mobj.group(1)
1054
1055                 try:
1056                         # Process video information
1057                         self._downloader.process_info({
1058                                 'id':           video_id.decode('utf-8'),
1059                                 'url':          video_url.decode('utf-8'),
1060                                 'uploader':     video_uploader.decode('utf-8'),
1061                                 'title':        video_title,
1062                                 'stitle':       simple_title,
1063                                 'ext':          video_extension.decode('utf-8'),
1064                                 'format':       u'NA',
1065                                 'player_url':   None,
1066                         })
1067                 except UnavailableFormatError:
1068                         self._downloader.trouble(u'ERROR: format not available for video')
1069
1070
1071 class GoogleIE(InfoExtractor):
1072         """Information extractor for video.google.com."""
1073
1074         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1075
1076         def __init__(self, downloader=None):
1077                 InfoExtractor.__init__(self, downloader)
1078
1079         @staticmethod
1080         def suitable(url):
1081                 return (re.match(GoogleIE._VALID_URL, url) is not None)
1082
1083         def report_download_webpage(self, video_id):
1084                 """Report webpage download."""
1085                 self._downloader.to_stdout(u'[video.google] %s: Downloading webpage' % video_id)
1086
1087         def report_extraction(self, video_id):
1088                 """Report information extraction."""
1089                 self._downloader.to_stdout(u'[video.google] %s: Extracting information' % video_id)
1090
1091         def _real_initialize(self):
1092                 return
1093
1094         def _real_extract(self, url):
1095                 # Extract id from URL
1096                 mobj = re.match(self._VALID_URL, url)
1097                 if mobj is None:
1098                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1099                         return
1100
1101                 video_id = mobj.group(1)
1102
1103                 video_extension = 'mp4'
1104
1105                 # Retrieve video webpage to extract further information
1106                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1107                 try:
1108                         self.report_download_webpage(video_id)
1109                         webpage = urllib2.urlopen(request).read()
1110                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1111                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1112                         return
1113
1114                 # Extract URL, uploader, and title from webpage
1115                 self.report_extraction(video_id)
1116                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1117                 if mobj is None:
1118                         video_extension = 'flv'
1119                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1120                 if mobj is None:
1121                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1122                         return
1123                 mediaURL = urllib.unquote(mobj.group(1))
1124                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1125                 mediaURL = mediaURL.replace('\\x26', '\x26')
1126
1127                 video_url = mediaURL
1128
1129                 mobj = re.search(r'<title>(.*)</title>', webpage)
1130                 if mobj is None:
1131                         self._downloader.trouble(u'ERROR: unable to extract title')
1132                         return
1133                 video_title = mobj.group(1).decode('utf-8')
1134                 video_title = sanitize_title(video_title)
1135                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1136
1137                 # Extract video description
1138                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1139                 if mobj is None:
1140                         self._downloader.trouble(u'ERROR: unable to extract video description')
1141                         return
1142                 video_description = mobj.group(1).decode('utf-8')
1143                 if not video_description:
1144                         video_description = 'No description available.'
1145
1146                 # Extract video thumbnail
1147                 if self._downloader.params.get('forcethumbnail', False):
1148                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1149                         try:
1150                                 webpage = urllib2.urlopen(request).read()
1151                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1152                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1153                                 return
1154                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1155                         if mobj is None:
1156                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1157                                 return
1158                         video_thumbnail = mobj.group(1)
1159                 else:   # we need something to pass to process_info
1160                         video_thumbnail = ''
1161
1162
1163                 try:
1164                         # Process video information
1165                         self._downloader.process_info({
1166                                 'id':           video_id.decode('utf-8'),
1167                                 'url':          video_url.decode('utf-8'),
1168                                 'uploader':     u'NA',
1169                                 'title':        video_title,
1170                                 'stitle':       simple_title,
1171                                 'ext':          video_extension.decode('utf-8'),
1172                                 'format':       u'NA',
1173                                 'player_url':   None,
1174                         })
1175                 except UnavailableFormatError:
1176                         self._downloader.trouble(u'ERROR: format not available for video')
1177
1178
1179 class PhotobucketIE(InfoExtractor):
1180         """Information extractor for photobucket.com."""
1181
1182         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1183
1184         def __init__(self, downloader=None):
1185                 InfoExtractor.__init__(self, downloader)
1186
1187         @staticmethod
1188         def suitable(url):
1189                 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1190
1191         def report_download_webpage(self, video_id):
1192                 """Report webpage download."""
1193                 self._downloader.to_stdout(u'[photobucket] %s: Downloading webpage' % video_id)
1194
1195         def report_extraction(self, video_id):
1196                 """Report information extraction."""
1197                 self._downloader.to_stdout(u'[photobucket] %s: Extracting information' % video_id)
1198
1199         def _real_initialize(self):
1200                 return
1201
1202         def _real_extract(self, url):
1203                 # Extract id from URL
1204                 mobj = re.match(self._VALID_URL, url)
1205                 if mobj is None:
1206                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1207                         return
1208
1209                 video_id = mobj.group(1)
1210
1211                 video_extension = 'flv'
1212
1213                 # Retrieve video webpage to extract further information
1214                 request = urllib2.Request(url)
1215                 try:
1216                         self.report_download_webpage(video_id)
1217                         webpage = urllib2.urlopen(request).read()
1218                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1219                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1220                         return
1221
1222                 # Extract URL, uploader, and title from webpage
1223                 self.report_extraction(video_id)
1224                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1225                 if mobj is None:
1226                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1227                         return
1228                 mediaURL = urllib.unquote(mobj.group(1))
1229
1230                 video_url = mediaURL
1231
1232                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1233                 if mobj is None:
1234                         self._downloader.trouble(u'ERROR: unable to extract title')
1235                         return
1236                 video_title = mobj.group(1).decode('utf-8')
1237                 video_title = sanitize_title(video_title)
1238                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1239
1240                 video_uploader = mobj.group(2).decode('utf-8')
1241
1242                 try:
1243                         # Process video information
1244                         self._downloader.process_info({
1245                                 'id':           video_id.decode('utf-8'),
1246                                 'url':          video_url.decode('utf-8'),
1247                                 'uploader':     video_uploader,
1248                                 'title':        video_title,
1249                                 'stitle':       simple_title,
1250                                 'ext':          video_extension.decode('utf-8'),
1251                                 'format':       u'NA',
1252                                 'player_url':   None,
1253                         })
1254                 except UnavailableFormatError:
1255                         self._downloader.trouble(u'ERROR: format not available for video')
1256
1257
1258 class YahooIE(InfoExtractor):
1259         """Information extractor for video.yahoo.com."""
1260
1261         # _VALID_URL matches all Yahoo! Video URLs
1262         # _VPAGE_URL matches only the extractable '/watch/' URLs
1263         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1264         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1265
1266         def __init__(self, downloader=None):
1267                 InfoExtractor.__init__(self, downloader)
1268
1269         @staticmethod
1270         def suitable(url):
1271                 return (re.match(YahooIE._VALID_URL, url) is not None)
1272
1273         def report_download_webpage(self, video_id):
1274                 """Report webpage download."""
1275                 self._downloader.to_stdout(u'[video.yahoo] %s: Downloading webpage' % video_id)
1276
1277         def report_extraction(self, video_id):
1278                 """Report information extraction."""
1279                 self._downloader.to_stdout(u'[video.yahoo] %s: Extracting information' % video_id)
1280
1281         def _real_initialize(self):
1282                 return
1283
1284         def _real_extract(self, url):
1285                 # Extract ID from URL
1286                 mobj = re.match(self._VALID_URL, url)
1287                 if mobj is None:
1288                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1289                         return
1290
1291                 video_id = mobj.group(2)
1292                 video_extension = 'flv'
1293
1294                 # Rewrite valid but non-extractable URLs as
1295                 # extractable English language /watch/ URLs
1296                 if re.match(self._VPAGE_URL, url) is None:
1297                         request = urllib2.Request(url)
1298                         try:
1299                                 webpage = urllib2.urlopen(request).read()
1300                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1301                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1302                                 return
1303
1304                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1305                         if mobj is None:
1306                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1307                                 return
1308                         yahoo_id = mobj.group(1)
1309
1310                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1311                         if mobj is None:
1312                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1313                                 return
1314                         yahoo_vid = mobj.group(1)
1315
1316                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1317                         return self._real_extract(url)
1318
1319                 # Retrieve video webpage to extract further information
1320                 request = urllib2.Request(url)
1321                 try:
1322                         self.report_download_webpage(video_id)
1323                         webpage = urllib2.urlopen(request).read()
1324                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1325                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1326                         return
1327
1328                 # Extract uploader and title from webpage
1329                 self.report_extraction(video_id)
1330                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1331                 if mobj is None:
1332                         self._downloader.trouble(u'ERROR: unable to extract video title')
1333                         return
1334                 video_title = mobj.group(1).decode('utf-8')
1335                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1336
1337                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1338                 if mobj is None:
1339                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1340                         return
1341                 video_uploader = mobj.group(1).decode('utf-8')
1342
1343                 # Extract video thumbnail
1344                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1345                 if mobj is None:
1346                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1347                         return
1348                 video_thumbnail = mobj.group(1).decode('utf-8')
1349
1350                 # Extract video description
1351                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1352                 if mobj is None:
1353                         self._downloader.trouble(u'ERROR: unable to extract video description')
1354                         return
1355                 video_description = mobj.group(1).decode('utf-8')
1356                 if not video_description: video_description = 'No description available.'
1357
1358                 # Extract video height and width
1359                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1360                 if mobj is None:
1361                         self._downloader.trouble(u'ERROR: unable to extract video height')
1362                         return
1363                 yv_video_height = mobj.group(1)
1364
1365                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1366                 if mobj is None:
1367                         self._downloader.trouble(u'ERROR: unable to extract video width')
1368                         return
1369                 yv_video_width = mobj.group(1)
1370
1371                 # Retrieve video playlist to extract media URL
1372                 # I'm not completely sure what all these options are, but we
1373                 # seem to need most of them, otherwise the server sends a 401.
1374                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1375                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1376                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1377                                           '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1378                                           '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1379                 try:
1380                         self.report_download_webpage(video_id)
1381                         webpage = urllib2.urlopen(request).read()
1382                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1383                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1384                         return
1385
1386                 # Extract media URL from playlist XML
1387                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1388                 if mobj is None:
1389                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1390                         return
1391                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1392                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1393
1394                 try:
1395                         # Process video information
1396                         self._downloader.process_info({
1397                                 'id':           video_id.decode('utf-8'),
1398                                 'url':          video_url,
1399                                 'uploader':     video_uploader,
1400                                 'title':        video_title,
1401                                 'stitle':       simple_title,
1402                                 'ext':          video_extension.decode('utf-8'),
1403                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1404                                 'description':  video_description,
1405                                 'thumbnail':    video_thumbnail,
1406                                 'description':  video_description,
1407                                 'player_url':   None,
1408                         })
1409                 except UnavailableFormatError:
1410                         self._downloader.trouble(u'ERROR: format not available for video')
1411
1412
1413 class GenericIE(InfoExtractor):
1414         """Generic last-resort information extractor."""
1415
1416         def __init__(self, downloader=None):
1417                 InfoExtractor.__init__(self, downloader)
1418
1419         @staticmethod
1420         def suitable(url):
1421                 return True
1422
1423         def report_download_webpage(self, video_id):
1424                 """Report webpage download."""
1425                 self._downloader.to_stdout(u'WARNING: Falling back on generic information extractor.')
1426                 self._downloader.to_stdout(u'[generic] %s: Downloading webpage' % video_id)
1427
1428         def report_extraction(self, video_id):
1429                 """Report information extraction."""
1430                 self._downloader.to_stdout(u'[generic] %s: Extracting information' % video_id)
1431
1432         def _real_initialize(self):
1433                 return
1434
1435         def _real_extract(self, url):
1436                 video_id = url.split('/')[-1]
1437                 request = urllib2.Request(url)
1438                 try:
1439                         self.report_download_webpage(video_id)
1440                         webpage = urllib2.urlopen(request).read()
1441                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1442                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1443                         return
1444                 except ValueError, err:
1445                         # since this is the last-resort InfoExtractor, if
1446                         # this error is thrown, it'll be thrown here
1447                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1448                         return
1449
1450                 # Start with something easy: JW Player in SWFObject
1451                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1452                 if mobj is None:
1453                         # Broaden the search a little bit
1454                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1455                 if mobj is None:
1456                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1457                         return
1458
1459                 # It's possible that one of the regexes
1460                 # matched, but returned an empty group:
1461                 if mobj.group(1) is None:
1462                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1463                         return
1464
1465                 video_url = urllib.unquote(mobj.group(1))
1466                 video_id  = os.path.basename(video_url)
1467
1468                 # here's a fun little line of code for you:
1469                 video_extension = os.path.splitext(video_id)[1][1:]
1470                 video_id        = os.path.splitext(video_id)[0]
1471
1472                 # it's tempting to parse this further, but you would
1473                 # have to take into account all the variations like
1474                 #   Video Title - Site Name
1475                 #   Site Name | Video Title
1476                 #   Video Title - Tagline | Site Name
1477                 # and so on and so forth; it's just not practical
1478                 mobj = re.search(r'<title>(.*)</title>', webpage)
1479                 if mobj is None:
1480                         self._downloader.trouble(u'ERROR: unable to extract title')
1481                         return
1482                 video_title = mobj.group(1).decode('utf-8')
1483                 video_title = sanitize_title(video_title)
1484                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1485
1486                 # video uploader is domain name
1487                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1488                 if mobj is None:
1489                         self._downloader.trouble(u'ERROR: unable to extract title')
1490                         return
1491                 video_uploader = mobj.group(1).decode('utf-8')
1492
1493                 try:
1494                         # Process video information
1495                         self._downloader.process_info({
1496                                 'id':           video_id.decode('utf-8'),
1497                                 'url':          video_url.decode('utf-8'),
1498                                 'uploader':     video_uploader,
1499                                 'title':        video_title,
1500                                 'stitle':       simple_title,
1501                                 'ext':          video_extension.decode('utf-8'),
1502                                 'format':       u'NA',
1503                                 'player_url':   None,
1504                         })
1505                 except UnavailableFormatError:
1506                         self._downloader.trouble(u'ERROR: format not available for video')
1507
1508
1509 class YoutubeSearchIE(InfoExtractor):
1510         """Information Extractor for YouTube search queries."""
1511         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1512         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1513         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1514         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1515         _youtube_ie = None
1516         _max_youtube_results = 1000
1517
1518         def __init__(self, youtube_ie, downloader=None):
1519                 InfoExtractor.__init__(self, downloader)
1520                 self._youtube_ie = youtube_ie
1521         
1522         @staticmethod
1523         def suitable(url):
1524                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1525
1526         def report_download_page(self, query, pagenum):
1527                 """Report attempt to download playlist page with given number."""
1528                 query = query.decode(preferredencoding())
1529                 self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1530
1531         def _real_initialize(self):
1532                 self._youtube_ie.initialize()
1533         
1534         def _real_extract(self, query):
1535                 mobj = re.match(self._VALID_QUERY, query)
1536                 if mobj is None:
1537                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1538                         return
1539
1540                 prefix, query = query.split(':')
1541                 prefix = prefix[8:]
1542                 query  = query.encode('utf-8')
1543                 if prefix == '':
1544                         self._download_n_results(query, 1)
1545                         return
1546                 elif prefix == 'all':
1547                         self._download_n_results(query, self._max_youtube_results)
1548                         return
1549                 else:
1550                         try:
1551                                 n = long(prefix)
1552                                 if n <= 0:
1553                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1554                                         return
1555                                 elif n > self._max_youtube_results:
1556                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
1557                                         n = self._max_youtube_results
1558                                 self._download_n_results(query, n)
1559                                 return
1560                         except ValueError: # parsing prefix as integer fails
1561                                 self._download_n_results(query, 1)
1562                                 return
1563
1564         def _download_n_results(self, query, n):
1565                 """Downloads a specified number of results for a query"""
1566
1567                 video_ids = []
1568                 already_seen = set()
1569                 pagenum = 1
1570
1571                 while True:
1572                         self.report_download_page(query, pagenum)
1573                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1574                         request = urllib2.Request(result_url, None, std_headers)
1575                         try:
1576                                 page = urllib2.urlopen(request).read()
1577                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1578                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1579                                 return
1580
1581                         # Extract video identifiers
1582                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1583                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1584                                 if video_id not in already_seen:
1585                                         video_ids.append(video_id)
1586                                         already_seen.add(video_id)
1587                                         if len(video_ids) == n:
1588                                                 # Specified n videos reached
1589                                                 for id in video_ids:
1590                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1591                                                 return
1592
1593                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1594                                 for id in video_ids:
1595                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1596                                 return
1597
1598                         pagenum = pagenum + 1
1599
1600 class GoogleSearchIE(InfoExtractor):
1601         """Information Extractor for Google Video search queries."""
1602         _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
1603         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1604         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
1605         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
1606         _google_ie = None
1607         _max_google_results = 1000
1608
1609         def __init__(self, google_ie, downloader=None):
1610                 InfoExtractor.__init__(self, downloader)
1611                 self._google_ie = google_ie
1612         
1613         @staticmethod
1614         def suitable(url):
1615                 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
1616
1617         def report_download_page(self, query, pagenum):
1618                 """Report attempt to download playlist page with given number."""
1619                 query = query.decode(preferredencoding())
1620                 self._downloader.to_stdout(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1621
1622         def _real_initialize(self):
1623                 self._google_ie.initialize()
1624         
1625         def _real_extract(self, query):
1626                 mobj = re.match(self._VALID_QUERY, query)
1627                 if mobj is None:
1628                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1629                         return
1630
1631                 prefix, query = query.split(':')
1632                 prefix = prefix[8:]
1633                 query  = query.encode('utf-8')
1634                 if prefix == '':
1635                         self._download_n_results(query, 1)
1636                         return
1637                 elif prefix == 'all':
1638                         self._download_n_results(query, self._max_google_results)
1639                         return
1640                 else:
1641                         try:
1642                                 n = long(prefix)
1643                                 if n <= 0:
1644                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1645                                         return
1646                                 elif n > self._max_google_results:
1647                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)'  % (self._max_google_results, n))
1648                                         n = self._max_google_results
1649                                 self._download_n_results(query, n)
1650                                 return
1651                         except ValueError: # parsing prefix as integer fails
1652                                 self._download_n_results(query, 1)
1653                                 return
1654
1655         def _download_n_results(self, query, n):
1656                 """Downloads a specified number of results for a query"""
1657
1658                 video_ids = []
1659                 already_seen = set()
1660                 pagenum = 1
1661
1662                 while True:
1663                         self.report_download_page(query, pagenum)
1664                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1665                         request = urllib2.Request(result_url, None, std_headers)
1666                         try:
1667                                 page = urllib2.urlopen(request).read()
1668                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1669                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1670                                 return
1671
1672                         # Extract video identifiers
1673                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1674                                 video_id = mobj.group(1)
1675                                 if video_id not in already_seen:
1676                                         video_ids.append(video_id)
1677                                         already_seen.add(video_id)
1678                                         if len(video_ids) == n:
1679                                                 # Specified n videos reached
1680                                                 for id in video_ids:
1681                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1682                                                 return
1683
1684                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1685                                 for id in video_ids:
1686                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1687                                 return
1688
1689                         pagenum = pagenum + 1
1690
1691 class YahooSearchIE(InfoExtractor):
1692         """Information Extractor for Yahoo! Video search queries."""
1693         _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
1694         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1695         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1696         _MORE_PAGES_INDICATOR = r'\s*Next'
1697         _yahoo_ie = None
1698         _max_yahoo_results = 1000
1699
1700         def __init__(self, yahoo_ie, downloader=None):
1701                 InfoExtractor.__init__(self, downloader)
1702                 self._yahoo_ie = yahoo_ie
1703         
1704         @staticmethod
1705         def suitable(url):
1706                 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
1707
1708         def report_download_page(self, query, pagenum):
1709                 """Report attempt to download playlist page with given number."""
1710                 query = query.decode(preferredencoding())
1711                 self._downloader.to_stdout(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1712
1713         def _real_initialize(self):
1714                 self._yahoo_ie.initialize()
1715         
1716         def _real_extract(self, query):
1717                 mobj = re.match(self._VALID_QUERY, query)
1718                 if mobj is None:
1719                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1720                         return
1721
1722                 prefix, query = query.split(':')
1723                 prefix = prefix[8:]
1724                 query  = query.encode('utf-8')
1725                 if prefix == '':
1726                         self._download_n_results(query, 1)
1727                         return
1728                 elif prefix == 'all':
1729                         self._download_n_results(query, self._max_yahoo_results)
1730                         return
1731                 else:
1732                         try:
1733                                 n = long(prefix)
1734                                 if n <= 0:
1735                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1736                                         return
1737                                 elif n > self._max_yahoo_results:
1738                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)'  % (self._max_yahoo_results, n))
1739                                         n = self._max_yahoo_results
1740                                 self._download_n_results(query, n)
1741                                 return
1742                         except ValueError: # parsing prefix as integer fails
1743                                 self._download_n_results(query, 1)
1744                                 return
1745
1746         def _download_n_results(self, query, n):
1747                 """Downloads a specified number of results for a query"""
1748
1749                 video_ids = []
1750                 already_seen = set()
1751                 pagenum = 1
1752
1753                 while True:
1754                         self.report_download_page(query, pagenum)
1755                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1756                         request = urllib2.Request(result_url, None, std_headers)
1757                         try:
1758                                 page = urllib2.urlopen(request).read()
1759                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1760                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1761                                 return
1762
1763                         # Extract video identifiers
1764                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1765                                 video_id = mobj.group(1)
1766                                 if video_id not in already_seen:
1767                                         video_ids.append(video_id)
1768                                         already_seen.add(video_id)
1769                                         if len(video_ids) == n:
1770                                                 # Specified n videos reached
1771                                                 for id in video_ids:
1772                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1773                                                 return
1774
1775                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1776                                 for id in video_ids:
1777                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1778                                 return
1779
1780                         pagenum = pagenum + 1
1781
1782 class YoutubePlaylistIE(InfoExtractor):
1783         """Information Extractor for YouTube playlists."""
1784
1785         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/)([^&]+).*'
1786         _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
1787         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
1788         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1789         _youtube_ie = None
1790
1791         def __init__(self, youtube_ie, downloader=None):
1792                 InfoExtractor.__init__(self, downloader)
1793                 self._youtube_ie = youtube_ie
1794         
1795         @staticmethod
1796         def suitable(url):
1797                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
1798
1799         def report_download_page(self, playlist_id, pagenum):
1800                 """Report attempt to download playlist page with given number."""
1801                 self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1802
1803         def _real_initialize(self):
1804                 self._youtube_ie.initialize()
1805         
1806         def _real_extract(self, url):
1807                 # Extract playlist id
1808                 mobj = re.match(self._VALID_URL, url)
1809                 if mobj is None:
1810                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1811                         return
1812
1813                 # Download playlist pages
1814                 playlist_id = mobj.group(1)
1815                 video_ids = []
1816                 pagenum = 1
1817
1818                 while True:
1819                         self.report_download_page(playlist_id, pagenum)
1820                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
1821                         try:
1822                                 page = urllib2.urlopen(request).read()
1823                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1824                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1825                                 return
1826
1827                         # Extract video identifiers
1828                         ids_in_page = []
1829                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1830                                 if mobj.group(1) not in ids_in_page:
1831                                         ids_in_page.append(mobj.group(1))
1832                         video_ids.extend(ids_in_page)
1833
1834                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1835                                 break
1836                         pagenum = pagenum + 1
1837
1838                 for id in video_ids:
1839                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1840                 return
1841
1842 class YoutubeUserIE(InfoExtractor):
1843         """Information Extractor for YouTube users."""
1844
1845         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
1846         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1847         _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
1848         _youtube_ie = None
1849
1850         def __init__(self, youtube_ie, downloader=None):
1851                 InfoExtractor.__init__(self, downloader)
1852                 self._youtube_ie = youtube_ie
1853         
1854         @staticmethod
1855         def suitable(url):
1856                 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
1857
1858         def report_download_page(self, username):
1859                 """Report attempt to download user page."""
1860                 self._downloader.to_stdout(u'[youtube] user %s: Downloading page ' % (username))
1861
1862         def _real_initialize(self):
1863                 self._youtube_ie.initialize()
1864         
1865         def _real_extract(self, url):
1866                 # Extract username
1867                 mobj = re.match(self._VALID_URL, url)
1868                 if mobj is None:
1869                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1870                         return
1871
1872                 # Download user page
1873                 username = mobj.group(1)
1874                 video_ids = []
1875                 pagenum = 1
1876
1877                 self.report_download_page(username)
1878                 request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
1879                 try:
1880                         page = urllib2.urlopen(request).read()
1881                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1882                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1883                         return
1884
1885                 # Extract video identifiers
1886                 ids_in_page = []
1887
1888                 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1889                         if mobj.group(1) not in ids_in_page:
1890                                 ids_in_page.append(mobj.group(1))
1891                 video_ids.extend(ids_in_page)
1892
1893                 for id in video_ids:
1894                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1895                 return
1896
class PostProcessor(object):
	"""Base class for post-download processing steps.

	Instances are registered on a downloader through its
	add_post_processor() method. After every successful download the
	downloader walks its chain of PostProcessors, invoking run() on
	each: the first call receives an initial information dictionary,
	and each subsequent call receives whatever the previous run()
	returned.

	A return value of None stops the chain; otherwise processing
	continues until the last PostProcessor has run.

	Like InfoExtractor, this class uses a "mutual registration"
	scheme with its downloader.
	"""

	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Sets the downloader for this PP."""
		self._downloader = downloader

	def run(self, information):
		"""Run the PostProcessor.

		The "information" argument is a dictionary shaped like the
		ones InfoExtractors produce, extended with a "filepath" key
		naming the file that was just downloaded.

		Returning None halts the postprocessing chain; returning an
		information dictionary (possibly the received one with some
		fields altered) forwards it to the next PostProcessor.

		Implementations may also raise PostProcessingError, which
		the calling downloader will handle.
		"""
		# Default behaviour: pass the information through untouched.
		return information
1942         
### MAIN PROGRAM ###
if __name__ == '__main__':
	try:
		# Modules needed only when running the main program
		import getpass
		import optparse

		# Function to update the program file with the latest version from bitbucket.org
		def update_self(downloader, filename):
			"""Overwrite *filename* with the latest stable youtube-dl.

			Reads the LATEST_VERSION tag from bitbucket, downloads that
			revision of the script and writes it over the running
			program file. Exits the process if the file is not writable.
			"""
			# Note: downloader only used for options
			if not os.access (filename, os.W_OK):
				sys.exit('ERROR: no write permissions on %s' % filename)

			downloader.to_stdout('Updating to latest stable version...')
			latest_url = 'http://bitbucket.org/rg3/youtube-dl/raw/tip/LATEST_VERSION'
			latest_version = urllib.urlopen(latest_url).read().strip()
			prog_url = 'http://bitbucket.org/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
			newcontent = urllib.urlopen(prog_url).read()
			stream = open(filename, 'w')
			stream.write(newcontent)
			stream.close()
			downloader.to_stdout('Updated to version %s' % latest_version)

		# General configuration
		# NOTE(review): the second install_opener() replaces the opener
		# installed on the line before it; build_opener() includes a
		# default ProxyHandler anyway, so proxy support presumably
		# survives — confirm before relying on the first line.
		urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
		urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
		socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

		# Parse command line
		parser = optparse.OptionParser(
			usage='Usage: %prog [options] url...',
			version='2010.04.04',
			conflict_handler='resolve',
		)

		parser.add_option('-h', '--help',
				action='help', help='print this help text and exit')
		parser.add_option('-v', '--version',
				action='version', help='print program version and exit')
		parser.add_option('-U', '--update',
				action='store_true', dest='update_self', help='update this program to latest stable version')
		parser.add_option('-i', '--ignore-errors',
				action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
		parser.add_option('-r', '--rate-limit',
				dest='ratelimit', metavar='L', help='download rate limit (e.g. 50k or 44.6m)')

		authentication = optparse.OptionGroup(parser, 'Authentication Options')
		authentication.add_option('-u', '--username',
				dest='username', metavar='UN', help='account username')
		authentication.add_option('-p', '--password',
				dest='password', metavar='PW', help='account password')
		authentication.add_option('-n', '--netrc',
				action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
		parser.add_option_group(authentication)

		# All format aliases store into the same dest ('format') as string constants.
		video_format = optparse.OptionGroup(parser, 'Video Format Options')
		video_format.add_option('-f', '--format',
				action='store', dest='format', metavar='FMT', help='video format code')
		video_format.add_option('-b', '--best-quality',
				action='store_const', dest='format', help='download the best quality video possible', const='0')
		video_format.add_option('-m', '--mobile-version',
				action='store_const', dest='format', help='alias for -f 17', const='17')
		video_format.add_option('-d', '--high-def',
				action='store_const', dest='format', help='alias for -f 22', const='22')
		video_format.add_option('--all-formats',
				action='store_const', dest='format', help='download all available video formats', const='-1')
		parser.add_option_group(video_format)

		verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
		verbosity.add_option('-q', '--quiet',
				action='store_true', dest='quiet', help='activates quiet mode', default=False)
		verbosity.add_option('-s', '--simulate',
				action='store_true', dest='simulate', help='do not download video', default=False)
		verbosity.add_option('-g', '--get-url',
				action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
		verbosity.add_option('-e', '--get-title',
				action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
		verbosity.add_option('--get-thumbnail',
				action='store_true', dest='getthumbnail', help='simulate, quiet but print thumbnail URL', default=False)
		verbosity.add_option('--get-description',
				action='store_true', dest='getdescription', help='simulate, quiet but print video description', default=False)
		verbosity.add_option('--no-progress',
				action='store_true', dest='noprogress', help='do not print progress bar', default=False)
		parser.add_option_group(verbosity)

		filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
		filesystem.add_option('-t', '--title',
				action='store_true', dest='usetitle', help='use title in file name', default=False)
		filesystem.add_option('-l', '--literal',
				action='store_true', dest='useliteral', help='use literal title in file name', default=False)
		filesystem.add_option('-o', '--output',
				dest='outtmpl', metavar='TPL', help='output filename template')
		filesystem.add_option('-a', '--batch-file',
				dest='batchfile', metavar='F', help='file containing URLs to download (\'-\' for stdin)')
		filesystem.add_option('-w', '--no-overwrites',
				action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
		filesystem.add_option('-c', '--continue',
				action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
		parser.add_option_group(filesystem)

		(opts, args) = parser.parse_args()

		# Batch file verification: read URLs (one per line), strip
		# whitespace and drop empty lines; '-' means read from stdin.
		batchurls = []
		if opts.batchfile is not None:
			try:
				if opts.batchfile == '-':
					batchfd = sys.stdin
				else:
					batchfd = open(opts.batchfile, 'r')
				batchurls = batchfd.readlines()
				batchurls = [x.strip() for x in batchurls]
				batchurls = [x for x in batchurls if len(x) > 0]
			except IOError:
				sys.exit(u'ERROR: batch file could not be read')
		all_urls = batchurls + args

		# Conflicting, missing and erroneous options.
		# Conflict checks must run before the password prompt below, so
		# we never prompt when the option combination is invalid anyway.
		if opts.usenetrc and (opts.username is not None or opts.password is not None):
			parser.error(u'using .netrc conflicts with giving username/password')
		if opts.password is not None and opts.username is None:
			parser.error(u'account username missing')
		if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
			parser.error(u'using output template conflicts with using title or literal title')
		if opts.usetitle and opts.useliteral:
			parser.error(u'using title conflicts with using literal title')
		if opts.username is not None and opts.password is None:
			opts.password = getpass.getpass(u'Type account password and press return:')
		if opts.ratelimit is not None:
			numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
			if numeric_limit is None:
				parser.error(u'invalid rate limit specified')
			opts.ratelimit = numeric_limit

		# Information extractors: the search/playlist/user IEs wrap the
		# plain site IEs and delegate actual extraction to them.
		youtube_ie = YoutubeIE()
		metacafe_ie = MetacafeIE(youtube_ie)
		youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
		youtube_user_ie = YoutubeUserIE(youtube_ie)
		youtube_search_ie = YoutubeSearchIE(youtube_ie)
		google_ie = GoogleIE()
		google_search_ie = GoogleSearchIE(google_ie)
		photobucket_ie = PhotobucketIE()
		yahoo_ie = YahooIE()
		yahoo_search_ie = YahooSearchIE(yahoo_ie)
		generic_ie = GenericIE()

		# File downloader
		# 'outtmpl' is chosen by an or-chain: an explicit -o template
		# wins, otherwise the first template whose flag combination is
		# truthy is used, falling back to '%(id)s.%(ext)s'.
		fd = FileDownloader({
			'usenetrc': opts.usenetrc,
			'username': opts.username,
			'password': opts.password,
			'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
			'forceurl': opts.geturl,
			'forcetitle': opts.gettitle,
			'forcethumbnail': opts.getthumbnail,
			'forcedescription': opts.getdescription,
			'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
			'format': opts.format,
			'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
				or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
				or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
				or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
				or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
				or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
				or u'%(id)s.%(ext)s'),
			'ignoreerrors': opts.ignoreerrors,
			'ratelimit': opts.ratelimit,
			'nooverwrites': opts.nooverwrites,
			'continuedl': opts.continue_dl,
			'noprogress': opts.noprogress,
			})
		# Registration order matters: more specific IEs (search,
		# playlist, user) must be tried before the plain site IEs.
		fd.add_info_extractor(youtube_search_ie)
		fd.add_info_extractor(youtube_pl_ie)
		fd.add_info_extractor(youtube_user_ie)
		fd.add_info_extractor(metacafe_ie)
		fd.add_info_extractor(youtube_ie)
		fd.add_info_extractor(google_ie)
		fd.add_info_extractor(google_search_ie)
		fd.add_info_extractor(photobucket_ie)
		fd.add_info_extractor(yahoo_ie)
		fd.add_info_extractor(yahoo_search_ie)

		# This must come last since it's the
		# fallback if none of the others work
		fd.add_info_extractor(generic_ie)

		# Update version
		if opts.update_self:
			update_self(fd, sys.argv[0])

		# Maybe do nothing: with no URLs, -U alone is a valid invocation.
		if len(all_urls) < 1:
			if not opts.update_self:
				parser.error(u'you must provide at least one URL')
			else:
				sys.exit()
		retcode = fd.download(all_urls)
		sys.exit(retcode)

	except DownloadError:
		sys.exit(1)
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')