e28788e3c9b463efc694d4ff23fce85c09f138cc
[youtube-dl] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # Author: Vasyl' Vavrychuk
7 # Author: Witold Baryluk
8 # License: Public domain code
9 import cookielib
10 import ctypes
11 import datetime
12 import email.utils
13 import gzip
14 import htmlentitydefs
15 import httplib
16 import locale
17 import math
18 import netrc
19 import os
20 import os.path
21 import re
22 import socket
23 import string
24 import StringIO
25 import subprocess
26 import sys
27 import time
28 import urllib
29 import urllib2
30 import zlib
31
32 # parse_qs was moved from the cgi module to the urlparse module recently.
33 try:
34         from urlparse import parse_qs
35 except ImportError:
36         from cgi import parse_qs
37
# HTTP headers sent with every request so servers treat us like a regular
# browser; YoutubeDLHandler installs these on each outgoing request.
std_headers = {
	'User-Agent': 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.12) Gecko/20101028 Firefox/3.6.12',
	'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
	'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
	'Accept-Encoding': 'gzip, deflate',
	'Accept-Language': 'en-us,en;q=0.5',
}

# ASCII letters and digits as a unicode string (Python 2 str.decode);
# presumably the character set allowed in "simple" titles — consumers of
# this constant are not visible in this chunk.
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
47
def preferredencoding():
	"""Get preferred encoding.

	Returns the best encoding scheme for the system, based on
	locale.getpreferredencoding() and some further tweaks. Falls back
	to 'UTF-8' when the reported encoding is unusable.
	"""
	# The original wrapped this in a one-shot generator and used a bare
	# "except:"; a plain try/except with a narrowed handler is equivalent
	# and no longer swallows KeyboardInterrupt/SystemExit.
	try:
		pref = locale.getpreferredencoding()
		# Verify the codec actually exists and can encode text.
		u'TEST'.encode(pref)
	except Exception:
		pref = 'UTF-8'
	return pref
63
64 def htmlentity_transform(matchobj):
65         """Transforms an HTML entity to a Unicode character.
66
67         This function receives a match object and is intended to be used with
68         the re.sub() function.
69         """
70         entity = matchobj.group(1)
71
72         # Known non-numeric HTML entity
73         if entity in htmlentitydefs.name2codepoint:
74                 return unichr(htmlentitydefs.name2codepoint[entity])
75
76         # Unicode character
77         mobj = re.match(ur'(?u)#(x?\d+)', entity)
78         if mobj is not None:
79                 numstr = mobj.group(1)
80                 if numstr.startswith(u'x'):
81                         base = 16
82                         numstr = u'0%s' % numstr
83                 else:
84                         base = 10
85                 return unichr(long(numstr, base))
86
87         # Unknown entity in name, return its literal representation
88         return (u'&%s;' % entity)
89
90 def sanitize_title(utitle):
91         """Sanitizes a video title so it could be used as part of a filename."""
92         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
93         return utitle.replace(unicode(os.sep), u'%')
94
def sanitize_open(filename, open_mode):
	"""Try to open the given filename, and slightly tweak it if this fails.

	Attempts to open the given filename. If this fails, it tries to change
	the filename slightly, step by step, until it's either able to open it
	or it fails and raises a final exception, like the standard open()
	function.

	It returns the tuple (stream, definitive_file_name).
	"""
	try:
		if filename == u'-':
			# Special case: '-' means standard output. On Windows the
			# stream must be switched to binary mode or newline
			# translation would corrupt the data.
			if sys.platform == 'win32':
				import msvcrt
				msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
			return (sys.stdout, filename)
		stream = open(filename, open_mode)
		return (stream, filename)
	except (IOError, OSError), err:
		# In case of error, try to remove win32 forbidden chars
		filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

		# An exception here should be caught in the caller
		stream = open(filename, open_mode)
		return (stream, filename)
120
def timeconvert(timestr):
    """Convert an RFC 2822 date string into a Unix timestamp.

    Returns None when the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
128
class DownloadError(Exception):
	"""Raised when downloading fails.

	FileDownloader objects throw this exception when they are not
	configured to continue on errors; it carries the appropriate
	error message.
	"""
137
class SameFileError(Exception):
	"""Raised when several downloads would collide on one output file.

	FileDownloader objects throw this exception when they detect that
	multiple files would have to be written to the same path on disk.
	"""
145
class PostProcessingError(Exception):
	"""Raised when a postprocessing step fails.

	A PostProcessor's .run() method may raise this exception to signal
	an error in the postprocessing task.
	"""
153
class UnavailableVideoError(Exception):
	"""Raised when a video is requested in an unavailable format.

	Thrown when the requested format does not exist for the video.
	"""
161
class ContentTooShortError(Exception):
	"""Raised when a download delivers fewer bytes than announced.

	FileDownloader objects raise this when the file they fetched is
	smaller than the server's announced size, which usually means the
	connection was interrupted.
	"""
	# Byte counts describing the failed transfer (both in bytes).
	downloaded = None
	expected = None

	def __init__(self, downloaded, expected):
		self.expected = expected
		self.downloaded = downloaded
176
class YoutubeDLHandler(urllib2.HTTPHandler):
	"""Handler for HTTP requests and responses.

	This class, when installed with an OpenerDirector, automatically adds
	the standard headers to every HTTP request and handles gzipped and
	deflated responses from web servers. If compression is to be avoided in
	a particular request, the original request in the program code only has
	to include the HTTP header "Youtubedl-No-Compression", which will be
	removed before making the real request.

	Part of this code was copied from:

	  http://techknack.net/python-urllib2-handlers/

	Andrew Rowls, the author of that code, agreed to release it to the
	public domain.
	"""

	@staticmethod
	def deflate(data):
		# Try raw deflate first (negative wbits = no zlib header), then
		# fall back to a zlib-wrapped stream; servers are inconsistent
		# about which flavour they send.
		try:
			return zlib.decompress(data, -zlib.MAX_WBITS)
		except zlib.error:
			return zlib.decompress(data)

	@staticmethod
	def addinfourl_wrapper(stream, headers, url, code):
		# Newer Python versions accept 'code' in the addinfourl
		# constructor; emulate it on older ones by setting the attribute.
		if hasattr(urllib2.addinfourl, 'getcode'):
			return urllib2.addinfourl(stream, headers, url, code)
		ret = urllib2.addinfourl(stream, headers, url)
		ret.code = code
		return ret

	def http_request(self, req):
		# Install the standard headers, replacing any same-named headers
		# already present on the request.
		for h in std_headers:
			if h in req.headers:
				del req.headers[h]
			req.add_header(h, std_headers[h])
		# Honour the internal no-compression marker and strip it before
		# the request goes out on the wire.
		if 'Youtubedl-no-compression' in req.headers:
			if 'Accept-encoding' in req.headers:
				del req.headers['Accept-encoding']
			del req.headers['Youtubedl-no-compression']
		return req

	def http_response(self, req, resp):
		old_resp = resp
		# gzip
		if resp.headers.get('Content-encoding', '') == 'gzip':
			gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
		# deflate
		if resp.headers.get('Content-encoding', '') == 'deflate':
			gz = StringIO.StringIO(self.deflate(resp.read()))
			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
		return resp
234
235 class FileDownloader(object):
236         """File Downloader class.
237
238         File downloader objects are the ones responsible of downloading the
239         actual video file and writing it to disk if the user has requested
240         it, among some other tasks. In most cases there should be one per
241         program. As, given a video URL, the downloader doesn't know how to
242         extract all the needed information, task that InfoExtractors do, it
243         has to pass the URL to one of them.
244
245         For this, file downloader objects have a method that allows
246         InfoExtractors to be registered in a given order. When it is passed
247         a URL, the file downloader handles it to the first InfoExtractor it
248         finds that reports being able to handle it. The InfoExtractor extracts
249         all the information about the video or videos the URL refers to, and
250         asks the FileDownloader to process the video information, possibly
251         downloading the video.
252
253         File downloaders accept a lot of parameters. In order not to saturate
254         the object constructor with arguments, it receives a dictionary of
255         options instead. These options are available through the params
256         attribute for the InfoExtractors to use. The FileDownloader also
257         registers itself as the downloader in charge for the InfoExtractors
258         that are added to it, so this is a "mutual registration".
259
260         Available options:
261
262         username:         Username for authentication purposes.
263         password:         Password for authentication purposes.
264         usenetrc:         Use netrc for authentication instead.
265         quiet:            Do not print messages to stdout.
266         forceurl:         Force printing final URL.
267         forcetitle:       Force printing title.
268         forcethumbnail:   Force printing thumbnail URL.
269         forcedescription: Force printing description.
270         forcefilename:    Force printing final filename.
271         simulate:         Do not download the video files.
272         format:           Video format code.
273         format_limit:     Highest quality format to try.
274         outtmpl:          Template for output names.
275         ignoreerrors:     Do not stop on download errors.
276         ratelimit:        Download speed limit, in bytes/sec.
277         nooverwrites:     Prevent overwriting files.
278         retries:          Number of times to retry for HTTP error 5xx
279         continuedl:       Try to continue downloads if possible.
280         noprogress:       Do not print the progress bar.
281         playliststart:    Playlist item to start at.
282         playlistend:      Playlist item to end at.
283         logtostderr:      Log messages to stderr instead of stdout.
284         consoletitle:     Display progress in console window's titlebar.
285         nopart:           Do not use temporary .part files.
286         """
287
288         params = None
289         _ies = []
290         _pps = []
291         _download_retcode = None
292         _num_downloads = None
293         _screen_file = None
294
295         def __init__(self, params):
296                 """Create a FileDownloader object with the given options."""
297                 self._ies = []
298                 self._pps = []
299                 self._download_retcode = 0
300                 self._num_downloads = 0
301                 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
302                 self.params = params
303
304         @staticmethod
305         def pmkdir(filename):
306                 """Create directory components in filename. Similar to Unix "mkdir -p"."""
307                 components = filename.split(os.sep)
308                 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
309                 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
310                 for dir in aggregate:
311                         if not os.path.exists(dir):
312                                 os.mkdir(dir)
313
314         @staticmethod
315         def format_bytes(bytes):
316                 if bytes is None:
317                         return 'N/A'
318                 if type(bytes) is str:
319                         bytes = float(bytes)
320                 if bytes == 0.0:
321                         exponent = 0
322                 else:
323                         exponent = long(math.log(bytes, 1024.0))
324                 suffix = 'bkMGTPEZY'[exponent]
325                 converted = float(bytes) / float(1024**exponent)
326                 return '%.2f%s' % (converted, suffix)
327
328         @staticmethod
329         def calc_percent(byte_counter, data_len):
330                 if data_len is None:
331                         return '---.-%'
332                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
333
334         @staticmethod
335         def calc_eta(start, now, total, current):
336                 if total is None:
337                         return '--:--'
338                 dif = now - start
339                 if current == 0 or dif < 0.001: # One millisecond
340                         return '--:--'
341                 rate = float(current) / dif
342                 eta = long((float(total) - float(current)) / rate)
343                 (eta_mins, eta_secs) = divmod(eta, 60)
344                 if eta_mins > 99:
345                         return '--:--'
346                 return '%02d:%02d' % (eta_mins, eta_secs)
347
348         @staticmethod
349         def calc_speed(start, now, bytes):
350                 dif = now - start
351                 if bytes == 0 or dif < 0.001: # One millisecond
352                         return '%10s' % '---b/s'
353                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
354
355         @staticmethod
356         def best_block_size(elapsed_time, bytes):
357                 new_min = max(bytes / 2.0, 1.0)
358                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
359                 if elapsed_time < 0.001:
360                         return long(new_max)
361                 rate = bytes / elapsed_time
362                 if rate > new_max:
363                         return long(new_max)
364                 if rate < new_min:
365                         return long(new_min)
366                 return long(rate)
367
368         @staticmethod
369         def parse_bytes(bytestr):
370                 """Parse a string indicating a byte quantity into a long integer."""
371                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
372                 if matchobj is None:
373                         return None
374                 number = float(matchobj.group(1))
375                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
376                 return long(round(number * multiplier))
377
378         def add_info_extractor(self, ie):
379                 """Add an InfoExtractor object to the end of the list."""
380                 self._ies.append(ie)
381                 ie.set_downloader(self)
382
383         def add_post_processor(self, pp):
384                 """Add a PostProcessor object to the end of the chain."""
385                 self._pps.append(pp)
386                 pp.set_downloader(self)
387
	def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
		"""Print message to stdout if not in quiet mode."""
		try:
			if not self.params.get('quiet', False):
				# skip_eol selects an empty terminator so progress lines
				# starting with '\r' can be rewritten in place.
				terminator = [u'\n', u''][skip_eol]
				print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
			self._screen_file.flush()
		except (UnicodeEncodeError), err:
			if not ignore_encoding_errors:
				raise
398
	def to_stderr(self, message):
		"""Print message to stderr, encoded for the current locale."""
		print >>sys.stderr, message.encode(preferredencoding())
402
	def to_cons_title(self, message):
		"""Set console/terminal window title to message."""
		if not self.params.get('consoletitle', False):
			return
		if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
			# c_wchar_p() might not be necessary if `message` is
			# already of type unicode()
			ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
		elif 'TERM' in os.environ:
			# xterm-compatible escape sequence to set the window title.
			sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
413
414         def fixed_template(self):
415                 """Checks if the output template is fixed."""
416                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
417
	def trouble(self, message=None):
		"""Determine action to take when a download problem appears.

		Depending on if the downloader has been configured to ignore
		download errors or not, this method may throw an exception or
		not when errors are found, after printing the message.
		"""
		if message is not None:
			self.to_stderr(message)
		if not self.params.get('ignoreerrors', False):
			# Not ignoring errors: abort before touching the return code.
			raise DownloadError(message)
		self._download_retcode = 1
430
431         def slow_down(self, start_time, byte_counter):
432                 """Sleep if the download speed is over the rate limit."""
433                 rate_limit = self.params.get('ratelimit', None)
434                 if rate_limit is None or byte_counter == 0:
435                         return
436                 now = time.time()
437                 elapsed = now - start_time
438                 if elapsed <= 0.0:
439                         return
440                 speed = float(byte_counter) / elapsed
441                 if speed > rate_limit:
442                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
443
444         def temp_name(self, filename):
445                 """Returns a temporary filename for the given filename."""
446                 if self.params.get('nopart', False) or filename == u'-' or \
447                                 (os.path.exists(filename) and not os.path.isfile(filename)):
448                         return filename
449                 return filename + u'.part'
450
451         def undo_temp_name(self, filename):
452                 if filename.endswith(u'.part'):
453                         return filename[:-len(u'.part')]
454                 return filename
455
456         def try_rename(self, old_filename, new_filename):
457                 try:
458                         if old_filename == new_filename:
459                                 return
460                         os.rename(old_filename, new_filename)
461                 except (IOError, OSError), err:
462                         self.trouble(u'ERROR: unable to rename file')
463
	def report_destination(self, filename):
		"""Report the destination filename, tolerating encoding errors."""
		self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
467
	def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
		"""Report download progress."""
		if self.params.get('noprogress', False):
			return
		# '\r' plus skip_eol rewrites the same screen line on every call.
		self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
				(percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
		self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
				(percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
476
	def report_resuming_byte(self, resume_len):
		"""Report attempt to resume the download at the given byte offset."""
		self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
480
	def report_retry(self, count, retries):
		"""Report retry attempt `count` of `retries` after an HTTP 5xx error."""
		self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
484
	def report_file_already_downloaded(self, file_name):
		"""Report file has already been fully downloaded."""
		try:
			self.to_screen(u'[download] %s has already been downloaded' % file_name)
		except (UnicodeEncodeError), err:
			# Fall back to a message without the filename when it cannot
			# be encoded for the current locale.
			self.to_screen(u'[download] The file has already been downloaded')
491
	def report_unable_to_resume(self):
		"""Report that resuming was impossible and a full restart is needed."""
		self.to_screen(u'[download] Unable to resume')
495
	def report_finish(self):
		"""Report download finished."""
		if self.params.get('noprogress', False):
			self.to_screen(u'[download] Download completed')
		else:
			# The progress line is still on screen; just terminate it.
			self.to_screen(u'')
502
	def increment_downloads(self):
		"""Increment the ordinal that assigns a number to each file.

		This counter feeds the 'autonumber' output-template field.
		"""
		self._num_downloads += 1
506
	def prepare_filename(self, info_dict):
		"""Generate the output filename.

		Expands the 'outtmpl' template with the video info plus two
		synthetic fields: 'epoch' (current Unix time) and 'autonumber'
		(zero-padded ordinal of this download). Returns None, after
		reporting trouble, when expansion fails.
		"""
		try:
			template_dict = dict(info_dict)
			template_dict['epoch'] = unicode(long(time.time()))
			template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
			filename = self.params['outtmpl'] % template_dict
			return filename
		except (ValueError, KeyError), err:
			self.trouble(u'ERROR: invalid system charset or erroneous output template')
			return None
518
	def process_info(self, info_dict):
		"""Process a single dictionary returned by an InfoExtractor.

		In simulate mode only the forced printings are performed;
		otherwise the video data is downloaded to the prepared filename
		and the postprocessing chain is run on the result.
		"""
		filename = self.prepare_filename(info_dict)
		# Do nothing else if in simulate mode
		if self.params.get('simulate', False):
			# Forced printings
			if self.params.get('forcetitle', False):
				print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forceurl', False):
				print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
				print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forcedescription', False) and 'description' in info_dict:
				print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forcefilename', False) and filename is not None:
				print filename.encode(preferredencoding(), 'xmlcharrefreplace')

			return

		if filename is None:
			return
		if self.params.get('nooverwrites', False) and os.path.exists(filename):
			self.to_stderr(u'WARNING: file exists and will be skipped')
			return

		try:
			self.pmkdir(filename)
		except (OSError, IOError), err:
			self.trouble(u'ERROR: unable to create directories: %s' % str(err))
			return

		try:
			success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
		except (OSError, IOError), err:
			# Local filesystem errors mean this format is unusable.
			raise UnavailableVideoError
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self.trouble(u'ERROR: unable to download video data: %s' % str(err))
			return
		except (ContentTooShortError, ), err:
			self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
			return

		if success:
			try:
				self.post_process(filename, info_dict)
			except (PostProcessingError), err:
				self.trouble(u'ERROR: postprocessing: %s' % str(err))
				return
567
568         def download(self, url_list):
569                 """Download a given list of URLs."""
570                 if len(url_list) > 1 and self.fixed_template():
571                         raise SameFileError(self.params['outtmpl'])
572
573                 for url in url_list:
574                         suitable_found = False
575                         for ie in self._ies:
576                                 # Go to next InfoExtractor if not suitable
577                                 if not ie.suitable(url):
578                                         continue
579
580                                 # Suitable InfoExtractor found
581                                 suitable_found = True
582
583                                 # Extract information from URL and process it
584                                 ie.extract(url)
585
586                                 # Suitable InfoExtractor had been found; go to next URL
587                                 break
588
589                         if not suitable_found:
590                                 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
591
592                 return self._download_retcode
593
594         def post_process(self, filename, ie_info):
595                 """Run the postprocessing chain on the given file."""
596                 info = dict(ie_info)
597                 info['filepath'] = filename
598                 for pp in self._pps:
599                         info = pp.run(info)
600                         if info is None:
601                                 break
602
	def _download_with_rtmpdump(self, filename, url, player_url):
		"""Download an rtmp:// URL by shelling out to the rtmpdump tool.

		Returns True on success and False on failure (after reporting
		trouble). Resuming is retried while the temporary file keeps
		growing.
		"""
		self.report_destination(filename)
		tmpfilename = self.temp_name(filename)

		# Check for rtmpdump first
		# NOTE(review): the devnull handle opened here is never closed.
		try:
			subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
		except (OSError, IOError):
			self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
			return False

		# Download using rtmpdump. rtmpdump returns exit code 2 when
		# the connection was interrupted and resuming appears to be
		# possible. This is part of rtmpdump's normal usage, AFAIK.
		basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
		retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
		while retval == 2 or retval == 1:
			prevsize = os.path.getsize(tmpfilename)
			self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
			time.sleep(5.0) # This seems to be needed
			retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
			cursize = os.path.getsize(tmpfilename)
			# If the file stopped growing and rtmpdump still reports an
			# error, assume no further progress is possible.
			if prevsize == cursize and retval == 1:
				break
		if retval == 0:
			self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
			self.try_rename(tmpfilename, filename)
			return True
		else:
			self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
			return False
634
	def _do_download(self, filename, url, player_url):
		"""Download url into filename over HTTP (rtmp URLs go to rtmpdump).

		Supports resuming a partial temp file, a bounded number of
		retries (the 'retries' param) and rate limiting.  Returns True
		on success (including "file already fully downloaded") and
		False on a handled error; raises ContentTooShortError when the
		stream ends short of the advertised Content-Length.
		"""
		# Check file already present
		if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
			self.report_file_already_downloaded(filename)
			return True

		# Attempt to download using rtmpdump
		if url.startswith('rtmp'):
			return self._download_with_rtmpdump(filename, url, player_url)

		tmpfilename = self.temp_name(filename)
		stream = None  # opened lazily, when the first data block arrives
		open_mode = 'wb'

		# Do not include the Accept-Encoding header
		# (presumably stripped by a custom opener elsewhere in the file;
		# the byte counters below must match the raw Content-Length, so
		# the payload may not be transparently compressed — TODO confirm)
		headers = {'Youtubedl-no-compression': 'True'}
		# basic_request stays Range-free so a failed resume can restart from 0
		basic_request = urllib2.Request(url, None, headers)
		request = urllib2.Request(url, None, headers)

		# Establish possible resume length
		if os.path.isfile(tmpfilename):
			resume_len = os.path.getsize(tmpfilename)
		else:
			resume_len = 0

		# Request parameters in case of being able to resume
		if self.params.get('continuedl', False) and resume_len != 0:
			self.report_resuming_byte(resume_len)
			request.add_header('Range','bytes=%d-' % resume_len)
			open_mode = 'ab'

		count = 0
		retries = self.params.get('retries', 0)
		while count <= retries:
			# Establish connection
			try:
				data = urllib2.urlopen(request)
				break
			except (urllib2.HTTPError, ), err:
				if (err.code < 500 or err.code >= 600) and err.code != 416:
					# Unexpected HTTP error
					raise
				elif err.code == 416:
					# Unable to resume (requested range not satisfiable)
					try:
						# Open the connection again without the range header
						data = urllib2.urlopen(basic_request)
						content_length = data.info()['Content-Length']
					except (urllib2.HTTPError, ), err:
						if err.code < 500 or err.code >= 600:
							raise
					else:
						# This 'else' runs only when the Range-free
						# request above succeeded.
						# Examine the reported length
						if (content_length is not None and
						    (resume_len - 100 < long(content_length) < resume_len + 100)):
							# The file had already been fully downloaded.
							# Explanation to the above condition: in issue #175 it was revealed that
							# YouTube sometimes adds or removes a few bytes from the end of the file,
							# changing the file size slightly and causing problems for some users. So
							# I decided to implement a suggested change and consider the file
							# completely downloaded if the file size differs less than 100 bytes from
							# the one in the hard drive.
							self.report_file_already_downloaded(filename)
							self.try_rename(tmpfilename, filename)
							return True
						else:
							# The length does not match, we start the download over
							self.report_unable_to_resume()
							open_mode = 'wb'
							break
			# Retry
			count += 1
			if count <= retries:
				self.report_retry(count, retries)

		if count > retries:
			self.trouble(u'ERROR: giving up after %s retries' % retries)
			return False

		data_len = data.info().get('Content-length', None)
		if data_len is not None:
			# Server reports the remaining bytes; add the resumed prefix
			data_len = long(data_len) + resume_len
		data_len_str = self.format_bytes(data_len)
		byte_counter = 0 + resume_len
		block_size = 1024
		start = time.time()
		while True:
			# Download and write
			before = time.time()
			data_block = data.read(block_size)
			after = time.time()
			if len(data_block) == 0:
				break
			byte_counter += len(data_block)

			# Open file just in time (avoids creating an empty .part
			# file when the connection yields no data at all)
			if stream is None:
				try:
					(stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
					filename = self.undo_temp_name(tmpfilename)
					self.report_destination(filename)
				except (OSError, IOError), err:
					self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
					return False
			try:
				stream.write(data_block)
			except (IOError, OSError), err:
				self.trouble(u'\nERROR: unable to write data: %s' % str(err))
				return False
			# Adapt block size to the observed transfer speed
			block_size = self.best_block_size(after - before, len(data_block))

			# Progress message
			percent_str = self.calc_percent(byte_counter, data_len)
			eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
			speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
			self.report_progress(percent_str, data_len_str, speed_str, eta_str)

			# Apply rate limit
			self.slow_down(start, byte_counter - resume_len)

		stream.close()
		self.report_finish()
		if data_len is not None and byte_counter != data_len:
			raise ContentTooShortError(byte_counter, long(data_len))
		self.try_rename(tmpfilename, filename)
		# Update file modification time from the Last-Modified header
		timestr = data.info().get('last-modified', None)
		if timestr is not None:
			filetime = timeconvert(timestr)
			if filetime is not None:
				try:
					os.utime(filename,(time.time(), filetime))
				except:
					# best effort only: a failed utime is not a download failure
					pass
		return True
770
class InfoExtractor(object):
	"""Base class for all information extractors.

	An information extractor takes a URL and produces, for each video
	it refers to, a dictionary of metadata which is handed over to the
	FileDownloader (which may then download the video, print fields,
	etc.).  Every dictionary must carry these fields:

	id:             Video identifier.
	url:            Final video URL.
	uploader:       Nickname of the video uploader.
	title:          Literal title.
	stitle:         Simplified title.
	ext:            Video filename extension.
	format:         Video format.
	player_url:     SWF Player URL (may be None).

	Two more fields are optional and only consulted by the forced
	printing functions (useful when youtube-dl acts as a backend for a
	video search front end such as youtube2mp3):

	thumbnail:      Full URL to a video thumbnail image.
	description:    One-line video description.

	Concrete extractors override _real_initialize(), _real_extract()
	and the static suitable() method, and are registered with the main
	downloader.
	"""

	# class-level defaults; __init__ shadows them per instance
	_ready = False
	_downloader = None

	def __init__(self, downloader=None):
		"""Create the extractor, optionally attaching a downloader."""
		self._ready = False
		self.set_downloader(downloader)

	def set_downloader(self, downloader):
		"""Attach the FileDownloader this IE reports to."""
		self._downloader = downloader

	@staticmethod
	def suitable(url):
		"""Return True when this IE can handle the given URL."""
		return False

	def initialize(self):
		"""Run one-time setup (authentication, etc.) on first use."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Initialize if needed, then extract info for the URL."""
		self.initialize()
		return self._real_extract(url)

	def _real_initialize(self):
		"""Actual initialization; subclasses override this."""
		pass

	def _real_extract(self, url):
		"""Actual extraction; subclasses override this."""
		pass
841
class YoutubeIE(InfoExtractor):
	"""Information extractor for youtube.com."""

	# Accepts bare video IDs, youtu.be links and the watch/embed/v URL
	# shapes; group 2 captures the video ID.  The trailing (?(1).+)? is
	# a conditional group: trailing text is only allowed when group 1
	# (a real URL prefix) matched.
	_VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
	_LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
	_LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
	_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
	_NETRC_MACHINE = 'youtube'
	# Listed in order of quality
	_available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
	# Maps a YouTube fmt code to a filename extension; formats not
	# listed here fall back to 'flv' in _real_extract().
	_video_extensions = {
		'13': '3gp',
		'17': 'mp4',
		'18': 'mp4',
		'22': 'mp4',
		'37': 'mp4',
		'38': 'video', # You actually don't know if this will be MOV, AVI or whatever
		'43': 'webm',
		'45': 'webm',
	}

	@staticmethod
	def suitable(url):
		"""Return True if the URL matches _VALID_URL."""
		return (re.match(YoutubeIE._VALID_URL, url) is not None)

	def report_lang(self):
		"""Report attempt to set language."""
		self._downloader.to_screen(u'[youtube] Setting language')

	def report_login(self):
		"""Report attempt to log in."""
		self._downloader.to_screen(u'[youtube] Logging in')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_screen(u'[youtube] Confirming age')

	def report_video_webpage_download(self, video_id):
		"""Report attempt to download video webpage."""
		self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

	def report_video_info_webpage_download(self, video_id):
		"""Report attempt to download video info webpage."""
		self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

	def report_information_extraction(self, video_id):
		"""Report attempt to extract video information."""
		self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

	def report_unavailable_format(self, video_id, format):
		"""Report that the requested format is not available."""
		self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

	def report_rtmp_download(self):
		"""Indicate the download will use the RTMP protocol."""
		self._downloader.to_screen(u'[youtube] RTMP download detected')

	def _real_initialize(self):
		"""Force the English site language and, when credentials are
		available (params or .netrc), log in and confirm age.

		Failures are soft: they are printed as warnings/errors and
		initialization simply stops early.
		"""
		if self._downloader is None:
			return

		username = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					username = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		# Set language (the extraction regexes expect English page text)
		request = urllib2.Request(self._LANG_URL)
		try:
			self.report_lang()
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
			return

		# No authentication to be performed
		if username is None:
			return

		# Log in
		login_form = {
				'current_form': 'loginForm',
				'next':		'/',
				'action_login':	'Log In',
				'username':	username,
				'password':	password,
				}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# If the login form is still present, the credentials were rejected
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return

		# Confirm age
		age_form = {
				'next_url':		'/',
				'action_confirm':	'Confirm',
				}
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
		try:
			self.report_age_confirmation()
			age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return

	def _real_extract(self, url):
		"""Extract metadata for the video behind `url` and pass one
		info dictionary per selected format to the downloader via
		process_info().  Errors are reported through trouble() and end
		the extraction early.
		"""
		# Extract video id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = mobj.group(2)

		# Get video webpage
		self.report_video_webpage_download(video_id)
		request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&amp;has_verified=1' % video_id)
		try:
			video_webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
			return

		# Attempt to extract SWF player URL (may legitimately be None)
		mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
		if mobj is not None:
			# un-escape the JavaScript-escaped URL (\/ -> /)
			player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
		else:
			player_url = None

		# Get video info: try several 'el' values until one response
		# carries a 'token' parameter
		self.report_video_info_webpage_download(video_id)
		for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
			video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
					   % (video_id, el_type))
			request = urllib2.Request(video_info_url)
			try:
				video_info_webpage = urllib2.urlopen(request).read()
				video_info = parse_qs(video_info_webpage)
				if 'token' in video_info:
					break
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
				return
		if 'token' not in video_info:
			if 'reason' in video_info:
				self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
			else:
				self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
			return

		# Start extracting information
		self.report_information_extraction(video_id)

		# uploader
		if 'author' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = urllib.unquote_plus(video_info['author'][0])

		# title
		if 'title' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract video title')
			return
		video_title = urllib.unquote_plus(video_info['title'][0])
		video_title = video_title.decode('utf-8')
		video_title = sanitize_title(video_title)

		# simplified title: collapse every run of characters outside
		# ASCII letters/digits into a single underscore
		simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
		simple_title = simple_title.strip(ur'_')

		# thumbnail image
		if 'thumbnail_url' not in video_info:
			self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
			video_thumbnail = ''
		else:	# don't panic if we can't find it
			video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

		# upload date, normalized to YYYYMMDD (u'NA' when not found)
		upload_date = u'NA'
		mobj = re.search(r'id="eow-date".*?>(.*?)</span>', video_webpage, re.DOTALL)
		if mobj is not None:
			upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
			format_expressions = ['%d %B %Y', '%B %d %Y']
			for expression in format_expressions:
				try:
					upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
				except:
					# once a pattern succeeded, the normalized value no
					# longer parses with the remaining patterns; those
					# failures are deliberately ignored
					pass

		# description (only fetched when it will actually be printed)
		video_description = 'No description available.'
		if self._downloader.params.get('forcedescription', False):
			mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
			if mobj is not None:
				video_description = mobj.group(1)

		# token
		# NOTE(review): video_token is not referenced anywhere below in
		# this class; it looks vestigial — confirm before removing
		video_token = urllib.unquote_plus(video_info['token'][0])

		# Decide which formats to download
		req_format = self._downloader.params.get('format', None)

		if 'fmt_url_map' in video_info:
			# fmt_url_map is a comma-separated list of 'fmt|url' pairs
			url_map = dict(tuple(pair.split('|')) for pair in video_info['fmt_url_map'][0].split(','))
			format_limit = self._downloader.params.get('format_limit', None)
			if format_limit is not None and format_limit in self._available_formats:
				# cap quality at format_limit (list is best-first)
				format_list = self._available_formats[self._available_formats.index(format_limit):]
			else:
				format_list = self._available_formats
			existing_formats = [x for x in format_list if x in url_map]
			if len(existing_formats) == 0:
				self._downloader.trouble(u'ERROR: no known formats available for video')
				return
			if req_format is None:
				video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
			elif req_format == '-1':
				video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
			else:
				# Specific format
				if req_format not in url_map:
					self._downloader.trouble(u'ERROR: requested format not available')
					return
				video_url_list = [(req_format, url_map[req_format])] # Specific format

		elif 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
			self.report_rtmp_download()
			video_url_list = [(None, video_info['conn'][0])]

		else:
			self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')
			return

		for format_param, video_real_url in video_url_list:
			# At this point we have a new video
			self._downloader.increment_downloads()

			# Extension
			video_extension = self._video_extensions.get(format_param, 'flv')

			# Find the video URL in fmt_url_map or conn paramters
			try:
				# Process video information
				self._downloader.process_info({
					'id':		video_id.decode('utf-8'),
					'url':		video_real_url.decode('utf-8'),
					'uploader':	video_uploader.decode('utf-8'),
					'upload_date':	upload_date,
					'title':	video_title,
					'stitle':	simple_title,
					'ext':		video_extension.decode('utf-8'),
					'format':	(format_param is None and u'NA' or format_param.decode('utf-8')),
					'thumbnail':	video_thumbnail.decode('utf-8'),
					'description':	video_description.decode('utf-8'),
					'player_url':	player_url,
				})
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download video')
1121
1122
1123 class MetacafeIE(InfoExtractor):
1124         """Information Extractor for metacafe.com."""
1125
1126         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1127         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1128         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1129         _youtube_ie = None
1130
1131         def __init__(self, youtube_ie, downloader=None):
1132                 InfoExtractor.__init__(self, downloader)
1133                 self._youtube_ie = youtube_ie
1134
1135         @staticmethod
1136         def suitable(url):
1137                 return (re.match(MetacafeIE._VALID_URL, url) is not None)
1138
1139         def report_disclaimer(self):
1140                 """Report disclaimer retrieval."""
1141                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1142
1143         def report_age_confirmation(self):
1144                 """Report attempt to confirm age."""
1145                 self._downloader.to_screen(u'[metacafe] Confirming age')
1146
1147         def report_download_webpage(self, video_id):
1148                 """Report webpage download."""
1149                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1150
1151         def report_extraction(self, video_id):
1152                 """Report information extraction."""
1153                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1154
1155         def _real_initialize(self):
1156                 # Retrieve disclaimer
1157                 request = urllib2.Request(self._DISCLAIMER)
1158                 try:
1159                         self.report_disclaimer()
1160                         disclaimer = urllib2.urlopen(request).read()
1161                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1162                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1163                         return
1164
1165                 # Confirm age
1166                 disclaimer_form = {
1167                         'filters': '0',
1168                         'submit': "Continue - I'm over 18",
1169                         }
1170                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1171                 try:
1172                         self.report_age_confirmation()
1173                         disclaimer = urllib2.urlopen(request).read()
1174                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1175                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1176                         return
1177
1178         def _real_extract(self, url):
1179                 # Extract id and simplified title from URL
1180                 mobj = re.match(self._VALID_URL, url)
1181                 if mobj is None:
1182                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1183                         return
1184
1185                 video_id = mobj.group(1)
1186
1187                 # Check if video comes from YouTube
1188                 mobj2 = re.match(r'^yt-(.*)$', video_id)
1189                 if mobj2 is not None:
1190                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1191                         return
1192
1193                 # At this point we have a new video
1194                 self._downloader.increment_downloads()
1195
1196                 simple_title = mobj.group(2).decode('utf-8')
1197
1198                 # Retrieve video webpage to extract further information
1199                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1200                 try:
1201                         self.report_download_webpage(video_id)
1202                         webpage = urllib2.urlopen(request).read()
1203                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1204                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1205                         return
1206
1207                 # Extract URL, uploader and title from webpage
1208                 self.report_extraction(video_id)
1209                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1210                 if mobj is not None:
1211                         mediaURL = urllib.unquote(mobj.group(1))
1212                         video_extension = mediaURL[-3:]
1213
1214                         # Extract gdaKey if available
1215                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1216                         if mobj is None:
1217                                 video_url = mediaURL
1218                         else:
1219                                 gdaKey = mobj.group(1)
1220                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1221                 else:
1222                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1223                         if mobj is None:
1224                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1225                                 return
1226                         vardict = parse_qs(mobj.group(1))
1227                         if 'mediaData' not in vardict:
1228                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1229                                 return
1230                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1231                         if mobj is None:
1232                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1233                                 return
1234                         mediaURL = mobj.group(1).replace('\\/', '/')
1235                         video_extension = mediaURL[-3:]
1236                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1237
1238                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1239                 if mobj is None:
1240                         self._downloader.trouble(u'ERROR: unable to extract title')
1241                         return
1242                 video_title = mobj.group(1).decode('utf-8')
1243                 video_title = sanitize_title(video_title)
1244
1245                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1246                 if mobj is None:
1247                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1248                         return
1249                 video_uploader = mobj.group(1)
1250
1251                 try:
1252                         # Process video information
1253                         self._downloader.process_info({
1254                                 'id':           video_id.decode('utf-8'),
1255                                 'url':          video_url.decode('utf-8'),
1256                                 'uploader':     video_uploader.decode('utf-8'),
1257                                 'upload_date':  u'NA',
1258                                 'title':        video_title,
1259                                 'stitle':       simple_title,
1260                                 'ext':          video_extension.decode('utf-8'),
1261                                 'format':       u'NA',
1262                                 'player_url':   None,
1263                         })
1264                 except UnavailableVideoError:
1265                         self._downloader.trouble(u'\nERROR: unable to download video')
1266
1267
1268 class DailymotionIE(InfoExtractor):
1269         """Information Extractor for Dailymotion"""
1270
1271         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1272
1273         def __init__(self, downloader=None):
1274                 InfoExtractor.__init__(self, downloader)
1275
1276         @staticmethod
1277         def suitable(url):
1278                 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1279
1280         def report_download_webpage(self, video_id):
1281                 """Report webpage download."""
1282                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1283
1284         def report_extraction(self, video_id):
1285                 """Report information extraction."""
1286                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1287
1288         def _real_initialize(self):
1289                 return
1290
1291         def _real_extract(self, url):
1292                 # Extract id and simplified title from URL
1293                 mobj = re.match(self._VALID_URL, url)
1294                 if mobj is None:
1295                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1296                         return
1297
1298                 # At this point we have a new video
1299                 self._downloader.increment_downloads()
1300                 video_id = mobj.group(1)
1301
1302                 simple_title = mobj.group(2).decode('utf-8')
1303                 video_extension = 'flv'
1304
1305                 # Retrieve video webpage to extract further information
1306                 request = urllib2.Request(url)
1307                 try:
1308                         self.report_download_webpage(video_id)
1309                         webpage = urllib2.urlopen(request).read()
1310                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1311                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1312                         return
1313
1314                 # Extract URL, uploader and title from webpage
1315                 self.report_extraction(video_id)
1316                 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1317                 if mobj is None:
1318                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1319                         return
1320                 mediaURL = urllib.unquote(mobj.group(1))
1321
1322                 # if needed add http://www.dailymotion.com/ if relative URL
1323
1324                 video_url = mediaURL
1325
1326                 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1327                 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1328                 if mobj is None:
1329                         self._downloader.trouble(u'ERROR: unable to extract title')
1330                         return
1331                 video_title = mobj.group(1).decode('utf-8')
1332                 video_title = sanitize_title(video_title)
1333
1334                 mobj = re.search(r'(?im)<Attribute name="owner">(.+?)</Attribute>', webpage)
1335                 if mobj is None:
1336                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1337                         return
1338                 video_uploader = mobj.group(1)
1339
1340                 try:
1341                         # Process video information
1342                         self._downloader.process_info({
1343                                 'id':           video_id.decode('utf-8'),
1344                                 'url':          video_url.decode('utf-8'),
1345                                 'uploader':     video_uploader.decode('utf-8'),
1346                                 'upload_date':  u'NA',
1347                                 'title':        video_title,
1348                                 'stitle':       simple_title,
1349                                 'ext':          video_extension.decode('utf-8'),
1350                                 'format':       u'NA',
1351                                 'player_url':   None,
1352                         })
1353                 except UnavailableVideoError:
1354                         self._downloader.trouble(u'\nERROR: unable to download video')
1355
1356 class GoogleIE(InfoExtractor):
1357         """Information extractor for video.google.com."""
1358
1359         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1360
1361         def __init__(self, downloader=None):
1362                 InfoExtractor.__init__(self, downloader)
1363
1364         @staticmethod
1365         def suitable(url):
1366                 return (re.match(GoogleIE._VALID_URL, url) is not None)
1367
1368         def report_download_webpage(self, video_id):
1369                 """Report webpage download."""
1370                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1371
1372         def report_extraction(self, video_id):
1373                 """Report information extraction."""
1374                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1375
1376         def _real_initialize(self):
1377                 return
1378
1379         def _real_extract(self, url):
1380                 # Extract id from URL
1381                 mobj = re.match(self._VALID_URL, url)
1382                 if mobj is None:
1383                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1384                         return
1385
1386                 # At this point we have a new video
1387                 self._downloader.increment_downloads()
1388                 video_id = mobj.group(1)
1389
1390                 video_extension = 'mp4'
1391
1392                 # Retrieve video webpage to extract further information
1393                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1394                 try:
1395                         self.report_download_webpage(video_id)
1396                         webpage = urllib2.urlopen(request).read()
1397                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1398                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1399                         return
1400
1401                 # Extract URL, uploader, and title from webpage
1402                 self.report_extraction(video_id)
1403                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1404                 if mobj is None:
1405                         video_extension = 'flv'
1406                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1407                 if mobj is None:
1408                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1409                         return
1410                 mediaURL = urllib.unquote(mobj.group(1))
1411                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1412                 mediaURL = mediaURL.replace('\\x26', '\x26')
1413
1414                 video_url = mediaURL
1415
1416                 mobj = re.search(r'<title>(.*)</title>', webpage)
1417                 if mobj is None:
1418                         self._downloader.trouble(u'ERROR: unable to extract title')
1419                         return
1420                 video_title = mobj.group(1).decode('utf-8')
1421                 video_title = sanitize_title(video_title)
1422                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1423
1424                 # Extract video description
1425                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1426                 if mobj is None:
1427                         self._downloader.trouble(u'ERROR: unable to extract video description')
1428                         return
1429                 video_description = mobj.group(1).decode('utf-8')
1430                 if not video_description:
1431                         video_description = 'No description available.'
1432
1433                 # Extract video thumbnail
1434                 if self._downloader.params.get('forcethumbnail', False):
1435                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1436                         try:
1437                                 webpage = urllib2.urlopen(request).read()
1438                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1439                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1440                                 return
1441                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1442                         if mobj is None:
1443                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1444                                 return
1445                         video_thumbnail = mobj.group(1)
1446                 else:   # we need something to pass to process_info
1447                         video_thumbnail = ''
1448
1449
1450                 try:
1451                         # Process video information
1452                         self._downloader.process_info({
1453                                 'id':           video_id.decode('utf-8'),
1454                                 'url':          video_url.decode('utf-8'),
1455                                 'uploader':     u'NA',
1456                                 'upload_date':  u'NA',
1457                                 'title':        video_title,
1458                                 'stitle':       simple_title,
1459                                 'ext':          video_extension.decode('utf-8'),
1460                                 'format':       u'NA',
1461                                 'player_url':   None,
1462                         })
1463                 except UnavailableVideoError:
1464                         self._downloader.trouble(u'\nERROR: unable to download video')
1465
1466
1467 class PhotobucketIE(InfoExtractor):
1468         """Information extractor for photobucket.com."""
1469
1470         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1471
1472         def __init__(self, downloader=None):
1473                 InfoExtractor.__init__(self, downloader)
1474
1475         @staticmethod
1476         def suitable(url):
1477                 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1478
1479         def report_download_webpage(self, video_id):
1480                 """Report webpage download."""
1481                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1482
1483         def report_extraction(self, video_id):
1484                 """Report information extraction."""
1485                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1486
1487         def _real_initialize(self):
1488                 return
1489
1490         def _real_extract(self, url):
1491                 # Extract id from URL
1492                 mobj = re.match(self._VALID_URL, url)
1493                 if mobj is None:
1494                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1495                         return
1496
1497                 # At this point we have a new video
1498                 self._downloader.increment_downloads()
1499                 video_id = mobj.group(1)
1500
1501                 video_extension = 'flv'
1502
1503                 # Retrieve video webpage to extract further information
1504                 request = urllib2.Request(url)
1505                 try:
1506                         self.report_download_webpage(video_id)
1507                         webpage = urllib2.urlopen(request).read()
1508                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1509                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1510                         return
1511
1512                 # Extract URL, uploader, and title from webpage
1513                 self.report_extraction(video_id)
1514                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1515                 if mobj is None:
1516                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1517                         return
1518                 mediaURL = urllib.unquote(mobj.group(1))
1519
1520                 video_url = mediaURL
1521
1522                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1523                 if mobj is None:
1524                         self._downloader.trouble(u'ERROR: unable to extract title')
1525                         return
1526                 video_title = mobj.group(1).decode('utf-8')
1527                 video_title = sanitize_title(video_title)
1528                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1529
1530                 video_uploader = mobj.group(2).decode('utf-8')
1531
1532                 try:
1533                         # Process video information
1534                         self._downloader.process_info({
1535                                 'id':           video_id.decode('utf-8'),
1536                                 'url':          video_url.decode('utf-8'),
1537                                 'uploader':     video_uploader,
1538                                 'upload_date':  u'NA',
1539                                 'title':        video_title,
1540                                 'stitle':       simple_title,
1541                                 'ext':          video_extension.decode('utf-8'),
1542                                 'format':       u'NA',
1543                                 'player_url':   None,
1544                         })
1545                 except UnavailableVideoError:
1546                         self._downloader.trouble(u'\nERROR: unable to download video')
1547
1548
1549 class YahooIE(InfoExtractor):
1550         """Information extractor for video.yahoo.com."""
1551
1552         # _VALID_URL matches all Yahoo! Video URLs
1553         # _VPAGE_URL matches only the extractable '/watch/' URLs
1554         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1555         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1556
1557         def __init__(self, downloader=None):
1558                 InfoExtractor.__init__(self, downloader)
1559
1560         @staticmethod
1561         def suitable(url):
1562                 return (re.match(YahooIE._VALID_URL, url) is not None)
1563
1564         def report_download_webpage(self, video_id):
1565                 """Report webpage download."""
1566                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1567
1568         def report_extraction(self, video_id):
1569                 """Report information extraction."""
1570                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1571
1572         def _real_initialize(self):
1573                 return
1574
1575         def _real_extract(self, url, new_video=True):
1576                 # Extract ID from URL
1577                 mobj = re.match(self._VALID_URL, url)
1578                 if mobj is None:
1579                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1580                         return
1581
1582                 # At this point we have a new video
1583                 self._downloader.increment_downloads()
1584                 video_id = mobj.group(2)
1585                 video_extension = 'flv'
1586
1587                 # Rewrite valid but non-extractable URLs as
1588                 # extractable English language /watch/ URLs
1589                 if re.match(self._VPAGE_URL, url) is None:
1590                         request = urllib2.Request(url)
1591                         try:
1592                                 webpage = urllib2.urlopen(request).read()
1593                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1594                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1595                                 return
1596
1597                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1598                         if mobj is None:
1599                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1600                                 return
1601                         yahoo_id = mobj.group(1)
1602
1603                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1604                         if mobj is None:
1605                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1606                                 return
1607                         yahoo_vid = mobj.group(1)
1608
1609                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1610                         return self._real_extract(url, new_video=False)
1611
1612                 # Retrieve video webpage to extract further information
1613                 request = urllib2.Request(url)
1614                 try:
1615                         self.report_download_webpage(video_id)
1616                         webpage = urllib2.urlopen(request).read()
1617                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1618                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1619                         return
1620
1621                 # Extract uploader and title from webpage
1622                 self.report_extraction(video_id)
1623                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1624                 if mobj is None:
1625                         self._downloader.trouble(u'ERROR: unable to extract video title')
1626                         return
1627                 video_title = mobj.group(1).decode('utf-8')
1628                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1629
1630                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1631                 if mobj is None:
1632                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1633                         return
1634                 video_uploader = mobj.group(1).decode('utf-8')
1635
1636                 # Extract video thumbnail
1637                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1638                 if mobj is None:
1639                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1640                         return
1641                 video_thumbnail = mobj.group(1).decode('utf-8')
1642
1643                 # Extract video description
1644                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1645                 if mobj is None:
1646                         self._downloader.trouble(u'ERROR: unable to extract video description')
1647                         return
1648                 video_description = mobj.group(1).decode('utf-8')
1649                 if not video_description: video_description = 'No description available.'
1650
1651                 # Extract video height and width
1652                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1653                 if mobj is None:
1654                         self._downloader.trouble(u'ERROR: unable to extract video height')
1655                         return
1656                 yv_video_height = mobj.group(1)
1657
1658                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1659                 if mobj is None:
1660                         self._downloader.trouble(u'ERROR: unable to extract video width')
1661                         return
1662                 yv_video_width = mobj.group(1)
1663
1664                 # Retrieve video playlist to extract media URL
1665                 # I'm not completely sure what all these options are, but we
1666                 # seem to need most of them, otherwise the server sends a 401.
1667                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1668                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1669                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1670                                           '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1671                                           '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1672                 try:
1673                         self.report_download_webpage(video_id)
1674                         webpage = urllib2.urlopen(request).read()
1675                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1676                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1677                         return
1678
1679                 # Extract media URL from playlist XML
1680                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1681                 if mobj is None:
1682                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1683                         return
1684                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1685                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1686
1687                 try:
1688                         # Process video information
1689                         self._downloader.process_info({
1690                                 'id':           video_id.decode('utf-8'),
1691                                 'url':          video_url,
1692                                 'uploader':     video_uploader,
1693                                 'upload_date':  u'NA',
1694                                 'title':        video_title,
1695                                 'stitle':       simple_title,
1696                                 'ext':          video_extension.decode('utf-8'),
1697                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1698                                 'description':  video_description,
1699                                 'thumbnail':    video_thumbnail,
1700                                 'description':  video_description,
1701                                 'player_url':   None,
1702                         })
1703                 except UnavailableVideoError:
1704                         self._downloader.trouble(u'\nERROR: unable to download video')
1705
1706
1707 class GenericIE(InfoExtractor):
1708         """Generic last-resort information extractor."""
1709
1710         def __init__(self, downloader=None):
1711                 InfoExtractor.__init__(self, downloader)
1712
1713         @staticmethod
1714         def suitable(url):
1715                 return True
1716
1717         def report_download_webpage(self, video_id):
1718                 """Report webpage download."""
1719                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1720                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1721
1722         def report_extraction(self, video_id):
1723                 """Report information extraction."""
1724                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1725
1726         def _real_initialize(self):
1727                 return
1728
1729         def _real_extract(self, url):
1730                 # At this point we have a new video
1731                 self._downloader.increment_downloads()
1732
1733                 video_id = url.split('/')[-1]
1734                 request = urllib2.Request(url)
1735                 try:
1736                         self.report_download_webpage(video_id)
1737                         webpage = urllib2.urlopen(request).read()
1738                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1739                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1740                         return
1741                 except ValueError, err:
1742                         # since this is the last-resort InfoExtractor, if
1743                         # this error is thrown, it'll be thrown here
1744                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1745                         return
1746
1747                 self.report_extraction(video_id)
1748                 # Start with something easy: JW Player in SWFObject
1749                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1750                 if mobj is None:
1751                         # Broaden the search a little bit
1752                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1753                 if mobj is None:
1754                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1755                         return
1756
1757                 # It's possible that one of the regexes
1758                 # matched, but returned an empty group:
1759                 if mobj.group(1) is None:
1760                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1761                         return
1762
1763                 video_url = urllib.unquote(mobj.group(1))
1764                 video_id  = os.path.basename(video_url)
1765
1766                 # here's a fun little line of code for you:
1767                 video_extension = os.path.splitext(video_id)[1][1:]
1768                 video_id        = os.path.splitext(video_id)[0]
1769
1770                 # it's tempting to parse this further, but you would
1771                 # have to take into account all the variations like
1772                 #   Video Title - Site Name
1773                 #   Site Name | Video Title
1774                 #   Video Title - Tagline | Site Name
1775                 # and so on and so forth; it's just not practical
1776                 mobj = re.search(r'<title>(.*)</title>', webpage)
1777                 if mobj is None:
1778                         self._downloader.trouble(u'ERROR: unable to extract title')
1779                         return
1780                 video_title = mobj.group(1).decode('utf-8')
1781                 video_title = sanitize_title(video_title)
1782                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1783
1784                 # video uploader is domain name
1785                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1786                 if mobj is None:
1787                         self._downloader.trouble(u'ERROR: unable to extract title')
1788                         return
1789                 video_uploader = mobj.group(1).decode('utf-8')
1790
1791                 try:
1792                         # Process video information
1793                         self._downloader.process_info({
1794                                 'id':           video_id.decode('utf-8'),
1795                                 'url':          video_url.decode('utf-8'),
1796                                 'uploader':     video_uploader,
1797                                 'upload_date':  u'NA',
1798                                 'title':        video_title,
1799                                 'stitle':       simple_title,
1800                                 'ext':          video_extension.decode('utf-8'),
1801                                 'format':       u'NA',
1802                                 'player_url':   None,
1803                         })
1804                 except UnavailableVideoError, err:
1805                         self._downloader.trouble(u'\nERROR: unable to download video')
1806
1807
1808 class YoutubeSearchIE(InfoExtractor):
1809         """Information Extractor for YouTube search queries."""
1810         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1811         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1812         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1813         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1814         _youtube_ie = None
1815         _max_youtube_results = 1000
1816
1817         def __init__(self, youtube_ie, downloader=None):
1818                 InfoExtractor.__init__(self, downloader)
1819                 self._youtube_ie = youtube_ie
1820
1821         @staticmethod
1822         def suitable(url):
1823                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1824
1825         def report_download_page(self, query, pagenum):
1826                 """Report attempt to download playlist page with given number."""
1827                 query = query.decode(preferredencoding())
1828                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1829
1830         def _real_initialize(self):
1831                 self._youtube_ie.initialize()
1832
1833         def _real_extract(self, query):
1834                 mobj = re.match(self._VALID_QUERY, query)
1835                 if mobj is None:
1836                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1837                         return
1838
1839                 prefix, query = query.split(':')
1840                 prefix = prefix[8:]
1841                 query  = query.encode('utf-8')
1842                 if prefix == '':
1843                         self._download_n_results(query, 1)
1844                         return
1845                 elif prefix == 'all':
1846                         self._download_n_results(query, self._max_youtube_results)
1847                         return
1848                 else:
1849                         try:
1850                                 n = long(prefix)
1851                                 if n <= 0:
1852                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1853                                         return
1854                                 elif n > self._max_youtube_results:
1855                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
1856                                         n = self._max_youtube_results
1857                                 self._download_n_results(query, n)
1858                                 return
1859                         except ValueError: # parsing prefix as integer fails
1860                                 self._download_n_results(query, 1)
1861                                 return
1862
1863         def _download_n_results(self, query, n):
1864                 """Downloads a specified number of results for a query"""
1865
1866                 video_ids = []
1867                 already_seen = set()
1868                 pagenum = 1
1869
1870                 while True:
1871                         self.report_download_page(query, pagenum)
1872                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1873                         request = urllib2.Request(result_url)
1874                         try:
1875                                 page = urllib2.urlopen(request).read()
1876                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1877                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1878                                 return
1879
1880                         # Extract video identifiers
1881                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1882                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1883                                 if video_id not in already_seen:
1884                                         video_ids.append(video_id)
1885                                         already_seen.add(video_id)
1886                                         if len(video_ids) == n:
1887                                                 # Specified n videos reached
1888                                                 for id in video_ids:
1889                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1890                                                 return
1891
1892                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1893                                 for id in video_ids:
1894                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1895                                 return
1896
1897                         pagenum = pagenum + 1
1898
1899 class GoogleSearchIE(InfoExtractor):
1900         """Information Extractor for Google Video search queries."""
1901         _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
1902         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1903         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
1904         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
1905         _google_ie = None
1906         _max_google_results = 1000
1907
1908         def __init__(self, google_ie, downloader=None):
1909                 InfoExtractor.__init__(self, downloader)
1910                 self._google_ie = google_ie
1911
1912         @staticmethod
1913         def suitable(url):
1914                 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
1915
1916         def report_download_page(self, query, pagenum):
1917                 """Report attempt to download playlist page with given number."""
1918                 query = query.decode(preferredencoding())
1919                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1920
1921         def _real_initialize(self):
1922                 self._google_ie.initialize()
1923
1924         def _real_extract(self, query):
1925                 mobj = re.match(self._VALID_QUERY, query)
1926                 if mobj is None:
1927                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1928                         return
1929
1930                 prefix, query = query.split(':')
1931                 prefix = prefix[8:]
1932                 query  = query.encode('utf-8')
1933                 if prefix == '':
1934                         self._download_n_results(query, 1)
1935                         return
1936                 elif prefix == 'all':
1937                         self._download_n_results(query, self._max_google_results)
1938                         return
1939                 else:
1940                         try:
1941                                 n = long(prefix)
1942                                 if n <= 0:
1943                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1944                                         return
1945                                 elif n > self._max_google_results:
1946                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)'  % (self._max_google_results, n))
1947                                         n = self._max_google_results
1948                                 self._download_n_results(query, n)
1949                                 return
1950                         except ValueError: # parsing prefix as integer fails
1951                                 self._download_n_results(query, 1)
1952                                 return
1953
1954         def _download_n_results(self, query, n):
1955                 """Downloads a specified number of results for a query"""
1956
1957                 video_ids = []
1958                 already_seen = set()
1959                 pagenum = 1
1960
1961                 while True:
1962                         self.report_download_page(query, pagenum)
1963                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1964                         request = urllib2.Request(result_url)
1965                         try:
1966                                 page = urllib2.urlopen(request).read()
1967                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1968                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1969                                 return
1970
1971                         # Extract video identifiers
1972                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1973                                 video_id = mobj.group(1)
1974                                 if video_id not in already_seen:
1975                                         video_ids.append(video_id)
1976                                         already_seen.add(video_id)
1977                                         if len(video_ids) == n:
1978                                                 # Specified n videos reached
1979                                                 for id in video_ids:
1980                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1981                                                 return
1982
1983                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1984                                 for id in video_ids:
1985                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1986                                 return
1987
1988                         pagenum = pagenum + 1
1989
1990 class YahooSearchIE(InfoExtractor):
1991         """Information Extractor for Yahoo! Video search queries."""
1992         _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
1993         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1994         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1995         _MORE_PAGES_INDICATOR = r'\s*Next'
1996         _yahoo_ie = None
1997         _max_yahoo_results = 1000
1998
1999         def __init__(self, yahoo_ie, downloader=None):
2000                 InfoExtractor.__init__(self, downloader)
2001                 self._yahoo_ie = yahoo_ie
2002
2003         @staticmethod
2004         def suitable(url):
2005                 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
2006
2007         def report_download_page(self, query, pagenum):
2008                 """Report attempt to download playlist page with given number."""
2009                 query = query.decode(preferredencoding())
2010                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2011
2012         def _real_initialize(self):
2013                 self._yahoo_ie.initialize()
2014
2015         def _real_extract(self, query):
2016                 mobj = re.match(self._VALID_QUERY, query)
2017                 if mobj is None:
2018                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2019                         return
2020
2021                 prefix, query = query.split(':')
2022                 prefix = prefix[8:]
2023                 query  = query.encode('utf-8')
2024                 if prefix == '':
2025                         self._download_n_results(query, 1)
2026                         return
2027                 elif prefix == 'all':
2028                         self._download_n_results(query, self._max_yahoo_results)
2029                         return
2030                 else:
2031                         try:
2032                                 n = long(prefix)
2033                                 if n <= 0:
2034                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2035                                         return
2036                                 elif n > self._max_yahoo_results:
2037                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)'  % (self._max_yahoo_results, n))
2038                                         n = self._max_yahoo_results
2039                                 self._download_n_results(query, n)
2040                                 return
2041                         except ValueError: # parsing prefix as integer fails
2042                                 self._download_n_results(query, 1)
2043                                 return
2044
2045         def _download_n_results(self, query, n):
2046                 """Downloads a specified number of results for a query"""
2047
2048                 video_ids = []
2049                 already_seen = set()
2050                 pagenum = 1
2051
2052                 while True:
2053                         self.report_download_page(query, pagenum)
2054                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2055                         request = urllib2.Request(result_url)
2056                         try:
2057                                 page = urllib2.urlopen(request).read()
2058                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2059                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2060                                 return
2061
2062                         # Extract video identifiers
2063                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2064                                 video_id = mobj.group(1)
2065                                 if video_id not in already_seen:
2066                                         video_ids.append(video_id)
2067                                         already_seen.add(video_id)
2068                                         if len(video_ids) == n:
2069                                                 # Specified n videos reached
2070                                                 for id in video_ids:
2071                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2072                                                 return
2073
2074                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2075                                 for id in video_ids:
2076                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2077                                 return
2078
2079                         pagenum = pagenum + 1
2080
2081 class YoutubePlaylistIE(InfoExtractor):
2082         """Information Extractor for YouTube playlists."""
2083
2084         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/|p/)([^&]+).*'
2085         _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
2086         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2087         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2088         _youtube_ie = None
2089
2090         def __init__(self, youtube_ie, downloader=None):
2091                 InfoExtractor.__init__(self, downloader)
2092                 self._youtube_ie = youtube_ie
2093
2094         @staticmethod
2095         def suitable(url):
2096                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
2097
2098         def report_download_page(self, playlist_id, pagenum):
2099                 """Report attempt to download playlist page with given number."""
2100                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2101
2102         def _real_initialize(self):
2103                 self._youtube_ie.initialize()
2104
2105         def _real_extract(self, url):
2106                 # Extract playlist id
2107                 mobj = re.match(self._VALID_URL, url)
2108                 if mobj is None:
2109                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2110                         return
2111
2112                 # Download playlist pages
2113                 playlist_id = mobj.group(1)
2114                 video_ids = []
2115                 pagenum = 1
2116
2117                 while True:
2118                         self.report_download_page(playlist_id, pagenum)
2119                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum))
2120                         try:
2121                                 page = urllib2.urlopen(request).read()
2122                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2123                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2124                                 return
2125
2126                         # Extract video identifiers
2127                         ids_in_page = []
2128                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2129                                 if mobj.group(1) not in ids_in_page:
2130                                         ids_in_page.append(mobj.group(1))
2131                         video_ids.extend(ids_in_page)
2132
2133                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2134                                 break
2135                         pagenum = pagenum + 1
2136
2137                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2138                 playlistend = self._downloader.params.get('playlistend', -1)
2139                 video_ids = video_ids[playliststart:playlistend]
2140
2141                 for id in video_ids:
2142                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2143                 return
2144
2145 class YoutubeUserIE(InfoExtractor):
2146         """Information Extractor for YouTube users."""
2147
2148         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
2149         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2150         _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
2151         _youtube_ie = None
2152
2153         def __init__(self, youtube_ie, downloader=None):
2154                 InfoExtractor.__init__(self, downloader)
2155                 self._youtube_ie = youtube_ie
2156
2157         @staticmethod
2158         def suitable(url):
2159                 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
2160
2161         def report_download_page(self, username):
2162                 """Report attempt to download user page."""
2163                 self._downloader.to_screen(u'[youtube] user %s: Downloading page ' % (username))
2164
2165         def _real_initialize(self):
2166                 self._youtube_ie.initialize()
2167
2168         def _real_extract(self, url):
2169                 # Extract username
2170                 mobj = re.match(self._VALID_URL, url)
2171                 if mobj is None:
2172                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2173                         return
2174
2175                 # Download user page
2176                 username = mobj.group(1)
2177                 video_ids = []
2178                 pagenum = 1
2179
2180                 self.report_download_page(username)
2181                 request = urllib2.Request(self._TEMPLATE_URL % (username))
2182                 try:
2183                         page = urllib2.urlopen(request).read()
2184                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2185                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2186                         return
2187
2188                 # Extract video identifiers
2189                 ids_in_page = []
2190
2191                 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2192                         if mobj.group(1) not in ids_in_page:
2193                                 ids_in_page.append(mobj.group(1))
2194                 video_ids.extend(ids_in_page)
2195
2196                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2197                 playlistend = self._downloader.params.get('playlistend', -1)
2198                 video_ids = video_ids[playliststart:playlistend]
2199
2200                 for id in video_ids:
2201                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2202                 return
2203
2204 class DepositFilesIE(InfoExtractor):
2205         """Information extractor for depositfiles.com"""
2206
2207         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2208
2209         def __init__(self, downloader=None):
2210                 InfoExtractor.__init__(self, downloader)
2211
2212         @staticmethod
2213         def suitable(url):
2214                 return (re.match(DepositFilesIE._VALID_URL, url) is not None)
2215
2216         def report_download_webpage(self, file_id):
2217                 """Report webpage download."""
2218                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2219
2220         def report_extraction(self, file_id):
2221                 """Report information extraction."""
2222                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2223
2224         def _real_initialize(self):
2225                 return
2226
2227         def _real_extract(self, url):
2228                 # At this point we have a new file
2229                 self._downloader.increment_downloads()
2230
2231                 file_id = url.split('/')[-1]
2232                 # Rebuild url in english locale
2233                 url = 'http://depositfiles.com/en/files/' + file_id
2234
2235                 # Retrieve file webpage with 'Free download' button pressed
2236                 free_download_indication = { 'gateway_result' : '1' }
2237                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2238                 try:
2239                         self.report_download_webpage(file_id)
2240                         webpage = urllib2.urlopen(request).read()
2241                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2242                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2243                         return
2244
2245                 # Search for the real file URL
2246                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2247                 if (mobj is None) or (mobj.group(1) is None):
2248                         # Try to figure out reason of the error.
2249                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2250                         if (mobj is not None) and (mobj.group(1) is not None):
2251                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2252                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2253                         else:
2254                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2255                         return
2256
2257                 file_url = mobj.group(1)
2258                 file_extension = os.path.splitext(file_url)[1][1:]
2259
2260                 # Search for file title
2261                 mobj = re.search(r'<b title="(.*?)">', webpage)
2262                 if mobj is None:
2263                         self._downloader.trouble(u'ERROR: unable to extract title')
2264                         return
2265                 file_title = mobj.group(1).decode('utf-8')
2266
2267                 try:
2268                         # Process file information
2269                         self._downloader.process_info({
2270                                 'id':           file_id.decode('utf-8'),
2271                                 'url':          file_url.decode('utf-8'),
2272                                 'uploader':     u'NA',
2273                                 'upload_date':  u'NA',
2274                                 'title':        file_title,
2275                                 'stitle':       file_title,
2276                                 'ext':          file_extension.decode('utf-8'),
2277                                 'format':       u'NA',
2278                                 'player_url':   None,
2279                         })
2280                 except UnavailableVideoError, err:
2281                         self._downloader.trouble(u'ERROR: unable to download file')
2282
class PostProcessor(object):
	"""Base class for postprocessing steps.

	A PostProcessor is registered on a downloader through the
	downloader's add_post_processor() method. After each successful
	download, the downloader walks its chain of PostProcessors,
	feeding the first one an initial information dictionary and each
	subsequent one the value returned by its predecessor.

	Processing stops as soon as a PostProcessor returns None, or when
	the end of the chain is reached.

	PostProcessor objects follow a "mutual registration" process
	similar to InfoExtractor objects.
	"""

	# Downloader this PP is attached to (set via __init__ or set_downloader).
	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Sets the downloader for this PP."""
		self._downloader = downloader

	def run(self, information):
		"""Run the PostProcessor.

		The "information" argument is a dictionary like the ones
		composed by InfoExtractors, with one extra field, "filepath",
		pointing at the downloaded file.

		Returning None stops the postprocessing chain; returning an
		information dictionary (possibly the received one with some
		fields changed) passes it on to the next postprocessor in the
		chain.

		Implementations may also raise PostProcessingError, which the
		calling downloader takes into account.
		"""
		# Default behavior: pass the information through untouched.
		return information
2328
2329 ### MAIN PROGRAM ###
2330 if __name__ == '__main__':
2331         try:
2332                 # Modules needed only when running the main program
2333                 import getpass
2334                 import optparse
2335
2336                 # Function to update the program file with the latest version from the repository.
2337                 def update_self(downloader, filename):
2338                         # Note: downloader only used for options
2339                         if not os.access(filename, os.W_OK):
2340                                 sys.exit('ERROR: no write permissions on %s' % filename)
2341
2342                         downloader.to_screen('Updating to latest stable version...')
2343                         try:
2344                                 latest_url = 'http://github.com/rg3/youtube-dl/raw/master/LATEST_VERSION'
2345                                 latest_version = urllib.urlopen(latest_url).read().strip()
2346                                 prog_url = 'http://github.com/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
2347                                 newcontent = urllib.urlopen(prog_url).read()
2348                         except (IOError, OSError), err:
2349                                 sys.exit('ERROR: unable to download latest version')
2350                         try:
2351                                 stream = open(filename, 'w')
2352                                 stream.write(newcontent)
2353                                 stream.close()
2354                         except (IOError, OSError), err:
2355                                 sys.exit('ERROR: unable to overwrite current version')
2356                         downloader.to_screen('Updated to version %s' % latest_version)
2357
2358                 # Parse command line
2359                 parser = optparse.OptionParser(
2360                         usage='Usage: %prog [options] url...',
2361                         version='2010.12.09',
2362                         conflict_handler='resolve',
2363                 )
2364
2365                 parser.add_option('-h', '--help',
2366                                 action='help', help='print this help text and exit')
2367                 parser.add_option('-v', '--version',
2368                                 action='version', help='print program version and exit')
2369                 parser.add_option('-U', '--update',
2370                                 action='store_true', dest='update_self', help='update this program to latest stable version')
2371                 parser.add_option('-i', '--ignore-errors',
2372                                 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
2373                 parser.add_option('-r', '--rate-limit',
2374                                 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
2375                 parser.add_option('-R', '--retries',
2376                                 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
		# Remaining top-level options: playlist slicing and user-agent dumping.
		parser.add_option('--playlist-start',
				dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
		parser.add_option('--playlist-end',
				dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
		parser.add_option('--dump-user-agent',
				action='store_true', dest='dump_user_agent', help='display the current browser identification', default=False)

		# Login credentials: explicit -u/-p, or -n to look them up in ~/.netrc.
		authentication = optparse.OptionGroup(parser, 'Authentication Options')
		authentication.add_option('-u', '--username',
				dest='username', metavar='USERNAME', help='account username')
		authentication.add_option('-p', '--password',
				dest='password', metavar='PASSWORD', help='account password')
		authentication.add_option('-n', '--netrc',
				action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
		parser.add_option_group(authentication)

		# Format selection. --all-formats stores the sentinel '-1' into the
		# same dest as -f; that sentinel also changes the default output
		# template chosen further down.
		video_format = optparse.OptionGroup(parser, 'Video Format Options')
		video_format.add_option('-f', '--format',
				action='store', dest='format', metavar='FORMAT', help='video format code')
		video_format.add_option('--all-formats',
				action='store_const', dest='format', help='download all available video formats', const='-1')
		video_format.add_option('--max-quality',
				action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
		parser.add_option_group(video_format)

		# Verbosity / simulation. Each --get-* flag implies quiet simulation
		# (the implication itself is applied when FileDownloader is built).
		verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
		verbosity.add_option('-q', '--quiet',
				action='store_true', dest='quiet', help='activates quiet mode', default=False)
		verbosity.add_option('-s', '--simulate',
				action='store_true', dest='simulate', help='do not download video', default=False)
		verbosity.add_option('-g', '--get-url',
				action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
		verbosity.add_option('-e', '--get-title',
				action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
		verbosity.add_option('--get-thumbnail',
				action='store_true', dest='getthumbnail', help='simulate, quiet but print thumbnail URL', default=False)
		verbosity.add_option('--get-description',
				action='store_true', dest='getdescription', help='simulate, quiet but print video description', default=False)
		verbosity.add_option('--get-filename',
				action='store_true', dest='getfilename', help='simulate, quiet but print output filename', default=False)
		verbosity.add_option('--no-progress',
				action='store_true', dest='noprogress', help='do not print progress bar', default=False)
		verbosity.add_option('--console-title',
				action='store_true', dest='consoletitle', help='display progress in console titlebar', default=False)
		parser.add_option_group(verbosity)

		# Filesystem behaviour: output naming, batch input, overwrite and
		# resume policy, cookie persistence, .part file usage.
		filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
		filesystem.add_option('-t', '--title',
				action='store_true', dest='usetitle', help='use title in file name', default=False)
		filesystem.add_option('-l', '--literal',
				action='store_true', dest='useliteral', help='use literal title in file name', default=False)
		filesystem.add_option('-A', '--auto-number',
				action='store_true', dest='autonumber', help='number downloaded files starting from 00000', default=False)
		filesystem.add_option('-o', '--output',
				dest='outtmpl', metavar='TEMPLATE', help='output filename template')
		filesystem.add_option('-a', '--batch-file',
				dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
		filesystem.add_option('-w', '--no-overwrites',
				action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
		filesystem.add_option('-c', '--continue',
				action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
		filesystem.add_option('--cookies',
				dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
		filesystem.add_option('--no-part',
				action='store_true', dest='nopart', help='do not use .part files', default=False)
		parser.add_option_group(filesystem)
2443
		(opts, args) = parser.parse_args()

		# Open appropriate CookieJar: an in-memory jar by default, or a
		# Mozilla-format jar backed by the --cookies file. The file is
		# loaded only if it already exists and is readable, so a fresh
		# path is valid and will simply be created on save.
		if opts.cookiefile is None:
			jar = cookielib.CookieJar()
		else:
			try:
				jar = cookielib.MozillaCookieJar(opts.cookiefile)
				if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
					jar.load()
			except (IOError, OSError), err:
				sys.exit(u'ERROR: unable to open cookie file')

		# Dump user agent and exit immediately if --dump-user-agent was given.
		if opts.dump_user_agent:
			print std_headers['User-Agent']
			sys.exit(0)

		# General configuration: install a global urllib2 opener that routes
		# all requests through the cookie jar, the environment proxy settings
		# and the custom YoutubeDLHandler.
		cookie_processor = urllib2.HTTPCookieProcessor(jar)
		urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler()))
		socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

		# Batch file verification: read URLs from --batch-file ('-' means
		# stdin), stripping whitespace and dropping empty lines and lines
		# starting with '#', '/' or ';' (comments).
		batchurls = []
		if opts.batchfile is not None:
			try:
				if opts.batchfile == '-':
					batchfd = sys.stdin
				else:
					batchfd = open(opts.batchfile, 'r')
				batchurls = batchfd.readlines()
				batchurls = [x.strip() for x in batchurls]
				batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
			except IOError:
				sys.exit(u'ERROR: batch file could not be read')
		# Batch-file URLs come first, then positional command-line URLs.
		all_urls = batchurls + args
2481
		# Conflicting, missing and erroneous options: each failure goes
		# through parser.error(), which prints usage and exits.
		if opts.usenetrc and (opts.username is not None or opts.password is not None):
			parser.error(u'using .netrc conflicts with giving username/password')
		if opts.password is not None and opts.username is None:
			parser.error(u'account username missing')
		if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
			parser.error(u'using output template conflicts with using title, literal title or auto number')
		if opts.usetitle and opts.useliteral:
			parser.error(u'using title conflicts with using literal title')
		if opts.username is not None and opts.password is None:
			# Username without password: prompt interactively.
			# NOTE(review): getpass is not among the top-of-file imports;
			# it must be imported earlier in this __main__ section -- confirm.
			opts.password = getpass.getpass(u'Type account password and press return:')
		if opts.ratelimit is not None:
			# FileDownloader.parse_bytes returns None for unparseable values.
			numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
			if numeric_limit is None:
				parser.error(u'invalid rate limit specified')
			opts.ratelimit = numeric_limit
		if opts.retries is not None:
			try:
				opts.retries = long(opts.retries)
			except (TypeError, ValueError), err:
				parser.error(u'invalid retry count specified')
		# --playlist-start must be a positive integer (1-based).
		try:
			opts.playliststart = long(opts.playliststart)
			if opts.playliststart <= 0:
				raise ValueError
		except (TypeError, ValueError), err:
			parser.error(u'invalid playlist start number specified')
		# --playlist-end is either the default -1 (meaning "last video")
		# or a positive integer not smaller than the start index.
		try:
			opts.playlistend = long(opts.playlistend)
			if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
				raise ValueError
		except (TypeError, ValueError), err:
			parser.error(u'invalid playlist end number specified')
2515
		# Information extractors. Several IEs delegate the per-video work
		# to another IE received at construction time: the YouTube
		# playlist/user/search extractors hand each video to youtube_ie,
		# and the Google/Yahoo search extractors do likewise.
		youtube_ie = YoutubeIE()
		metacafe_ie = MetacafeIE(youtube_ie)
		dailymotion_ie = DailymotionIE()
		youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
		youtube_user_ie = YoutubeUserIE(youtube_ie)
		youtube_search_ie = YoutubeSearchIE(youtube_ie)
		google_ie = GoogleIE()
		google_search_ie = GoogleSearchIE(google_ie)
		photobucket_ie = PhotobucketIE()
		yahoo_ie = YahooIE()
		yahoo_search_ie = YahooSearchIE(yahoo_ie)
		deposit_files_ie = DepositFilesIE()
		generic_ie = GenericIE()
2530
		# File downloader: translate the parsed options into the parameter
		# dict FileDownloader expects.
		fd = FileDownloader({
			'usenetrc': opts.usenetrc,
			'username': opts.username,
			'password': opts.password,
			# Any --get-* flag forces quiet mode...
			'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
			'forceurl': opts.geturl,
			'forcetitle': opts.gettitle,
			'forcethumbnail': opts.getthumbnail,
			'forcedescription': opts.getdescription,
			'forcefilename': opts.getfilename,
			# ...and also implies simulation (no actual download).
			'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
			'format': opts.format,
			'format_limit': opts.format_limit,
			# Output template: an explicit -o wins (decoded from the
			# terminal's preferred encoding); otherwise the first true
			# clause of this and/or cascade selects a default pattern.
			# '--all-formats' (format == '-1') adds %(format)s to the name
			# so the per-format files do not overwrite each other.
			'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
				or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
				or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
				or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
				or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
				or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
				or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
				or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
				or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
				or u'%(id)s.%(ext)s'),
			'ignoreerrors': opts.ignoreerrors,
			'ratelimit': opts.ratelimit,
			'nooverwrites': opts.nooverwrites,
			'retries': opts.retries,
			'continuedl': opts.continue_dl,
			'noprogress': opts.noprogress,
			'playliststart': opts.playliststart,
			'playlistend': opts.playlistend,
			# An output template of '-' writes the video to stdout, so log
			# messages are redirected to stderr in that case.
			'logtostderr': opts.outtmpl == '-',
			'consoletitle': opts.consoletitle,
			'nopart': opts.nopart,
			})
		# Register extractors. NOTE(review): registration order appears to
		# determine matching priority -- confirm before reordering.
		fd.add_info_extractor(youtube_search_ie)
		fd.add_info_extractor(youtube_pl_ie)
		fd.add_info_extractor(youtube_user_ie)
		fd.add_info_extractor(metacafe_ie)
		fd.add_info_extractor(dailymotion_ie)
		fd.add_info_extractor(youtube_ie)
		fd.add_info_extractor(google_ie)
		fd.add_info_extractor(google_search_ie)
		fd.add_info_extractor(photobucket_ie)
		fd.add_info_extractor(yahoo_ie)
		fd.add_info_extractor(yahoo_search_ie)
		fd.add_info_extractor(deposit_files_ie)

		# This must come last since it's the
		# fallback if none of the others work
		fd.add_info_extractor(generic_ie)
2583
		# Update version: --update-self passes the downloader and the path
		# of the running script to update_self (defined elsewhere in this file).
		if opts.update_self:
			update_self(fd, sys.argv[0])

		# Maybe do nothing: with no URLs at all, a plain run is a usage
		# error, but a pure self-update run exits successfully.
		if len(all_urls) < 1:
			if not opts.update_self:
				parser.error(u'you must provide at least one URL')
			else:
				sys.exit()
		# fd.download returns the process exit code (used below).
		retcode = fd.download(all_urls)

		# Dump cookie jar if requested; only the --cookies case created a
		# MozillaCookieJar with a backing file to save to.
		if opts.cookiefile is not None:
			try:
				jar.save()
			except (IOError, OSError), err:
				sys.exit(u'ERROR: unable to save cookie jar')

		sys.exit(retcode)

	# Handlers for the try: opened at the top of this __main__ section
	# (before this excerpt): map known failures to clean exit messages.
	except DownloadError:
		sys.exit(1)
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')