Python 3 version of HTMLParser

[youtube-dl] / youtube_dl / FileDownloader.py
diff --git a/youtube_dl/FileDownloader.py b/youtube_dl/FileDownloader.py

index ed5a79f13d1b12d45c03f4f676282adf41d51714..868023db9f2d1233ed96ac680a8b0966336f27ef 100644 (file)
--- a/youtube_dl/FileDownloader.py
+++ b/youtube_dl/FileDownloader.py
@@ -9,7 +9,6 @@ import socket
  import subprocess
  import sys
  import time
-import urllib2
  
  if os.name == 'nt':
         import ctypes
@@ -44,37 +43,40 @@ class FileDownloader(object):
  
         Available options:
  
-       username:         Username for authentication purposes.
-       password:         Password for authentication purposes.
-       usenetrc:         Use netrc for authentication instead.
-       quiet:            Do not print messages to stdout.
-       forceurl:         Force printing final URL.
-       forcetitle:       Force printing title.
-       forcethumbnail:   Force printing thumbnail URL.
-       forcedescription: Force printing description.
-       forcefilename:    Force printing final filename.
-       simulate:         Do not download the video files.
-       format:           Video format code.
-       format_limit:     Highest quality format to try.
-       outtmpl:          Template for output names.
-       ignoreerrors:     Do not stop on download errors.
-       ratelimit:        Download speed limit, in bytes/sec.
-       nooverwrites:     Prevent overwriting files.
-       retries:          Number of times to retry for HTTP error 5xx
-       continuedl:       Try to continue downloads if possible.
-       noprogress:       Do not print the progress bar.
-       playliststart:    Playlist item to start at.
-       playlistend:      Playlist item to end at.
-       matchtitle:       Download only matching titles.
-       rejecttitle:      Reject downloads for matching titles.
-       logtostderr:      Log messages to stderr instead of stdout.
-       consoletitle:     Display progress in console window's titlebar.
-       nopart:           Do not use temporary .part files.
-       updatetime:       Use the Last-modified header to set output file timestamps.
-       writedescription: Write the video description to a .description file
-       writeinfojson:    Write the video description to a .info.json file
-       writesubtitles:   Write the video subtitles to a .srt file
-       subtitleslang:    Language of the subtitles to download
+       username:          Username for authentication purposes.
+       password:          Password for authentication purposes.
+       usenetrc:          Use netrc for authentication instead.
+       quiet:             Do not print messages to stdout.
+       forceurl:          Force printing final URL.
+       forcetitle:        Force printing title.
+       forcethumbnail:    Force printing thumbnail URL.
+       forcedescription:  Force printing description.
+       forcefilename:     Force printing final filename.
+       simulate:          Do not download the video files.
+       format:            Video format code.
+       format_limit:      Highest quality format to try.
+       outtmpl:           Template for output names.
+       restrictfilenames: Do not allow "&" and spaces in file names
+       ignoreerrors:      Do not stop on download errors.
+       ratelimit:         Download speed limit, in bytes/sec.
+       nooverwrites:      Prevent overwriting files.
+       retries:           Number of times to retry for HTTP error 5xx
+       buffersize:        Size of download buffer in bytes.
+       noresizebuffer:    Do not automatically resize the download buffer.
+       continuedl:        Try to continue downloads if possible.
+       noprogress:        Do not print the progress bar.
+       playliststart:     Playlist item to start at.
+       playlistend:       Playlist item to end at.
+       matchtitle:        Download only matching titles.
+       rejecttitle:       Reject downloads for matching titles.
+       logtostderr:       Log messages to stderr instead of stdout.
+       consoletitle:      Display progress in console window's titlebar.
+       nopart:            Do not use temporary .part files.
+       updatetime:        Use the Last-modified header to set output file timestamps.
+       writedescription:  Write the video description to a .description file
+       writeinfojson:     Write the video description to a .info.json file
+       writesubtitles:    Write the video subtitles to a .srt file
+       subtitleslang:     Language of the subtitles to download
         """
  
         params = None
@@ -93,6 +95,9 @@ class FileDownloader(object):
                 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
                 self.params = params
  
+               if '%(stitle)s' in self.params['outtmpl']:
+                       self.to_stderr(u'WARNING: %(stitle)s is deprecated. Use the %(title)s and the --restrict-filenames flag(which also secures %(uploader)s et al) instead.')
+
         @staticmethod
         def format_bytes(bytes):
                 if bytes is None:
@@ -102,7 +107,7 @@ class FileDownloader(object):
                 if bytes == 0.0:
                         exponent = 0
                 else:
-                       exponent = long(math.log(bytes, 1024.0))
+                       exponent = int(math.log(bytes, 1024.0))
                 suffix = 'bkMGTPEZY'[exponent]
                 converted = float(bytes) / float(1024 ** exponent)
                 return '%.2f%s' % (converted, suffix)
@@ -121,7 +126,7 @@ class FileDownloader(object):
                 if current == 0 or dif < 0.001: # One millisecond
                         return '--:--'
                 rate = float(current) / dif
-               eta = long((float(total) - float(current)) / rate)
+               eta = int((float(total) - float(current)) / rate)
                 (eta_mins, eta_secs) = divmod(eta, 60)
                 if eta_mins > 99:
                         return '--:--'
@@ -139,23 +144,23 @@ class FileDownloader(object):
                 new_min = max(bytes / 2.0, 1.0)
                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
                 if elapsed_time < 0.001:
-                       return long(new_max)
+                       return int(new_max)
                 rate = bytes / elapsed_time
                 if rate > new_max:
-                       return long(new_max)
+                       return int(new_max)
                 if rate < new_min:
-                       return long(new_min)
-               return long(rate)
+                       return int(new_min)
+               return int(rate)
  
         @staticmethod
         def parse_bytes(bytestr):
-               """Parse a string indicating a byte quantity into a long integer."""
+               """Parse a string indicating a byte quantity into an integer."""
                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
                 if matchobj is None:
                         return None
                 number = float(matchobj.group(1))
                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
-               return long(round(number * multiplier))
+               return int(round(number * multiplier))
  
         def add_info_extractor(self, ie):
                 """Add an InfoExtractor object to the end of the list."""
@@ -173,14 +178,15 @@ class FileDownloader(object):
                 if not self.params.get('quiet', False):
                         terminator = [u'\n', u''][skip_eol]
                         output = message + terminator
-                       if 'b' not in self._screen_file.mode or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr
+                       if 'b' in getattr(self._screen_file, 'mode', '') or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr
                                 output = output.encode(preferredencoding(), 'ignore')
                         self._screen_file.write(output)
                         self._screen_file.flush()
  
         def to_stderr(self, message):
                 """Print message to stderr."""
-               print >>sys.stderr, message.encode(preferredencoding())
+               assert type(message) == type(u'')
+               sys.stderr.write((message + u'\n').encode(preferredencoding()))
  
         def to_cons_title(self, message):
                 """Set console/terminal window title to message."""
@@ -195,7 +201,7 @@ class FileDownloader(object):
  
         def fixed_template(self):
                 """Checks if the output template is fixed."""
-               return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
+               return (re.search(u'(?u)%\\(.+?\\)s', self.params['outtmpl']) is None)
  
         def trouble(self, message=None):
                 """Determine action to take when a download problem appears.
@@ -240,7 +246,7 @@ class FileDownloader(object):
                         if old_filename == new_filename:
                                 return
                         os.rename(encodeFilename(old_filename), encodeFilename(new_filename))
-               except (IOError, OSError), err:
+               except (IOError, OSError) as err:
                         self.trouble(u'ERROR: unable to rename file')
  
         def try_utime(self, filename, last_modified_hdr):
@@ -298,7 +304,7 @@ class FileDownloader(object):
                 """Report file has already been fully downloaded."""
                 try:
                         self.to_screen(u'[download] %s has already been downloaded' % file_name)
-               except (UnicodeEncodeError), err:
+               except (UnicodeEncodeError) as err:
                         self.to_screen(u'[download] The file has already been downloaded')
  
         def report_unable_to_resume(self):
@@ -320,11 +326,16 @@ class FileDownloader(object):
                 """Generate the output filename."""
                 try:
                         template_dict = dict(info_dict)
-                       template_dict['epoch'] = unicode(long(time.time()))
-                       template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
+
+                       template_dict['epoch'] = int(time.time())
+                       template_dict['autonumber'] = u'%05d' % self._num_downloads
+
+                       template_dict = dict((key, u'NA' if val is None else val) for key, val in template_dict.items())
+                       template_dict = dict((k, sanitize_filename(compat_str(v), self.params.get('restrictfilenames'))) for k,v in template_dict.items())
+
                         filename = self.params['outtmpl'] % template_dict
                         return filename
-               except (ValueError, KeyError), err:
+               except (ValueError, KeyError) as err:
                         self.trouble(u'ERROR: invalid system charset or erroneous output template')
                         return None
  
@@ -333,17 +344,25 @@ class FileDownloader(object):
  
                 title = info_dict['title']
                 matchtitle = self.params.get('matchtitle', False)
-               if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
-                       return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
+               if matchtitle:
+                       matchtitle = matchtitle.decode('utf8')
+                       if not re.search(matchtitle, title, re.IGNORECASE):
+                               return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
                 rejecttitle = self.params.get('rejecttitle', False)
-               if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
-                       return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
+               if rejecttitle:
+                       rejecttitle = rejecttitle.decode('utf8')
+                       if re.search(rejecttitle, title, re.IGNORECASE):
+                               return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
                 return None
  
         def process_info(self, info_dict):
                 """Process a single dictionary returned by an InfoExtractor."""
  
-               info_dict['stitle'] = sanitize_filename(info_dict['title'])
+               # Keep for backwards compatibility
+               info_dict['stitle'] = info_dict['title']
+
+               if not 'format' in info_dict:
+                       info_dict['format'] = info_dict['ext']
  
                 reason = self._match_entry(info_dict)
                 if reason is not None:
@@ -359,17 +378,17 @@ class FileDownloader(object):
  
                 # Forced printings
                 if self.params.get('forcetitle', False):
-                       print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
+                       print(info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace'))
                 if self.params.get('forceurl', False):
-                       print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
+                       print(info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace'))
                 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
-                       print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
+                       print(info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace'))
                 if self.params.get('forcedescription', False) and 'description' in info_dict:
-                       print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
+                       print(info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace'))
                 if self.params.get('forcefilename', False) and filename is not None:
-                       print filename.encode(preferredencoding(), 'xmlcharrefreplace')
+                       print(filename.encode(preferredencoding(), 'xmlcharrefreplace'))
                 if self.params.get('forceformat', False):
-                       print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')
+                       print(info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace'))
  
                 # Do nothing else if in simulate mode
                 if self.params.get('simulate', False):
@@ -382,8 +401,8 @@ class FileDownloader(object):
                         dn = os.path.dirname(encodeFilename(filename))
                         if dn != '' and not os.path.exists(dn): # dn is already encoded
                                 os.makedirs(dn)
-               except (OSError, IOError), err:
-                       self.trouble(u'ERROR: unable to create directory ' + unicode(err))
+               except (OSError, IOError) as err:
+                       self.trouble(u'ERROR: unable to create directory ' + compat_str(err))
                         return
  
                 if self.params.get('writedescription', False):
@@ -439,19 +458,19 @@ class FileDownloader(object):
                         else:
                                 try:
                                         success = self._do_download(filename, info_dict)
-                               except (OSError, IOError), err:
+                               except (OSError, IOError) as err:
                                         raise UnavailableVideoError
-                               except (urllib2.URLError, httplib.HTTPException, socket.error), err:
+                               except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
                                         self.trouble(u'ERROR: unable to download video data: %s' % str(err))
                                         return
-                               except (ContentTooShortError, ), err:
+                               except (ContentTooShortError, ) as err:
                                         self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
                                         return
  
                         if success:
                                 try:
                                         self.post_process(filename, info_dict)
-                               except (PostProcessingError), err:
+                               except (PostProcessingError) as err:
                                         self.trouble(u'ERROR: postprocessing: %s' % str(err))
                                         return
  
@@ -467,6 +486,11 @@ class FileDownloader(object):
                                 if not ie.suitable(url):
                                         continue
  
+                               # Warn if the _WORKING attribute is False
+                               if not ie.working():
+                                       self.trouble(u'WARNING: the program functionality for this site has been marked as broken, '
+                                                        u'and will probably not work. If you want to go on, use the -i option.')
+
                                 # Suitable InfoExtractor found
                                 suitable_found = True
  
@@ -560,8 +584,8 @@ class FileDownloader(object):
  
                 # Do not include the Accept-Encoding header
                 headers = {'Youtubedl-no-compression': 'True'}
-               basic_request = urllib2.Request(url, None, headers)
-               request = urllib2.Request(url, None, headers)
+               basic_request = compat_urllib_request.Request(url, None, headers)
+               request = compat_urllib_request.Request(url, None, headers)
  
                 # Establish possible resume length
                 if os.path.isfile(encodeFilename(tmpfilename)):
@@ -585,9 +609,9 @@ class FileDownloader(object):
                         try:
                                 if count == 0 and 'urlhandle' in info_dict:
                                         data = info_dict['urlhandle']
-                               data = urllib2.urlopen(request)
+                               data = compat_urllib_request.urlopen(request)
                                 break
-                       except (urllib2.HTTPError, ), err:
+                       except (compat_urllib_error.HTTPError, ) as err:
                                 if (err.code < 500 or err.code >= 600) and err.code != 416:
                                         # Unexpected HTTP error
                                         raise
@@ -595,15 +619,15 @@ class FileDownloader(object):
                                         # Unable to resume (requested range not satisfiable)
                                         try:
                                                 # Open the connection again without the range header
-                                               data = urllib2.urlopen(basic_request)
+                                               data = compat_urllib_request.urlopen(basic_request)
                                                 content_length = data.info()['Content-Length']
-                                       except (urllib2.HTTPError, ), err:
+                                       except (compat_urllib_error.HTTPError, ) as err:
                                                 if err.code < 500 or err.code >= 600:
                                                         raise
                                         else:
                                                 # Examine the reported length
                                                 if (content_length is not None and
-                                                               (resume_len - 100 < long(content_length) < resume_len + 100)):
+                                                               (resume_len - 100 < int(content_length) < resume_len + 100)):
                                                         # The file had already been fully downloaded.
                                                         # Explanation to the above condition: in issue #175 it was revealed that
                                                         # YouTube sometimes adds or removes a few bytes from the end of the file,
@@ -630,10 +654,10 @@ class FileDownloader(object):
  
                 data_len = data.info().get('Content-length', None)
                 if data_len is not None:
-                       data_len = long(data_len) + resume_len
+                       data_len = int(data_len) + resume_len
                 data_len_str = self.format_bytes(data_len)
                 byte_counter = 0 + resume_len
-               block_size = 1024
+               block_size = self.params.get('buffersize', 1024)
                 start = time.time()
                 while True:
                         # Download and write
@@ -651,15 +675,16 @@ class FileDownloader(object):
                                         assert stream is not None
                                         filename = self.undo_temp_name(tmpfilename)
                                         self.report_destination(filename)
-                               except (OSError, IOError), err:
+                               except (OSError, IOError) as err:
                                         self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
                                         return False
                         try:
                                 stream.write(data_block)
-                       except (IOError, OSError), err:
+                       except (IOError, OSError) as err:
                                 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
                                 return False
-                       block_size = self.best_block_size(after - before, len(data_block))
+                       if not self.params.get('noresizebuffer', False):
+                               block_size = self.best_block_size(after - before, len(data_block))
  
                         # Progress message
                         speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
@@ -679,7 +704,7 @@ class FileDownloader(object):
                 stream.close()
                 self.report_finish()
                 if data_len is not None and byte_counter != data_len:
-                       raise ContentTooShortError(byte_counter, long(data_len))
+                       raise ContentTooShortError(byte_counter, int(data_len))
                 self.try_rename(tmpfilename, filename)
  
                 # Update file modification time