Use u instead of str in Python 2

[youtube-dl] / youtube_dl / utils.py
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py

index a64937b4c12077fef92c95d0ddd36e30de0e8917..bde446bcbcbeb68bc7042c58e64e32893704c273 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -26,25 +26,28 @@ std_headers = {
         'Accept-Language': 'en-us,en;q=0.5',
  }
  
+try:
+       u = unicode # Python 2
+except NameError:
+       u = str
+
  def preferredencoding():
         """Get preferred encoding.
  
         Returns the best encoding scheme for the system, based on
         locale.getpreferredencoding() and some further tweaks.
         """
-       def yield_preferredencoding():
-               try:
-                       pref = locale.getpreferredencoding()
-                       u'TEST'.encode(pref)
-               except:
-                       pref = 'UTF-8'
-               while True:
-                       yield pref
-       return yield_preferredencoding().next()
+       try:
+               pref = locale.getpreferredencoding()
+               u'TEST'.encode(pref)
+       except:
+               pref = 'UTF-8'
+
+       return pref
  
  
  def htmlentity_transform(matchobj):
-       """Transforms an HTML entity to a Unicode character.
+       """Transforms an HTML entity to a character.
  
         This function receives a match object and is intended to be used with
         the re.sub() function.
@@ -55,7 +58,6 @@ def htmlentity_transform(matchobj):
         if entity in htmlentitydefs.name2codepoint:
                 return unichr(htmlentitydefs.name2codepoint[entity])
  
-       # Unicode character
         mobj = re.match(ur'(?u)#(x?\d+)', entity)
         if mobj is not None:
                 numstr = mobj.group(1)
@@ -64,7 +66,7 @@ def htmlentity_transform(matchobj):
                         numstr = u'0%s' % numstr
                 else:
                         base = 10
-               return unichr(long(numstr, base))
+               return unichr(int(numstr, base))
  
         # Unknown entity in name, return its literal representation
         return (u'&%s;' % entity)
@@ -83,7 +85,6 @@ class IDParser(HTMLParser.HTMLParser):
                 HTMLParser.HTMLParser.__init__(self)
  
         def error(self, message):
-               print >> sys.stderr, self.getpos()
                 if self.error_count > 10 or self.started:
                         raise HTMLParser.HTMLParseError(message, self.getpos())
                 self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
@@ -124,8 +125,10 @@ class IDParser(HTMLParser.HTMLParser):
         handle_decl = handle_pi = unknown_decl = find_startpos
  
         def get_result(self):
-               if self.result == None: return None
-               if len(self.result) != 3: return None
+               if self.result is None:
+                       return None
+               if len(self.result) != 3:
+                       return None
                 lines = self.html.split('\n')
                 lines = lines[self.result[1][0]-1:self.result[2][0]]
                 lines[0] = lines[0][self.result[1][1]:]
@@ -190,24 +193,36 @@ def timeconvert(timestr):
         if timetuple is not None:
                 timestamp = email.utils.mktime_tz(timetuple)
         return timestamp
-       
-def sanitize_filename(s):
-       """Sanitizes a string so it could be used as part of a filename."""
+
+def sanitize_filename(s, restricted=False):
+       """Sanitizes a string so it could be used as part of a filename.
+       If restricted is set, use a stricter subset of allowed characters.
+       """
         def replace_insane(char):
                 if char == '?' or ord(char) < 32 or ord(char) == 127:
                         return ''
                 elif char == '"':
-                       return '\''
+                       return '' if restricted else '\''
                 elif char == ':':
-                       return ' -'
+                       return '_-' if restricted else ' -'
                 elif char in '\\/|*<>':
-                       return '-'
+                       return '_'
+               if restricted and (char in '!&\'' or char.isspace()):
+                       return '_'
+               if restricted and ord(char) > 127:
+                       return '_'
                 return char
  
         result = u''.join(map(replace_insane, s))
-       while '--' in result:
-               result = result.replace('--', '-')
-       return result.strip('-')
+       while '__' in result:
+               result = result.replace('__', '_')
+       result = result.strip('_')
+       # Common case of "Foreign band name - English song title"
+       if restricted and result.startswith('-_'):
+               result = result[2:]
+       if not result:
+               result = '_'
+       return result
  
  def orderedSet(iterable):
         """ Remove all duplicates from the input iterable """
@@ -219,7 +234,7 @@ def orderedSet(iterable):
  
  def unescapeHTML(s):
         """
-       @param s a string (of type unicode)
+       @param s a string
         """
         assert type(s) == type(u'')
  
@@ -228,7 +243,7 @@ def unescapeHTML(s):
  
  def encodeFilename(s):
         """
-       @param s The name of the file (of type unicode)
+       @param s The name of the file
         """
  
         assert type(s) == type(u'')
@@ -300,7 +315,7 @@ class ContentTooShortError(Exception):
  
  class Trouble(Exception):
         """Trouble helper exception
-       
+
         This is an exception to be handled with
         FileDownloader.trouble
         """