'Accept-Language': 'en-us,en;q=0.5',
}
# Alias for the text (unicode) string type, usable on both major Python
# versions: Python 2 has a separate ``unicode`` builtin, while on Python 3
# that name does not exist and ``str`` is already the text type.
try:
    u = unicode  # Python 2
except NameError:
    u = str  # Python 3
+
def preferredencoding():
    """Get preferred encoding.

    Returns the name reported by locale.getpreferredencoding() when that
    encoding can actually be used to encode text, and 'UTF-8' otherwise
    (for example when the locale reports an unknown or unusable codec).
    """
    try:
        pref = locale.getpreferredencoding()
        # Probe the codec: an unknown name or a codec that cannot encode
        # plain text raises here and triggers the fallback below.
        u'TEST'.encode(pref)
    except Exception:
        # Deliberately broad best-effort fallback, but narrower than a bare
        # ``except:`` so KeyboardInterrupt/SystemExit still propagate.
        pref = 'UTF-8'

    return pref
def htmlentity_transform(matchobj):
- """Transforms an HTML entity to a Unicode character.
+ """Transforms an HTML entity to a character.
This function receives a match object and is intended to be used with
the re.sub() function.
if entity in htmlentitydefs.name2codepoint:
return unichr(htmlentitydefs.name2codepoint[entity])
- # Unicode character
mobj = re.match(ur'(?u)#(x?\d+)', entity)
if mobj is not None:
numstr = mobj.group(1)
numstr = u'0%s' % numstr
else:
base = 10
- return unichr(long(numstr, base))
+ return unichr(int(numstr, base))
# Unknown entity in name, return its literal representation
return (u'&%s;' % entity)
HTMLParser.HTMLParser.__init__(self)
def error(self, message):
- print >> sys.stderr, self.getpos()
if self.error_count > 10 or self.started:
raise HTMLParser.HTMLParseError(message, self.getpos())
self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
handle_decl = handle_pi = unknown_decl = find_startpos
def get_result(self):
- if self.result == None: return None
- if len(self.result) != 3: return None
+ if self.result is None:
+ return None
+ if len(self.result) != 3:
+ return None
lines = self.html.split('\n')
lines = lines[self.result[1][0]-1:self.result[2][0]]
lines[0] = lines[0][self.result[1][1]:]
if timetuple is not None:
timestamp = email.utils.mktime_tz(timetuple)
return timestamp
-
def sanitize_filename(s, restricted=False):
    """Sanitizes a string so it could be used as part of a filename.

    @param s           the text to sanitize
    @param restricted  if set, use a stricter subset of allowed characters:
                       whitespace, '!', '&', the single quote and every
                       non-ASCII character are replaced by '_' and double
                       quotes are dropped
    @return            the sanitized text; never empty ('_' is returned when
                       nothing else is left)
    """
    def replace_insane(char):
        # '?', control characters and DEL are dropped entirely.
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'' or char.isspace()):
            return '_'
        if restricted and ord(char) > 127:
            return '_'
        return char

    result = u''.join(map(replace_insane, s))
    # Collapse runs of replacement underscores and trim them from both ends.
    while '__' in result:
        result = result.replace('__', '_')
    result = result.strip('_')
    # Common case of "Foreign band name - English song title": in restricted
    # mode the foreign part collapses to nothing, leaving a dangling "-_".
    if restricted and result.startswith('-_'):
        result = result[2:]
    # Never hand back an empty name.
    if not result:
        result = '_'
    return result
def orderedSet(iterable):
""" Remove all duplicates from the input iterable """
def unescapeHTML(s):
"""
- @param s a string (of type unicode)
+ @param s a string
"""
assert type(s) == type(u'')
def encodeFilename(s):
"""
- @param s The name of the file (of type unicode)
+ @param s The name of the file
"""
assert type(s) == type(u'')
class Trouble(Exception):
"""Trouble helper exception
-
+
This is an exception to be handled with
FileDownloader.trouble
"""