'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'en-us,en;q=0.5',
}
+
def preferredencoding():
"""Get preferred encoding.
with open(fn, 'w', encoding='utf-8') as f:
json.dump(obj, f)
-
def htmlentity_transform(matchobj):
"""Transforms an HTML entity to a character.
"""Clean an HTML snippet into a readable string"""
# Newline vs <br />
html = html.replace('\n', ' ')
- html = re.sub('\s*<\s*br\s*/?\s*>\s*', '\n', html)
+ html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
+ html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
# Strip html tags
html = re.sub('<.*?>', '', html)
# Replace html entities
else:
return s.encode(sys.getfilesystemencoding(), 'ignore')
+
+class ExtractorError(Exception):
+ """Error during info extraction.
+
+ The traceback associated with the error is kept on the ``traceback``
+ attribute so that it can be printed out later.
+ """
+ def __init__(self, msg, tb=None):
+ """Create the error with the message ``msg``.
+
+ tb is the original traceback (so that it can be printed out). When it
+ is None, the traceback of the exception currently being handled is
+ used instead.
+ """
+ super(ExtractorError, self).__init__(msg)
+ if tb is None:
+ # No traceback supplied: fall back to the one from sys.exc_info(),
+ # which is itself None when no exception is being handled.
+ tb = sys.exc_info()[2]
+ self.traceback = tb
+
+
class DownloadError(Exception):
"""Download Error exception.