import gzip
import io
+import json
import locale
import os
import re
import sys
+import traceback
import zlib
import email.utils
import json
except ImportError: # Python 2
import httplib as compat_http_client
try:
    from subprocess import DEVNULL
except ImportError:
    # subprocess in this interpreter does not provide DEVNULL.
    def compat_subprocess_get_DEVNULL():
        """Return a writable file object opened on the null device.

        Every call opens a new handle on os.path.devnull; the caller owns
        the returned file and is responsible for closing it.
        """
        return open(os.path.devnull, 'w')
else:
    def compat_subprocess_get_DEVNULL():
        """Return subprocess.DEVNULL, the special "discard" stream value."""
        return DEVNULL
+
try:
from urllib.parse import parse_qs as compat_parse_qs
except ImportError: # Python 2
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'en-us,en;q=0.5',
}
+
def preferredencoding():
"""Get preferred encoding.
assert type(s) == type(u'')
print(s)
def write_json_file(obj, fn):
    """Serialize obj as JSON into the file named fn.

    In Python 2.x, json.dump expects a bytestream, so the file is opened
    in binary mode; in Python 3.x it writes to a character stream, so the
    file is opened as UTF-8 text.
    """
    if sys.version_info < (3, 0):
        stream = open(fn, 'wb')
    else:
        stream = open(fn, 'w', encoding='utf-8')
    # The with-statement closes the file even if serialization fails.
    with stream as out:
        json.dump(obj, out)
+
def htmlentity_transform(matchobj):
"""Transforms an HTML entity to a character.
return (u'&%s;' % entity)
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
-class IDParser(compat_html_parser.HTMLParser):
- """Modified HTMLParser that isolates a tag with the specified id"""
- def __init__(self, id):
- self.id = id
+class AttrParser(compat_html_parser.HTMLParser):
+ """Modified HTMLParser that isolates a tag with the specified attribute"""
+ def __init__(self, attribute, value):
+ self.attribute = attribute
+ self.value = value
self.result = None
self.started = False
self.depth = {}
attrs = dict(attrs)
if self.started:
self.find_startpos(None)
- if 'id' in attrs and attrs['id'] == self.id:
+ if self.attribute in attrs and attrs[self.attribute] == self.value:
self.result = [tag]
self.started = True
self.watch_startpos = True
return '\n'.join(lines).strip()
def get_element_by_id(id, html):
- """Return the content of the tag with the specified id in the passed HTML document"""
- parser = IDParser(id)
+ """Return the content of the tag with the specified ID in the passed HTML document"""
+ return get_element_by_attribute("id", id, html)
+
+def get_element_by_attribute(attribute, value, html):
+ """Return the content of the tag with the specified attribute in the passed HTML document"""
+ parser = AttrParser(attribute, value)
try:
parser.loads(html)
except compat_html_parser.HTMLParseError:
"""Clean an HTML snippet into a readable string"""
# Newline vs <br />
html = html.replace('\n', ' ')
- html = re.sub('\s*<\s*br\s*/?\s*>\s*', '\n', html)
+ html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
+ html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
# Strip html tags
html = re.sub('<.*?>', '', html)
# Replace html entities
timestamp = email.utils.mktime_tz(timetuple)
return timestamp
-def sanitize_filename(s, restricted=False):
+def sanitize_filename(s, restricted=False, is_id=False):
"""Sanitizes a string so it could be used as part of a filename.
If restricted is set, use a stricter subset of allowed characters.
+ Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
"""
def replace_insane(char):
if char == '?' or ord(char) < 32 or ord(char) == 127:
return char
result = u''.join(map(replace_insane, s))
- while '__' in result:
- result = result.replace('__', '_')
- result = result.strip('_')
- # Common case of "Foreign band name - English song title"
- if restricted and result.startswith('-_'):
- result = result[2:]
- if not result:
- result = '_'
+ if not is_id:
+ while '__' in result:
+ result = result.replace('__', '_')
+ result = result.strip('_')
+ # Common case of "Foreign band name - English song title"
+ if restricted and result.startswith('-_'):
+ result = result[2:]
+ if not result:
+ result = '_'
return result
def orderedSet(iterable):
else:
return s.encode(sys.getfilesystemencoding(), 'ignore')
+
class ExtractorError(Exception):
    """Raised when something goes wrong during info extraction."""

    def __init__(self, msg, tb=None):
        """Build the error from a message and an optional traceback.

        tb, when supplied, is the traceback of the original failure; it is
        kept on the instance so that it can be printed out later.
        """
        self.traceback = tb
        super(ExtractorError, self).__init__(msg)

    def format_traceback(self):
        """Return the stored traceback as a single string, or None if absent."""
        tb = self.traceback
        if tb is not None:
            return u''.join(traceback.format_tb(tb))
        return None
+
+
class DownloadError(Exception):
"""Download Error exception.
self.downloaded = downloaded
self.expected = expected
-
-class Trouble(Exception):
- """Trouble helper exception
-
- This is an exception to be handled with
- FileDownloader.trouble
- """
-
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
"""Handler for HTTP requests and responses.
resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
resp.msg = old_resp.msg
return resp
+
+ https_request = http_request
+ https_response = http_response