import email.utils
import gzip
import io
import json
import locale
import os
import re
import sys
import traceback
import zlib
except ImportError: # Python 2
import urllib as compat_urllib_parse
+try:
+ from urllib.parse import urlparse as compat_urllib_parse_urlparse
+except ImportError: # Python 2
+ from urlparse import urlparse as compat_urllib_parse_urlparse
+
try:
import http.cookiejar as compat_cookiejar
except ImportError: # Python 2
except ImportError: # Python 2
import httplib as compat_http_client
# subprocess.DEVNULL is not importable on every supported interpreter; expose a
# single accessor so callers do not need to care which case applies.
try:
    from subprocess import DEVNULL
except ImportError:
    def compat_subprocess_get_DEVNULL():
        """Return a writable handle on the platform's null device.

        Fallback used when subprocess.DEVNULL cannot be imported. Every call
        opens os.path.devnull anew; the caller owns the returned file object.
        """
        return open(os.path.devnull, 'w')
else:
    def compat_subprocess_get_DEVNULL():
        """Return subprocess.DEVNULL."""
        return DEVNULL
+
try:
from urllib.parse import parse_qs as compat_parse_qs
except ImportError: # Python 2
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'en-us,en;q=0.5',
}
+
def preferredencoding():
"""Get preferred encoding.
assert type(s) == type(u'')
print(s)
# In Python 2.x, json.dump expects a bytestream.
# In Python 3.x, it writes to a character stream
if sys.version_info < (3, 0):
    def write_json_file(obj, fn):
        """Serialize obj as JSON into the file named fn (binary mode)."""
        with open(fn, 'wb') as f:
            json.dump(obj, f)
else:
    def write_json_file(obj, fn):
        """Serialize obj as JSON into the file named fn (UTF-8 text mode)."""
        with open(fn, 'w', encoding='utf-8') as f:
            json.dump(obj, f)
+
def htmlentity_transform(matchobj):
"""Transforms an HTML entity to a character.
return (u'&%s;' % entity)
# Replace the `locatestarttagend` pattern on the compat_html_parser module with
# the compiled regex below (the trailing note marks it as a backported bugfix).
# The pattern is compiled with re.VERBOSE.
# NOTE(review): compat_html_parser is defined outside this view; presumably it
# is the alias for the standard HTML parser module - confirm.
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
-class IDParser(compat_html_parser.HTMLParser):
- """Modified HTMLParser that isolates a tag with the specified id"""
- def __init__(self, id):
- self.id = id
+class AttrParser(compat_html_parser.HTMLParser):
+ """Modified HTMLParser that isolates a tag with the specified attribute"""
+ def __init__(self, attribute, value):
+ self.attribute = attribute
+ self.value = value
self.result = None
self.started = False
self.depth = {}
attrs = dict(attrs)
if self.started:
self.find_startpos(None)
- if 'id' in attrs and attrs['id'] == self.id:
+ if self.attribute in attrs and attrs[self.attribute] == self.value:
self.result = [tag]
self.started = True
self.watch_startpos = True
lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
lines[-1] = lines[-1][:self.result[2][1]]
return '\n'.join(lines).strip()
# Hack for https://github.com/rg3/youtube-dl/issues/662
# Only applied on interpreters older than 2.7.3.
if sys.version_info < (2, 7, 3):
    def _parse_endtag_compat(self, i):
        """Replacement for AttrParser.parse_endtag.

        When the raw data at offset i starts with the literal "</scr'+'ipt>",
        return the offset just past that literal; otherwise defer to the stock
        HTMLParser.parse_endtag.
        """
        split_script_end = "</scr'+'ipt>"
        if self.rawdata[i:].startswith(split_script_end):
            return i + len(split_script_end)
        return compat_html_parser.HTMLParser.parse_endtag(self, i)

    AttrParser.parse_endtag = _parse_endtag_compat
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document

    Thin wrapper: delegates to get_element_by_attribute with the attribute
    name fixed to "id".
    """
    return get_element_by_attribute("id", id, html)
+
+def get_element_by_attribute(attribute, value, html):
+ """Return the content of the tag with the specified attribute in the passed HTML document"""
+ parser = AttrParser(attribute, value)
try:
parser.loads(html)
except compat_html_parser.HTMLParseError:
"""Clean an HTML snippet into a readable string"""
# Newline vs <br />
html = html.replace('\n', ' ')
- html = re.sub('\s*<\s*br\s*/?\s*>\s*', '\n', html)
+ html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
+ html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
# Strip html tags
html = re.sub('<.*?>', '', html)
# Replace html entities
html = unescapeHTML(html)
- return html
+ return html.strip()
def sanitize_open(filename, open_mode):
if sys.platform == 'win32':
import msvcrt
msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
- return (sys.stdout, filename)
+ return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
stream = open(encodeFilename(filename), open_mode)
return (stream, filename)
except (IOError, OSError) as err:
timestamp = email.utils.mktime_tz(timetuple)
return timestamp
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.

    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be
    kept if possible: characters are still replaced one by one, but the
    clean-up pass (collapsing runs of underscores, stripping leading and
    trailing underscores, dropping a leading '-_', substituting '_' for an
    empty result) is skipped.
    """
    def replace_insane(char):
        # Map a single character to its replacement string.
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
            return '_'
        # The remaining rules only apply in restricted mode.
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and ord(char) > 127:
            return '_'
        return char

    result = u''.join(map(replace_insane, s))
    if not is_id:
        # Repeat until stable so that runs of any length collapse to one '_'.
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if not result:
            result = '_'
    return result
def orderedSet(iterable):
# match Windows 9x series as well. Besides, NT 4 is obsolete.)
return s
else:
- return s.encode(sys.getfilesystemencoding(), 'ignore')
+ encoding = sys.getfilesystemencoding()
+ if encoding is None:
+ encoding = 'utf-8'
+ return s.encode(encoding, 'ignore')
+
def decodeOption(optval):
    """Return an option value as a text string.

    None is passed through unchanged. A bytes value is decoded using the
    encoding returned by preferredencoding(). The result is asserted to be a
    compat_str instance before it is returned.
    """
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval
+
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None):
        """ tb, if given, is the original traceback (so that it can be printed out). """
        super(ExtractorError, self).__init__(msg)
        self.traceback = tb
        # Snapshot of whatever exception is being handled at construction time.
        self.exc_info = sys.exc_info()  # preserve original exception

    def format_traceback(self):
        """Return the stored traceback formatted as one string, or None if there is none."""
        if self.traceback is None:
            return None
        return u''.join(traceback.format_tb(self.traceback))
+
class DownloadError(Exception):
"""Download Error exception.
configured to continue on errors. They will contain the appropriate
error message.
"""
- pass
def __init__(self, msg, exc_info=None):
    """Initialize the error with its message.

    exc_info, if given, is the original exception that caused the trouble
    (as returned by sys.exc_info()). It is stored on the instance as
    `exc_info`.
    """
    super(DownloadError, self).__init__(msg)
    self.exc_info = exc_info
class SameFileError(Exception):
This exception may be raised by PostProcessor's .run() method to
indicate an error in the postprocessing task.
"""
- pass
def __init__(self, msg):
    """Store the error message on the instance as `msg`."""
    # NOTE(review): the enclosing class header is outside this view; the base
    # initializer is not called explicitly here.
    self.msg = msg
class MaxDownloadsReached(Exception):
""" --max-downloads limit has been reached. """
self.downloaded = downloaded
self.expected = expected
-
-class Trouble(Exception):
- """Trouble helper exception
-
- This is an exception to be handled with
- FileDownloader.trouble
- """
-
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
"""Handler for HTTP requests and responses.
return ret
def http_request(self, req):
    """Adjust the headers of an outgoing request and return the request.

    Every entry of std_headers is forced onto the request, replacing any
    header already stored under the same name. Two internal marker headers
    are then consumed and removed:

    - 'Youtubedl-no-compression': drops 'Accept-encoding' from the request.
    - 'Youtubedl-user-agent': its value replaces the 'User-agent' header.
    """
    for h, v in std_headers.items():
        if h in req.headers:
            del req.headers[h]
        req.add_header(h, v)
    if 'Youtubedl-no-compression' in req.headers:
        if 'Accept-encoding' in req.headers:
            del req.headers['Accept-encoding']
        del req.headers['Youtubedl-no-compression']
    if 'Youtubedl-user-agent' in req.headers:
        if 'User-agent' in req.headers:
            del req.headers['User-agent']
        req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
        del req.headers['Youtubedl-user-agent']
    return req
def http_response(self, req, resp):
resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
resp.msg = old_resp.msg
return resp
+
+ https_request = http_request
+ https_response = http_response