X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Futils.py;h=017f06c42e9a019e18e25480c5e5d8d3aaaef335;hb=e3700fc9e44c7820e1c38264c84a315c5f91bb2d;hp=44f939053adf5fcf6a2ab1ad4cb9c0eac5cedd7a;hpb=3c6ffbaedbbae8734f6b86fea1169413b656abf3;p=youtube-dl diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 44f939053..017f06c42 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -3,10 +3,12 @@ import gzip import io +import json import locale import os import re import sys +import traceback import zlib import email.utils import json @@ -51,6 +53,12 @@ try: except ImportError: # Python 2 import httplib as compat_http_client +try: + from subprocess import DEVNULL + compat_subprocess_get_DEVNULL = lambda: DEVNULL +except ImportError: + compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w') + try: from urllib.parse import parse_qs as compat_parse_qs except ImportError: # Python 2 @@ -147,6 +155,7 @@ std_headers = { 'Accept-Encoding': 'gzip, deflate', 'Accept-Language': 'en-us,en;q=0.5', } + def preferredencoding(): """Get preferred encoding. @@ -169,6 +178,17 @@ else: assert type(s) == type(u'') print(s) +# In Python 2.x, json.dump expects a bytestream. +# In Python 3.x, it writes to a character stream +if sys.version_info < (3,0): + def write_json_file(obj, fn): + with open(fn, 'wb') as f: + json.dump(obj, f) +else: + def write_json_file(obj, fn): + with open(fn, 'w', encoding='utf-8') as f: + json.dump(obj, f) + def htmlentity_transform(matchobj): """Transforms an HTML entity to a character. @@ -195,10 +215,11 @@ def htmlentity_transform(matchobj): return (u'&%s;' % entity) compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix -class IDParser(compat_html_parser.HTMLParser): - """Modified HTMLParser that isolates a tag with the specified id""" - def __init__(self, id): - self.id = id +class AttrParser(compat_html_parser.HTMLParser): + """Modified HTMLParser that isolates a tag with the specified attribute""" + def __init__(self, attribute, value): + self.attribute = attribute + self.value = value self.result = None self.started = False self.depth = {} @@ -223,7 +244,7 @@ class IDParser(compat_html_parser.HTMLParser): attrs = dict(attrs) if self.started: self.find_startpos(None) - if 'id' in attrs and attrs['id'] == self.id: + if self.attribute in attrs and attrs[self.attribute] == self.value: self.result = [tag] self.started = True self.watch_startpos = True @@ -259,10 +280,20 @@ class IDParser(compat_html_parser.HTMLParser): lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]] lines[-1] = lines[-1][:self.result[2][1]] return '\n'.join(lines).strip() +# Hack for https://github.com/rg3/youtube-dl/issues/662 +if sys.version_info < (2, 7, 3): + AttrParser.parse_endtag = (lambda self, i: + i + len("") + if self.rawdata[i:].startswith("") + else compat_html_parser.HTMLParser.parse_endtag(self, i)) def get_element_by_id(id, html): - """Return the content of the tag with the specified id in the passed HTML document""" - parser = IDParser(id) + """Return the content of the tag with the specified ID in the passed HTML document""" + return get_element_by_attribute("id", id, html) + +def get_element_by_attribute(attribute, value, html): + """Return the content of the tag with the specified attribute in the passed HTML document""" + parser = AttrParser(attribute, value) try: parser.loads(html) except compat_html_parser.HTMLParseError: @@ -274,12 +305,13 @@ def clean_html(html): """Clean an HTML snippet into a readable string""" # Newline vs
html = html.replace('\n', ' ') - html = re.sub('\s*<\s*br\s*/?\s*>\s*', '\n', html) + html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html) + html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html) # Strip html tags html = re.sub('<.*?>', '', html) # Replace html entities html = unescapeHTML(html) - return html + return html.strip() def sanitize_open(filename, open_mode): @@ -297,7 +329,7 @@ def sanitize_open(filename, open_mode): if sys.platform == 'win32': import msvcrt msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY) - return (sys.stdout, filename) + return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename) stream = open(encodeFilename(filename), open_mode) return (stream, filename) except (IOError, OSError) as err: @@ -383,7 +415,33 @@ def encodeFilename(s): # match Windows 9x series as well. Besides, NT 4 is obsolete.) return s else: - return s.encode(sys.getfilesystemencoding(), 'ignore') + encoding = sys.getfilesystemencoding() + if encoding is None: + encoding = 'utf-8' + return s.encode(encoding, 'ignore') + +def decodeOption(optval): + if optval is None: + return optval + if isinstance(optval, bytes): + optval = optval.decode(preferredencoding()) + + assert isinstance(optval, compat_str) + return optval + +class ExtractorError(Exception): + """Error during info extraction.""" + def __init__(self, msg, tb=None): + """ tb, if given, is the original traceback (so that it can be printed out). """ + super(ExtractorError, self).__init__(msg) + self.traceback = tb + self.exc_info = sys.exc_info() # preserve original exception + + def format_traceback(self): + if self.traceback is None: + return None + return u''.join(traceback.format_tb(self.traceback)) + class DownloadError(Exception): """Download Error exception. @@ -392,7 +450,10 @@ class DownloadError(Exception): configured to continue on errors. They will contain the appropriate error message. """ - pass + def __init__(self, msg, exc_info=None): + """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """ + super(DownloadError, self).__init__(msg) + self.exc_info = exc_info class SameFileError(Exception): @@ -410,7 +471,8 @@ class PostProcessingError(Exception): This exception may be raised by PostProcessor's .run() method to indicate an error in the postprocessing task. """ - pass + def __init__(self, msg): + self.msg = msg class MaxDownloadsReached(Exception): """ --max-downloads limit has been reached. """ @@ -441,14 +503,6 @@ class ContentTooShortError(Exception): self.downloaded = downloaded self.expected = expected - -class Trouble(Exception): - """Trouble helper exception - - This is an exception to be handled with - FileDownloader.trouble - """ - class YoutubeDLHandler(compat_urllib_request.HTTPHandler): """Handler for HTTP requests and responses. @@ -483,14 +537,19 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): return ret def http_request(self, req): - for h in std_headers: + for h,v in std_headers.items(): if h in req.headers: del req.headers[h] - req.add_header(h, std_headers[h]) + req.add_header(h, v) if 'Youtubedl-no-compression' in req.headers: if 'Accept-encoding' in req.headers: del req.headers['Accept-encoding'] del req.headers['Youtubedl-no-compression'] + if 'Youtubedl-user-agent' in req.headers: + if 'User-agent' in req.headers: + del req.headers['User-agent'] + req.headers['User-agent'] = req.headers['Youtubedl-user-agent'] + del req.headers['Youtubedl-user-agent'] return req def http_response(self, req, resp):