X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;ds=sidebyside;f=youtube_dl%2Futils.py;h=463804e183117b23efd8f0e4b0ad9b132e519ddb;hb=3bb6165927c277c3af73d5ef1ffb6ce9ea663d10;hp=4dcf18991487d7a180a5d4b903bad4b38b3165db;hpb=796173d08b514182eedc704541eb55d5c9e1dc0d;p=youtube-dl diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 4dcf18991..463804e18 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -3,6 +3,7 @@ import gzip import io +import json import locale import os import re @@ -51,6 +52,12 @@ try: except ImportError: # Python 2 import httplib as compat_http_client +try: + from subprocess import DEVNULL + compat_subprocess_get_DEVNULL = lambda: DEVNULL +except ImportError: + compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w') + try: from urllib.parse import parse_qs as compat_parse_qs except ImportError: # Python 2 @@ -169,6 +176,18 @@ else: assert type(s) == type(u'') print(s) +# In Python 2.x, json.dump expects a bytestream. +# In Python 3.x, it writes to a character stream +if sys.version_info < (3,0): + def write_json_file(obj, fn): + with open(fn, 'wb') as f: + json.dump(obj, f) +else: + def write_json_file(obj, fn): + with open(fn, 'w', encoding='utf-8') as f: + json.dump(obj, f) + + def htmlentity_transform(matchobj): """Transforms an HTML entity to a character. @@ -195,10 +214,11 @@ def htmlentity_transform(matchobj): return (u'&%s;' % entity) compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix -class IDParser(compat_html_parser.HTMLParser): - """Modified HTMLParser that isolates a tag with the specified id""" - def __init__(self, id): - self.id = id +class AttrParser(compat_html_parser.HTMLParser): + """Modified HTMLParser that isolates a tag with the specified attribute""" + def __init__(self, attribute, value): + self.attribute = attribute + self.value = value self.result = None self.started = False self.depth = {} @@ -223,7 +243,7 @@ class IDParser(compat_html_parser.HTMLParser): attrs = dict(attrs) if self.started: self.find_startpos(None) - if 'id' in attrs and attrs['id'] == self.id: + if self.attribute in attrs and attrs[self.attribute] == self.value: self.result = [tag] self.started = True self.watch_startpos = True @@ -261,8 +281,12 @@ class IDParser(compat_html_parser.HTMLParser): return '\n'.join(lines).strip() def get_element_by_id(id, html): - """Return the content of the tag with the specified id in the passed HTML document""" - parser = IDParser(id) + """Return the content of the tag with the specified ID in the passed HTML document""" + return get_element_by_attribute("id", id, html) + +def get_element_by_attribute(attribute, value, html): + """Return the content of the tag with the specified attribute in the passed HTML document""" + parser = AttrParser(attribute, value) try: parser.loads(html) except compat_html_parser.HTMLParseError: @@ -274,7 +298,8 @@ def clean_html(html): """Clean an HTML snippet into a readable string""" # Newline vs
html = html.replace('\n', ' ') - html = re.sub('\s*<\s*br\s*/?\s*>\s*', '\n', html) + html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html) + html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html) # Strip html tags html = re.sub('<.*?>', '', html) # Replace html entities @@ -441,14 +466,6 @@ class ContentTooShortError(Exception): self.downloaded = downloaded self.expected = expected - -class Trouble(Exception): - """Trouble helper exception - - This is an exception to be handled with - FileDownloader.trouble - """ - class YoutubeDLHandler(compat_urllib_request.HTTPHandler): """Handler for HTTP requests and responses. @@ -506,3 +523,6 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code) resp.msg = old_resp.msg return resp + + https_request = http_request + https_response = http_response