X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Futils.py;h=b6e1dc8099f8d3e8f513b38303a282000e4932df;hb=f07e276a04292c3fa87f703931bad9b716e7ccdf;hp=9fd0ec8d5856cbee27534c0f4a02cc90b05f8389;hpb=49dea4913bea3b8e5c7d65dd932aa68ada526088;p=youtube-dl diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 9fd0ec8d5..b6e1dc809 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -35,6 +35,7 @@ import xml.etree.ElementTree import zlib from .compat import ( + compat_HTMLParser, compat_basestring, compat_chr, compat_etree_fromstring, @@ -49,6 +50,7 @@ from .compat import ( compat_urllib_parse_urlparse, compat_urllib_request, compat_urlparse, + compat_xpath, shlex_quote, ) @@ -164,12 +166,7 @@ if sys.version_info >= (2, 7): return node.find(expr) else: def find_xpath_attr(node, xpath, key, val=None): - # Here comes the crazy part: In 2.6, if the xpath is a unicode, - # .//node does not match if a node is a direct child of . ! - if isinstance(xpath, compat_str): - xpath = xpath.encode('ascii') - - for f in node.findall(xpath): + for f in node.findall(compat_xpath(xpath)): if key not in f.attrib: continue if val is None or f.attrib.get(key) == val: @@ -194,9 +191,7 @@ def xpath_with_ns(path, ns_map): def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT): def _find_xpath(xpath): - if sys.version_info < (2, 7): # Crazy 2.6 - xpath = xpath.encode('ascii') - return node.find(xpath) + return node.find(compat_xpath(xpath)) if isinstance(xpath, (str, compat_str)): n = _find_xpath(xpath) @@ -273,6 +268,38 @@ def get_element_by_attribute(attribute, value, html): return unescapeHTML(res) +class HTMLAttributeParser(compat_HTMLParser): + """Trivial HTML parser to gather the attributes for a single element""" + def __init__(self): + self.attrs = {} + compat_HTMLParser.__init__(self) + + def handle_starttag(self, tag, attrs): + self.attrs = dict(attrs) + + +def extract_attributes(html_element): + """Given a string for an HTML element such as + + Decode and return a dictionary of attributes. + { + 'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz', + 'empty': '', 'noval': None, 'entity': '&', + 'sq': '"', 'dq': '\'' + }. + NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions, + but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5. + """ + parser = HTMLAttributeParser() + parser.feed(html_element) + parser.close() + return parser.attrs + + def clean_html(html): """Clean an HTML snippet into a readable string""" @@ -1319,7 +1346,7 @@ def format_bytes(bytes): def lookup_unit_table(unit_table, s): units_re = '|'.join(re.escape(u) for u in unit_table) m = re.match( - r'(?P[0-9]+(?:[,.][0-9]*)?)\s*(?P%s)' % units_re, s) + r'(?P[0-9]+(?:[,.][0-9]*)?)\s*(?P%s)\b' % units_re, s) if not m: return None num_str = m.group('num').replace(',', '.') @@ -1719,6 +1746,7 @@ def escape_url(url): """Escape URL as suggested by RFC 3986""" url_parsed = compat_urllib_parse_urlparse(url) return url_parsed._replace( + netloc=url_parsed.netloc.encode('idna').decode('ascii'), path=escape_rfc3986(url_parsed.path), params=escape_rfc3986(url_parsed.params), query=escape_rfc3986(url_parsed.query), @@ -1728,7 +1756,8 @@ def escape_url(url): try: struct.pack('!I', 0) except TypeError: - # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument + # In Python 2.6 and 2.7.x < 2.7.7, struct requires a bytes argument + # See https://bugs.python.org/issue19099 def struct_pack(spec, *args): if isinstance(spec, compat_str): spec = spec.encode('ascii')