X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Futils.py;h=067b8a184c67ea152e3769bcdbcc63bf43afca77;hb=8b0d7a66ef5451556bb8ae5b085c7bef4c992f8b;hp=9fd0ec8d5856cbee27534c0f4a02cc90b05f8389;hpb=8fb754bcd023953a1e55e7acb8c6aea9edef937d;p=youtube-dl diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 9fd0ec8d5..067b8a184 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -35,6 +35,7 @@ import xml.etree.ElementTree import zlib from .compat import ( + compat_HTMLParser, compat_basestring, compat_chr, compat_etree_fromstring, @@ -49,6 +50,7 @@ from .compat import ( compat_urllib_parse_urlparse, compat_urllib_request, compat_urlparse, + compat_xpath, shlex_quote, ) @@ -164,12 +166,7 @@ if sys.version_info >= (2, 7): return node.find(expr) else: def find_xpath_attr(node, xpath, key, val=None): - # Here comes the crazy part: In 2.6, if the xpath is a unicode, - # .//node does not match if a node is a direct child of . ! - if isinstance(xpath, compat_str): - xpath = xpath.encode('ascii') - - for f in node.findall(xpath): + for f in node.findall(compat_xpath(xpath)): if key not in f.attrib: continue if val is None or f.attrib.get(key) == val: @@ -194,9 +191,7 @@ def xpath_with_ns(path, ns_map): def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT): def _find_xpath(xpath): - if sys.version_info < (2, 7): # Crazy 2.6 - xpath = xpath.encode('ascii') - return node.find(xpath) + return node.find(compat_xpath(xpath)) if isinstance(xpath, (str, compat_str)): n = _find_xpath(xpath) @@ -273,6 +268,38 @@ def get_element_by_attribute(attribute, value, html): return unescapeHTML(res) +class HTMLAttributeParser(compat_HTMLParser): + """Trivial HTML parser to gather the attributes for a single element""" + def __init__(self): + self.attrs = {} + compat_HTMLParser.__init__(self) + + def handle_starttag(self, tag, attrs): + self.attrs = dict(attrs) + + +def extract_attributes(html_element): + """Given a string for an HTML element such as + + Decode and return a dictionary of attributes. + { + 'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz', + 'empty': '', 'noval': None, 'entity': '&', + 'sq': '"', 'dq': '\'' + }. + NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions, + but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5. + """ + parser = HTMLAttributeParser() + parser.feed(html_element) + parser.close() + return parser.attrs + + def clean_html(html): """Clean an HTML snippet into a readable string""" @@ -1319,7 +1346,7 @@ def format_bytes(bytes): def lookup_unit_table(unit_table, s): units_re = '|'.join(re.escape(u) for u in unit_table) m = re.match( - r'(?P[0-9]+(?:[,.][0-9]*)?)\s*(?P%s)' % units_re, s) + r'(?P[0-9]+(?:[,.][0-9]*)?)\s*(?P%s)\b' % units_re, s) if not m: return None num_str = m.group('num').replace(',', '.')