X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Futils.py;h=768c6207df5e32728c95ccc4a1cfa5e05d9088c3;hb=a921f40799d2ecb4be53b3241d2dbfc80f804d73;hp=64ab3091070013f840e44bf6c29a808fa75c47d1;hpb=204da0d3e3cc5f8675f10f44d0717e210405f8ca;p=youtube-dl diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 64ab30910..768c6207d 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -213,7 +213,7 @@ if sys.version_info >= (2,7): def find_xpath_attr(node, xpath, key, val): """ Find the xpath xpath[@key=val] """ assert re.match(r'^[a-zA-Z]+$', key) - assert re.match(r'^[a-zA-Z@\s]*$', val) + assert re.match(r'^[a-zA-Z0-9@\s]*$', val) expr = xpath + u"[@%s='%s']" % (key, val) return node.find(expr) else: @@ -249,7 +249,17 @@ def htmlentity_transform(matchobj): return (u'&%s;' % entity) compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix -class AttrParser(compat_html_parser.HTMLParser): +class BaseHTMLParser(compat_html_parser.HTMLParser): + def __init(self): + compat_html_parser.HTMLParser.__init__(self) + self.html = None + + def loads(self, html): + self.html = html + self.feed(html) + self.close() + +class AttrParser(BaseHTMLParser): """Modified HTMLParser that isolates a tag with the specified attribute""" def __init__(self, attribute, value): self.attribute = attribute @@ -257,10 +267,9 @@ class AttrParser(compat_html_parser.HTMLParser): self.result = None self.started = False self.depth = {} - self.html = None self.watch_startpos = False self.error_count = 0 - compat_html_parser.HTMLParser.__init__(self) + BaseHTMLParser.__init__(self) def error(self, message): if self.error_count > 10 or self.started: @@ -269,11 +278,6 @@ class AttrParser(compat_html_parser.HTMLParser): self.error_count += 1 self.goahead(1) - def loads(self, html): - self.html = html - self.feed(html) - self.close() - def handle_starttag(self, tag, attrs): attrs = dict(attrs) if self.started: @@ -334,6 +338,38 @@ def get_element_by_attribute(attribute, value, html): pass return parser.get_result() +class MetaParser(BaseHTMLParser): + """ + Modified HTMLParser that isolates a meta tag with the specified name + attribute. + """ + def __init__(self, name): + BaseHTMLParser.__init__(self) + self.name = name + self.content = None + self.result = None + + def handle_starttag(self, tag, attrs): + if tag != 'meta': + return + attrs = dict(attrs) + if attrs.get('name') == self.name: + self.result = attrs.get('content') + + def get_result(self): + return self.result + +def get_meta_content(name, html): + """ + Return the content attribute from the meta tag with the given name attribute. + """ + parser = MetaParser(name) + try: + parser.loads(html) + except compat_html_parser.HTMLParseError: + pass + return parser.get_result() + def clean_html(html): """Clean an HTML snippet into a readable string""" @@ -743,3 +779,21 @@ def platform_name(): assert isinstance(res, compat_str) return res + + +def bytes_to_intlist(bs): + if not bs: + return [] + if isinstance(bs[0], int): # Python 3 + return list(bs) + else: + return [ord(c) for c in bs] + + +def intlist_to_bytes(xs): + if not xs: + return b'' + if isinstance(chr(0), bytes): # Python 2 + return ''.join([chr(x) for x in xs]) + else: + return bytes(xs)