X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Futils.py;h=ec186918cd8672ada2da2d5521e0ba8b22eb273d;hb=83548824c29ccdf53a4659260aa3898939833882;hp=dc0bf5627a5d149da4790b1f2d938649ed7924f9;hpb=19a17d462388b5d10196352e5f9d1491d608783a;p=youtube-dl diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index dc0bf5627..ec186918c 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -35,6 +35,7 @@ import xml.etree.ElementTree import zlib from .compat import ( + compat_HTMLParser, compat_basestring, compat_chr, compat_etree_fromstring, @@ -272,6 +273,35 @@ def get_element_by_attribute(attribute, value, html): return unescapeHTML(res) +class HTMLAttributeParser(compat_HTMLParser): + """Trivial HTML parser to gather the attributes for a single element""" + def __init__(self): + self.attrs = { } + compat_HTMLParser.__init__(self) + + def handle_starttag(self, tag, attrs): + self.attrs = dict(attrs) + +def extract_attributes(html_element): + """Given a string for an HTML element such as + + Decode and return a dictionary of attributes. + { + 'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz', + 'empty': '', 'noval': None, 'entity': '&', + 'sq': '"', 'dq': '\'' + }. + NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions, + but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5. + """ + parser = HTMLAttributeParser() + parser.feed(html_element) + parser.close() + return parser.attrs def clean_html(html): """Clean an HTML snippet into a readable string""" @@ -1316,6 +1346,17 @@ def format_bytes(bytes): return '%.2f%s' % (converted, suffix) +def lookup_unit_table(unit_table, s): + units_re = '|'.join(re.escape(u) for u in unit_table) + m = re.match( + r'(?P[0-9]+(?:[,.][0-9]*)?)\s*(?P%s)' % units_re, s) + if not m: + return None + num_str = m.group('num').replace(',', '.') + mult = unit_table[m.group('unit')] + return int(float(num_str) * mult) + + def parse_filesize(s): if s is None: return None @@ -1359,15 +1400,28 @@ def parse_filesize(s): 'Yb': 1000 ** 8, } - units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE) - m = re.match( - r'(?P[0-9]+(?:[,.][0-9]*)?)\s*(?P%s)' % units_re, s) - if not m: + return lookup_unit_table(_UNIT_TABLE, s) + + +def parse_count(s): + if s is None: return None - num_str = m.group('num').replace(',', '.') - mult = _UNIT_TABLE[m.group('unit')] - return int(float(num_str) * mult) + s = s.strip() + + if re.match(r'^[\d,.]+$', s): + return str_to_int(s) + + _UNIT_TABLE = { + 'k': 1000, + 'K': 1000, + 'm': 1000 ** 2, + 'M': 1000 ** 2, + 'kk': 1000 ** 2, + 'KK': 1000 ** 2, + } + + return lookup_unit_table(_UNIT_TABLE, s) def month_by_name(name): @@ -1893,19 +1947,6 @@ def mimetype2ext(mt): }.get(res, res) -def codec2ext(codec): - codec_type = codec.split('.')[0] - - # Leave the return value None for unknown values as codec_type - # is not a good fallback for file extensions - return { - 'avc1': 'mp4', - 'mp4a': 'm4a', - 'vorbis': 'webm', - 'vp9': 'webm', - }.get(codec_type) - - def urlhandle_detect_ext(url_handle): try: url_handle.headers