X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Futils.py;h=1279a9042f343eee1dbd8eaf713e3eed06d60d3e;hb=fcca0d53a8fa47614a39a433a3da7d1ab1d88ed9;hp=d2dfa80139e25babab7fef073dc4cfe670ce7c50;hpb=63e0fd5bccf5cfb85e00e5935f4b6961ff26c58c;p=youtube-dl diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index d2dfa8013..1279a9042 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- +# coding: utf-8 from __future__ import unicode_literals @@ -86,6 +86,11 @@ std_headers = { } +USER_AGENTS = { + 'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27', +} + + NO_DEFAULT = object() ENGLISH_MONTH_NAMES = [ @@ -123,7 +128,13 @@ DATE_FORMATS = ( '%d %B %Y', '%d %b %Y', '%B %d %Y', + '%B %dst %Y', + '%B %dnd %Y', + '%B %dth %Y', '%b %d %Y', + '%b %dst %Y', + '%b %dnd %Y', + '%b %dth %Y', '%b %dst %Y %I:%M', '%b %dnd %Y %I:%M', '%b %dth %Y %I:%M', @@ -132,6 +143,7 @@ DATE_FORMATS = ( '%Y/%m/%d', '%Y/%m/%d %H:%M', '%Y/%m/%d %H:%M:%S', + '%Y-%m-%d %H:%M', '%Y-%m-%d %H:%M:%S', '%Y-%m-%d %H:%M:%S.%f', '%d.%m.%Y %H:%M', @@ -165,6 +177,8 @@ DATE_FORMATS_MONTH_FIRST.extend([ '%m/%d/%Y %H:%M:%S', ]) +PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)" + def preferredencoding(): """Get preferred encoding. @@ -323,17 +337,30 @@ def get_element_by_id(id, html): def get_element_by_class(class_name, html): - return get_element_by_attribute( + """Return the content of the first tag with the specified class in the passed HTML document""" + retval = get_elements_by_class(class_name, html) + return retval[0] if retval else None + + +def get_element_by_attribute(attribute, value, html, escape_value=True): + retval = get_elements_by_attribute(attribute, value, html, escape_value) + return retval[0] if retval else None + + +def get_elements_by_class(class_name, html): + """Return the content of all tags with the specified class in the passed HTML document as a list""" + return get_elements_by_attribute( 'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name), html, escape_value=False) -def get_element_by_attribute(attribute, value, html, escape_value=True): +def get_elements_by_attribute(attribute, value, html, escape_value=True): """Return the content of the tag with the specified attribute in the passed HTML document""" value = re.escape(value) if escape_value else value - m = re.search(r'''(?xs) + retlist = [] + for m in re.finditer(r'''(?xs) <([a-zA-Z0-9:._-]+) (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*? \s+%s=['"]?%s['"]? @@ -341,16 +368,15 @@ def get_element_by_attribute(attribute, value, html, escape_value=True): \s*> (?P.*?) - ''' % (re.escape(attribute), value), html) + ''' % (re.escape(attribute), value), html): + res = m.group('content') - if not m: - return None - res = m.group('content') + if res.startswith('"') or res.startswith("'"): + res = res[1:-1] - if res.startswith('"') or res.startswith("'"): - res = res[1:-1] + retlist.append(unescapeHTML(res)) - return unescapeHTML(res) + return retlist class HTMLAttributeParser(compat_HTMLParser): @@ -494,7 +520,7 @@ def sanitize_path(s): if drive_or_unc: norm_path.pop(0) sanitized_path = [ - path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part) + path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part) for path_part in norm_path] if drive_or_unc: sanitized_path.insert(0, drive_or_unc + os.path.sep) @@ -782,6 +808,7 @@ class XAttrMetadataError(Exception): def __init__(self, code=None, msg='Unknown error'): super(XAttrMetadataError, self).__init__(msg) self.code = code + self.msg = msg # Parsing code and msg if (self.code in (errno.ENOSPC, errno.EDQUOT) or @@ -1175,7 +1202,7 @@ def date_from_str(date_str): return today if date_str == 'yesterday': return today - datetime.timedelta(days=1) - match = re.match('(now|today)(?P[+-])(?P