X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Futils.py;h=8e53962c95172688b5b64152b6b33a383d29a029;hb=0d778b1db909c8d096be4e199384fff96a722fc9;hp=ec186918cd8672ada2da2d5521e0ba8b22eb273d;hpb=83548824c29ccdf53a4659260aa3898939833882;p=youtube-dl diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index ec186918c..8e53962c9 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -47,9 +47,11 @@ from .compat import ( compat_str, compat_urllib_error, compat_urllib_parse, + compat_urllib_parse_urlencode, compat_urllib_parse_urlparse, compat_urllib_request, compat_urlparse, + compat_xpath, shlex_quote, ) @@ -165,12 +167,7 @@ if sys.version_info >= (2, 7): return node.find(expr) else: def find_xpath_attr(node, xpath, key, val=None): - # Here comes the crazy part: In 2.6, if the xpath is a unicode, - # .//node does not match if a node is a direct child of . ! - if isinstance(xpath, compat_str): - xpath = xpath.encode('ascii') - - for f in node.findall(xpath): + for f in node.findall(compat_xpath(xpath)): if key not in f.attrib: continue if val is None or f.attrib.get(key) == val: @@ -195,9 +192,7 @@ def xpath_with_ns(path, ns_map): def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT): def _find_xpath(xpath): - if sys.version_info < (2, 7): # Crazy 2.6 - xpath = xpath.encode('ascii') - return node.find(xpath) + return node.find(compat_xpath(xpath)) if isinstance(xpath, (str, compat_str)): n = _find_xpath(xpath) @@ -273,15 +268,17 @@ def get_element_by_attribute(attribute, value, html): return unescapeHTML(res) + class HTMLAttributeParser(compat_HTMLParser): """Trivial HTML parser to gather the attributes for a single element""" def __init__(self): - self.attrs = { } + self.attrs = {} compat_HTMLParser.__init__(self) def handle_starttag(self, tag, attrs): self.attrs = dict(attrs) + def extract_attributes(html_element): """Given a string for an HTML element such as [0-9]+(?:[,.][0-9]*)?)\s*(?P%s)' % units_re, s) + r'(?P[0-9]+(?:[,.][0-9]*)?)\s*(?P%s)\b' % units_re, s) if not m: return None num_str = m.group('num').replace(',', '.') @@ -1749,6 +1745,7 @@ def escape_url(url): """Escape URL as suggested by RFC 3986""" url_parsed = compat_urllib_parse_urlparse(url) return url_parsed._replace( + netloc=url_parsed.netloc.encode('idna').decode('ascii'), path=escape_rfc3986(url_parsed.path), params=escape_rfc3986(url_parsed.params), query=escape_rfc3986(url_parsed.query), @@ -1758,7 +1755,8 @@ def escape_url(url): try: struct.pack('!I', 0) except TypeError: - # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument + # In Python 2.6 and 2.7.x < 2.7.7, struct requires a bytes argument + # See https://bugs.python.org/issue19099 def struct_pack(spec, *args): if isinstance(spec, compat_str): spec = spec.encode('ascii') @@ -1790,22 +1788,29 @@ def read_batch_urls(batch_fd): def urlencode_postdata(*args, **kargs): - return compat_urllib_parse.urlencode(*args, **kargs).encode('ascii') + return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii') def update_url_query(url, query): parsed_url = compat_urlparse.urlparse(url) qs = compat_parse_qs(parsed_url.query) qs.update(query) - qs = encode_dict(qs) return compat_urlparse.urlunparse(parsed_url._replace( - query=compat_urllib_parse.urlencode(qs, True))) + query=compat_urllib_parse_urlencode(qs, True))) -def encode_dict(d, encoding='utf-8'): - def encode(v): - return v.encode(encoding) if isinstance(v, compat_basestring) else v - return dict((encode(k), encode(v)) for k, v in d.items()) +def update_Request(req, url=None, data=None, headers={}, query={}): + req_headers = req.headers.copy() + req_headers.update(headers) + req_data = data or req.data + req_url = update_url_query(url or req.get_full_url(), query) + req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request + new_req = req_type( + req_url, data=req_data, headers=req_headers, + origin_req_host=req.origin_req_host, unverifiable=req.unverifiable) + if hasattr(req, 'timeout'): + new_req.timeout = req.timeout + return new_req def dict_get(d, key_or_keys, default=None, skip_false_values=True): @@ -2126,6 +2131,7 @@ def dfxp2srt(dfxp_data): _x = functools.partial(xpath_with_ns, ns_map={ 'ttml': 'http://www.w3.org/ns/ttml', 'ttaf1': 'http://www.w3.org/2006/10/ttaf1', + 'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1', }) class TTMLPElementParser(object): @@ -2152,7 +2158,7 @@ def dfxp2srt(dfxp_data): dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8')) out = [] - paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p') + paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall(_x('.//ttaf1_0604:p')) or dfxp.findall('.//p') if not paras: raise ValueError('Invalid dfxp/TTML subtitle')