X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Futils.py;h=d7b737e21639679fe665d22e6ca4089f5c791618;hb=5c2266df4b9aeb7881ed8c026a038e2a25e43734;hp=79381b3803717081b15c79b26cf9bf4173f18ef6;hpb=027eb5a6b041a91ca7fdd61826daaea24bec1cfb;p=youtube-dl diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 79381b380..d7b737e21 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals +import base64 import calendar import codecs import contextlib @@ -35,6 +36,7 @@ import zlib from .compat import ( compat_basestring, compat_chr, + compat_etree_fromstring, compat_html_entities, compat_http_client, compat_kwargs, @@ -177,10 +179,19 @@ def xpath_with_ns(path, ns_map): def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT): - if sys.version_info < (2, 7): # Crazy 2.6 - xpath = xpath.encode('ascii') + def _find_xpath(xpath): + if sys.version_info < (2, 7): # Crazy 2.6 + xpath = xpath.encode('ascii') + return node.find(xpath) + + if isinstance(xpath, (str, compat_str)): + n = _find_xpath(xpath) + else: + for xp in xpath: + n = _find_xpath(xp) + if n is not None: + break - n = node.find(xpath) if n is None: if default is not NO_DEFAULT: return default @@ -355,13 +366,20 @@ def sanitize_path(s): if drive_or_unc: norm_path.pop(0) sanitized_path = [ - path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|\.$)', '#', path_part) + path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part) for path_part in norm_path] if drive_or_unc: sanitized_path.insert(0, drive_or_unc + os.path.sep) return os.path.join(*sanitized_path) +# Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of +# unwanted failures due to missing protocol +def sanitized_Request(url, *args, **kwargs): + return compat_urllib_request.Request( + 'http:%s' % url if url.startswith('//') else url, *args, **kwargs) + + def orderedSet(iterable): """ Remove all duplicates from the input iterable """ res = [] @@ -385,10 +403,14 @@ def _htmlentity_transform(entity): numstr = '0%s' % numstr else: base = 10 - return compat_chr(int(numstr, base)) + # See https://github.com/rg3/youtube-dl/issues/7518 + try: + return compat_chr(int(numstr, base)) + except ValueError: + pass # Unknown entity in name, return its literal representation - return ('&%s;' % entity) + return '&%s;' % entity def unescapeHTML(s): @@ -619,7 +641,7 @@ def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs): # expected HTTP responses to meet HTTP/1.0 or later (see also # https://github.com/rg3/youtube-dl/issues/6727) if sys.version_info < (3, 0): - kwargs['strict'] = True + kwargs[b'strict'] = True hc = http_class(*args, **kwargs) source_address = ydl_handler._params.get('source_address') if source_address is not None: @@ -748,7 +770,8 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): gz = io.BytesIO(self.deflate(resp.read())) resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code) resp.msg = old_resp.msg - # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 + # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see + # https://github.com/rg3/youtube-dl/issues/6457). if 300 <= resp.code < 400: location = resp.headers.get('Location') if location: @@ -782,15 +805,41 @@ class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler): req, **kwargs) +class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor): + def __init__(self, cookiejar=None): + compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar) + + def http_response(self, request, response): + # Python 2 will choke on next HTTP request in row if there are non-ASCII + # characters in Set-Cookie HTTP header of last response (see + # https://github.com/rg3/youtube-dl/issues/6769). + # In order to at least prevent crashing we will percent encode Set-Cookie + # header before HTTPCookieProcessor starts processing it. + # if sys.version_info < (3, 0) and response.headers: + # for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'): + # set_cookie = response.headers.get(set_cookie_header) + # if set_cookie: + # set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ") + # if set_cookie != set_cookie_escaped: + # del response.headers[set_cookie_header] + # response.headers[set_cookie_header] = set_cookie_escaped + return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response) + + https_request = compat_urllib_request.HTTPCookieProcessor.http_request + https_response = http_response + + def parse_iso8601(date_str, delimiter='T', timezone=None): """ Return a UNIX timestamp from the given date """ if date_str is None: return None + date_str = re.sub(r'\.[0-9]+', '', date_str) + if timezone is None: m = re.search( - r'(\.[0-9]+)?(?:Z$| ?(?P\+|-)(?P[0-9]{2}):?(?P[0-9]{2})$)', + r'(?:Z$| ?(?P\+|-)(?P[0-9]{2}):?(?P[0-9]{2})$)', date_str) if not m: timezone = datetime.timedelta() @@ -803,9 +852,12 @@ def parse_iso8601(date_str, delimiter='T', timezone=None): timezone = datetime.timedelta( hours=sign * int(m.group('hours')), minutes=sign * int(m.group('minutes'))) - date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter) - dt = datetime.datetime.strptime(date_str, date_format) - timezone - return calendar.timegm(dt.timetuple()) + try: + date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter) + dt = datetime.datetime.strptime(date_str, date_format) - timezone + return calendar.timegm(dt.timetuple()) + except ValueError: + pass def unified_strdate(date_str, day_first=True): @@ -870,7 +922,8 @@ def unified_strdate(date_str, day_first=True): timetuple = email.utils.parsedate_tz(date_str) if timetuple: upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d') - return upload_date + if upload_date is not None: + return compat_str(upload_date) def determine_ext(url, default_ext='unknown_video'): @@ -879,6 +932,21 @@ def determine_ext(url, default_ext='unknown_video'): guess = url.partition('?')[0].rpartition('.')[2] if re.match(r'^[A-Za-z0-9]+$', guess): return guess + elif guess.rstrip('/') in ( + 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac', + 'flv', 'f4v', 'f4a', 'f4b', + 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus', + 'mkv', 'mka', 'mk3d', + 'avi', 'divx', + 'mov', + 'asf', 'wmv', 'wma', + '3gp', '3g2', + 'mp3', + 'flac', + 'ape', + 'wav', + 'f4f', 'f4m', 'm3u8', 'smil'): + return guess.rstrip('/') else: return default_ext @@ -1346,7 +1414,12 @@ def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1): v = getattr(v, get_attr, None) if v == '': v = None - return default if v is None else (int(v) * invscale // scale) + if v is None: + return default + try: + return int(v) * invscale // scale + except ValueError: + return default def str_or_none(v, default=None): @@ -1362,7 +1435,12 @@ def str_to_int(int_str): def float_or_none(v, scale=1, invscale=1, default=None): - return default if v is None else (float(v) * invscale / scale) + if v is None: + return default + try: + return float(v) * invscale / scale + except ValueError: + return default def parse_duration(s): @@ -1611,27 +1689,10 @@ def urlencode_postdata(*args, **kargs): return compat_urllib_parse.urlencode(*args, **kargs).encode('ascii') -try: - etree_iter = xml.etree.ElementTree.Element.iter -except AttributeError: # Python <=2.6 - etree_iter = lambda n: n.findall('.//*') - - -def parse_xml(s): - class TreeBuilder(xml.etree.ElementTree.TreeBuilder): - def doctype(self, name, pubid, system): - pass # Ignore doctypes - - parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder()) - kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {} - tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs) - # Fix up XML parser in Python 2.x - if sys.version_info < (3, 0): - for n in etree_iter(tree): - if n.text is not None: - if not isinstance(n.text, compat_str): - n.text = n.text.decode('utf-8') - return tree +def encode_dict(d, encoding='utf-8'): + def encode(v): + return v.encode(encoding) if isinstance(v, compat_basestring) else v + return dict((encode(k), encode(v)) for k, v in d.items()) US_RATINGS = { @@ -1661,8 +1722,8 @@ def js_to_json(code): if v in ('true', 'false', 'null'): return v if v.startswith('"'): - return v - if v.startswith("'"): + v = re.sub(r"\\'", "'", v[1:-1]) + elif v.startswith("'"): v = v[1:-1] v = re.sub(r"\\\\|\\'|\"", lambda m: { '\\\\': '\\\\', @@ -1756,6 +1817,10 @@ def urlhandle_detect_ext(url_handle): return mimetype2ext(getheader('Content-Type')) +def encode_data_uri(data, mime_type): + return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii')) + + def age_restricted(content_limit, age_limit): """ Returns True iff the content should be blocked """ @@ -1930,7 +1995,7 @@ def dfxp2srt(dfxp_data): return out - dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8')) + dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8')) out = [] paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p')