X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Futils.py;h=3e4219b1797b87a451191336c3fc995ef36d354c;hb=d800609c62703e4e6edd2891a8432306462e4db3;hp=9bc292a39a5a30f5bee6d16c9ff8557eccf7dff0;hpb=a6420bf50cc60aa9c8e37bfb2acb10eca361a4f0;p=youtube-dl diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 9bc292a39..3e4219b17 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals +import base64 import calendar import codecs import contextlib @@ -35,6 +36,7 @@ import zlib from .compat import ( compat_basestring, compat_chr, + compat_etree_fromstring, compat_html_entities, compat_http_client, compat_kwargs, @@ -54,7 +56,7 @@ from .compat import ( compiled_regex_type = type(re.compile('')) std_headers = { - 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)', + 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/44.0 (Chrome)', 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Encoding': 'gzip, deflate', @@ -68,6 +70,21 @@ ENGLISH_MONTH_NAMES = [ 'January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December'] +KNOWN_EXTENSIONS = ( + 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac', + 'flv', 'f4v', 'f4a', 'f4b', + 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus', + 'mkv', 'mka', 'mk3d', + 'avi', 'divx', + 'mov', + 'asf', 'wmv', 'wma', + '3gp', '3g2', + 'mp3', + 'flac', + 'ape', + 'wav', + 'f4f', 'f4m', 'm3u8', 'smil') + def preferredencoding(): """Get preferred encoding. @@ -177,10 +194,19 @@ def xpath_with_ns(path, ns_map): def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT): - if sys.version_info < (2, 7): # Crazy 2.6 - xpath = xpath.encode('ascii') + def _find_xpath(xpath): + if sys.version_info < (2, 7): # Crazy 2.6 + xpath = xpath.encode('ascii') + return node.find(xpath) + + if isinstance(xpath, (str, compat_str)): + n = _find_xpath(xpath) + else: + for xp in xpath: + n = _find_xpath(xp) + if n is not None: + break - n = node.find(xpath) if n is None: if default is not NO_DEFAULT: return default @@ -355,13 +381,20 @@ def sanitize_path(s): if drive_or_unc: norm_path.pop(0) sanitized_path = [ - path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|\.$)', '#', path_part) + path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part) for path_part in norm_path] if drive_or_unc: sanitized_path.insert(0, drive_or_unc + os.path.sep) return os.path.join(*sanitized_path) +# Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of +# unwanted failures due to missing protocol +def sanitized_Request(url, *args, **kwargs): + return compat_urllib_request.Request( + 'http:%s' % url if url.startswith('//') else url, *args, **kwargs) + + def orderedSet(iterable): """ Remove all duplicates from the input iterable """ res = [] @@ -385,10 +418,14 @@ def _htmlentity_transform(entity): numstr = '0%s' % numstr else: base = 10 - return compat_chr(int(numstr, base)) + # See https://github.com/rg3/youtube-dl/issues/7518 + try: + return compat_chr(int(numstr, base)) + except ValueError: + pass # Unknown entity in name, return its literal representation - return ('&%s;' % entity) + return '&%s;' % entity def unescapeHTML(s): @@ -619,7 +656,7 @@ def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs): # expected HTTP responses to meet HTTP/1.0 or later (see also # https://github.com/rg3/youtube-dl/issues/6727) if sys.version_info < (3, 0): - kwargs['strict'] = True + kwargs[b'strict'] = True hc = http_class(*args, **kwargs) source_address = ydl_handler._params.get('source_address') if source_address is not None: @@ -641,6 +678,16 @@ def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs): return hc +def handle_youtubedl_headers(headers): + filtered_headers = headers + + if 'Youtubedl-no-compression' in filtered_headers: + filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding') + del filtered_headers['Youtubedl-no-compression'] + + return filtered_headers + + class YoutubeDLHandler(compat_urllib_request.HTTPHandler): """Handler for HTTP requests and responses. @@ -648,7 +695,7 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): the standard headers to every HTTP request and handles gzipped and deflated responses from web servers. If compression is to be avoided in a particular request, the original request in the program code only has - to include the HTTP header "Youtubedl-No-Compression", which will be + to include the HTTP header "Youtubedl-no-compression", which will be removed before making the real request. Part of this code was copied from: @@ -709,10 +756,8 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): # The dict keys are capitalized because of this bug by urllib if h.capitalize() not in req.headers: req.add_header(h, v) - if 'Youtubedl-no-compression' in req.headers: - if 'Accept-encoding' in req.headers: - del req.headers['Accept-encoding'] - del req.headers['Youtubedl-no-compression'] + + req.headers = handle_youtubedl_headers(req.headers) if sys.version_info < (2, 7) and '#' in req.get_full_url(): # Python 2.6 is brain-dead when it comes to fragments @@ -743,12 +788,15 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): raise original_ioerror resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code) resp.msg = old_resp.msg + del resp.headers['Content-encoding'] # deflate if resp.headers.get('Content-encoding', '') == 'deflate': gz = io.BytesIO(self.deflate(resp.read())) resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code) resp.msg = old_resp.msg - # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 + del resp.headers['Content-encoding'] + # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see + # https://github.com/rg3/youtube-dl/issues/6457). if 300 <= resp.code < 400: location = resp.headers.get('Location') if location: @@ -792,16 +840,14 @@ class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor): # https://github.com/rg3/youtube-dl/issues/6769). # In order to at least prevent crashing we will percent encode Set-Cookie # header before HTTPCookieProcessor starts processing it. - if sys.version_info < (3, 0) and response.headers: - for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'): - set_cookie = response.headers.get(set_cookie_header) - if set_cookie: - set_cookie_escaped = '; '.join([ - escape_rfc3986(cookie_attr.strip()) - for cookie_attr in set_cookie.decode('iso-8859-1').split(';')]).encode('iso-8859-1') - if set_cookie != set_cookie_escaped: - del response.headers[set_cookie_header] - response.headers[set_cookie_header] = set_cookie_escaped + # if sys.version_info < (3, 0) and response.headers: + # for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'): + # set_cookie = response.headers.get(set_cookie_header) + # if set_cookie: + # set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ") + # if set_cookie != set_cookie_escaped: + # del response.headers[set_cookie_header] + # response.headers[set_cookie_header] = set_cookie_escaped return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response) https_request = compat_urllib_request.HTTPCookieProcessor.http_request @@ -814,9 +860,11 @@ def parse_iso8601(date_str, delimiter='T', timezone=None): if date_str is None: return None + date_str = re.sub(r'\.[0-9]+', '', date_str) + if timezone is None: m = re.search( - r'(\.[0-9]+)?(?:Z$| ?(?P\+|-)(?P[0-9]{2}):?(?P[0-9]{2})$)', + r'(?:Z$| ?(?P\+|-)(?P[0-9]{2}):?(?P[0-9]{2})$)', date_str) if not m: timezone = datetime.timedelta() @@ -829,9 +877,12 @@ def parse_iso8601(date_str, delimiter='T', timezone=None): timezone = datetime.timedelta( hours=sign * int(m.group('hours')), minutes=sign * int(m.group('minutes'))) - date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter) - dt = datetime.datetime.strptime(date_str, date_format) - timezone - return calendar.timegm(dt.timetuple()) + try: + date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter) + dt = datetime.datetime.strptime(date_str, date_format) - timezone + return calendar.timegm(dt.timetuple()) + except ValueError: + pass def unified_strdate(date_str, day_first=True): @@ -896,7 +947,8 @@ def unified_strdate(date_str, day_first=True): timetuple = email.utils.parsedate_tz(date_str) if timetuple: upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d') - return upload_date + if upload_date is not None: + return compat_str(upload_date) def determine_ext(url, default_ext='unknown_video'): @@ -905,6 +957,9 @@ def determine_ext(url, default_ext='unknown_video'): guess = url.partition('?')[0].rpartition('.')[2] if re.match(r'^[A-Za-z0-9]+$', guess): return guess + # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download + elif guess.rstrip('/') in KNOWN_EXTENSIONS: + return guess.rstrip('/') else: return default_ext @@ -929,7 +984,7 @@ def date_from_str(date_str): if sign == '-': time = -time unit = match.group('unit') - # A bad aproximation? + # A bad approximation? if unit == 'month': unit = 'day' time *= 30 @@ -1252,7 +1307,7 @@ def parse_filesize(s): if s is None: return None - # The lower-case forms are of course incorrect and inofficial, + # The lower-case forms are of course incorrect and unofficial, # but we support those too _UNIT_TABLE = { 'B': 1, @@ -1356,6 +1411,15 @@ def remove_end(s, end): return s +def remove_quotes(s): + if s is None or len(s) < 2: + return s + for quote in ('"', "'", ): + if s[0] == quote and s[-1] == quote: + return s[1:-1] + return s + + def url_basename(url): path = compat_urlparse.urlparse(url).path return path.strip('/').split('/')[-1] @@ -1372,7 +1436,12 @@ def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1): v = getattr(v, get_attr, None) if v == '': v = None - return default if v is None else (int(v) * invscale // scale) + if v is None: + return default + try: + return int(v) * invscale // scale + except ValueError: + return default def str_or_none(v, default=None): @@ -1388,7 +1457,12 @@ def str_to_int(int_str): def float_or_none(v, scale=1, invscale=1, default=None): - return default if v is None else (float(v) * invscale / scale) + if v is None: + return default + try: + return float(v) * invscale / scale + except ValueError: + return default def parse_duration(s): @@ -1637,27 +1711,24 @@ def urlencode_postdata(*args, **kargs): return compat_urllib_parse.urlencode(*args, **kargs).encode('ascii') -try: - etree_iter = xml.etree.ElementTree.Element.iter -except AttributeError: # Python <=2.6 - etree_iter = lambda n: n.findall('.//*') +def encode_dict(d, encoding='utf-8'): + def encode(v): + return v.encode(encoding) if isinstance(v, compat_basestring) else v + return dict((encode(k), encode(v)) for k, v in d.items()) -def parse_xml(s): - class TreeBuilder(xml.etree.ElementTree.TreeBuilder): - def doctype(self, name, pubid, system): - pass # Ignore doctypes +def dict_get(d, key_or_keys, default=None, skip_false_values=True): + if isinstance(key_or_keys, (list, tuple)): + for key in key_or_keys: + if key not in d or d[key] is None or skip_false_values and not d[key]: + continue + return d[key] + return default + return d.get(key_or_keys, default) - parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder()) - kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {} - tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs) - # Fix up XML parser in Python 2.x - if sys.version_info < (3, 0): - for n in etree_iter(tree): - if n.text is not None: - if not isinstance(n.text, compat_str): - n.text = n.text.decode('utf-8') - return tree + +def encode_compat_str(string, encoding=preferredencoding(), errors='strict'): + return string if isinstance(string, compat_str) else compat_str(string, encoding, errors) US_RATINGS = { @@ -1673,12 +1744,12 @@ def parse_age_limit(s): if s is None: return None m = re.match(r'^(?P\d{1,2})\+?$', s) - return int(m.group('age')) if m else US_RATINGS.get(s, None) + return int(m.group('age')) if m else US_RATINGS.get(s) def strip_jsonp(code): return re.sub( - r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code) + r'(?s)^[a-zA-Z0-9_.]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code) def js_to_json(code): @@ -1687,8 +1758,8 @@ def js_to_json(code): if v in ('true', 'false', 'null'): return v if v.startswith('"'): - return v - if v.startswith("'"): + v = re.sub(r"\\'", "'", v[1:-1]) + elif v.startswith("'"): v = v[1:-1] v = re.sub(r"\\\\|\\'|\"", lambda m: { '\\\\': '\\\\', @@ -1754,13 +1825,24 @@ def args_to_str(args): return ' '.join(shlex_quote(a) for a in args) +def error_to_compat_str(err): + err_str = str(err) + # On python 2 error byte string must be decoded with proper + # encoding rather than ascii + if sys.version_info[0] < 3: + err_str = err_str.decode(preferredencoding()) + return err_str + + def mimetype2ext(mt): _, _, res = mt.rpartition('/') return { - 'x-ms-wmv': 'wmv', - 'x-mp4-fragmented': 'mp4', + '3gpp': '3gp', 'ttml+xml': 'ttml', + 'x-flv': 'flv', + 'x-mp4-fragmented': 'mp4', + 'x-ms-wmv': 'wmv', }.get(res, res) @@ -1782,6 +1864,10 @@ def urlhandle_detect_ext(url_handle): return mimetype2ext(getheader('Content-Type')) +def encode_data_uri(data, mime_type): + return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii')) + + def age_restricted(content_limit, age_limit): """ Returns True iff the content should be blocked """ @@ -1920,15 +2006,15 @@ def match_filter_func(filter_str): def parse_dfxp_time_expr(time_expr): if not time_expr: - return 0.0 + return mobj = re.match(r'^(?P\d+(?:\.\d+)?)s?$', time_expr) if mobj: return float(mobj.group('time_offset')) - mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:\.\d+)?)$', time_expr) + mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr) if mobj: - return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3)) + return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.')) def srt_subtitles_timecode(seconds): @@ -1941,22 +2027,29 @@ def dfxp2srt(dfxp_data): 'ttaf1': 'http://www.w3.org/2006/10/ttaf1', }) - def parse_node(node): - str_or_empty = functools.partial(str_or_none, default='') + class TTMLPElementParser(object): + out = '' - out = str_or_empty(node.text) + def start(self, tag, attrib): + if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'): + self.out += '\n' - for child in node: - if child.tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'): - out += '\n' + str_or_empty(child.tail) - elif child.tag in (_x('ttml:span'), _x('ttaf1:span'), 'span'): - out += str_or_empty(parse_node(child)) - else: - out += str_or_empty(xml.etree.ElementTree.tostring(child)) + def end(self, tag): + pass + + def data(self, data): + self.out += data - return out + def close(self): + return self.out.strip() - dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8')) + def parse_node(node): + target = TTMLPElementParser() + parser = xml.etree.ElementTree.XMLParser(target=target) + parser.feed(xml.etree.ElementTree.tostring(node)) + return parser.close() + + dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8')) out = [] paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p') @@ -1964,10 +2057,15 @@ def dfxp2srt(dfxp_data): raise ValueError('Invalid dfxp/TTML subtitle') for para, index in zip(paras, itertools.count(1)): - begin_time = parse_dfxp_time_expr(para.attrib['begin']) + begin_time = parse_dfxp_time_expr(para.attrib.get('begin')) end_time = parse_dfxp_time_expr(para.attrib.get('end')) + dur = parse_dfxp_time_expr(para.attrib.get('dur')) + if begin_time is None: + continue if not end_time: - end_time = begin_time + parse_dfxp_time_expr(para.attrib['dur']) + if not dur: + continue + end_time = begin_time + dur out.append('%d\n%s --> %s\n%s\n\n' % ( index, srt_subtitles_timecode(begin_time),