X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Futils.py;h=672ce05ea17b9d0ab9c009c383810ff54a97dd10;hb=611c1dd96efc36a788475e14cc4de64d554d28a0;hp=9a2dd14390bec0d3f7a694be7d4fcc5f1803bec7;hpb=8e60dc7526596f456c0b5d7dc48daa6cae08ebb6;p=youtube-dl diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 9a2dd1439..672ce05ea 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -56,7 +56,7 @@ from .compat import ( compiled_regex_type = type(re.compile('')) std_headers = { - 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)', + 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/44.0 (Chrome)', 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Encoding': 'gzip, deflate', @@ -70,6 +70,21 @@ ENGLISH_MONTH_NAMES = [ 'January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December'] +KNOWN_EXTENSIONS = ( + 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac', + 'flv', 'f4v', 'f4a', 'f4b', + 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus', + 'mkv', 'mka', 'mk3d', + 'avi', 'divx', + 'mov', + 'asf', 'wmv', 'wma', + '3gp', '3g2', + 'mp3', + 'flac', + 'ape', + 'wav', + 'f4f', 'f4m', 'm3u8', 'smil') + def preferredencoding(): """Get preferred encoding. @@ -233,7 +248,7 @@ def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT): def get_element_by_id(id, html): """Return the content of the tag with the specified ID in the passed HTML document""" - return get_element_by_attribute("id", id, html) + return get_element_by_attribute('id', id, html) def get_element_by_attribute(attribute, value, html): @@ -773,11 +788,13 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): raise original_ioerror resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code) resp.msg = old_resp.msg + del resp.headers['Content-encoding'] # deflate if resp.headers.get('Content-encoding', '') == 'deflate': gz = io.BytesIO(self.deflate(resp.read())) resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code) resp.msg = old_resp.msg + del resp.headers['Content-encoding'] # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see # https://github.com/rg3/youtube-dl/issues/6457). if 300 <= resp.code < 400: @@ -940,20 +957,8 @@ def determine_ext(url, default_ext='unknown_video'): guess = url.partition('?')[0].rpartition('.')[2] if re.match(r'^[A-Za-z0-9]+$', guess): return guess - elif guess.rstrip('/') in ( - 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac', - 'flv', 'f4v', 'f4a', 'f4b', - 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus', - 'mkv', 'mka', 'mk3d', - 'avi', 'divx', - 'mov', - 'asf', 'wmv', 'wma', - '3gp', '3g2', - 'mp3', - 'flac', - 'ape', - 'wav', - 'f4f', 'f4m', 'm3u8', 'smil'): + # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download + elif guess.rstrip('/') in KNOWN_EXTENSIONS: return guess.rstrip('/') else: return default_ext @@ -979,7 +984,7 @@ def date_from_str(date_str): if sign == '-': time = -time unit = match.group('unit') - # A bad aproximation? + # A bad approximation? if unit == 'month': unit = 'day' time *= 30 @@ -989,7 +994,7 @@ def date_from_str(date_str): unit += 's' delta = datetime.timedelta(**{unit: time}) return today + delta - return datetime.datetime.strptime(date_str, "%Y%m%d").date() + return datetime.datetime.strptime(date_str, '%Y%m%d').date() def hyphenate_date(date_str): @@ -1069,22 +1074,22 @@ def _windows_write_string(s, out): GetStdHandle = ctypes.WINFUNCTYPE( ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)( - (b"GetStdHandle", ctypes.windll.kernel32)) + (b'GetStdHandle', ctypes.windll.kernel32)) h = GetStdHandle(WIN_OUTPUT_IDS[fileno]) WriteConsoleW = ctypes.WINFUNCTYPE( ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR, ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD), - ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32)) + ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32)) written = ctypes.wintypes.DWORD(0) - GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32)) + GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32)) FILE_TYPE_CHAR = 0x0002 FILE_TYPE_REMOTE = 0x8000 GetConsoleMode = ctypes.WINFUNCTYPE( ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.POINTER(ctypes.wintypes.DWORD))( - (b"GetConsoleMode", ctypes.windll.kernel32)) + (b'GetConsoleMode', ctypes.windll.kernel32)) INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value def not_a_console(handle): @@ -1302,7 +1307,7 @@ def parse_filesize(s): if s is None: return None - # The lower-case forms are of course incorrect and inofficial, + # The lower-case forms are of course incorrect and unofficial, # but we support those too _UNIT_TABLE = { 'B': 1, @@ -1382,7 +1387,7 @@ def fix_xml_ampersands(xml_str): def setproctitle(title): assert isinstance(title, compat_str) try: - libc = ctypes.cdll.LoadLibrary("libc.so.6") + libc = ctypes.cdll.LoadLibrary('libc.so.6') except OSError: return title_bytes = title.encode('utf-8') @@ -1422,7 +1427,7 @@ def url_basename(url): class HEADRequest(compat_urllib_request.Request): def get_method(self): - return "HEAD" + return 'HEAD' def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1): @@ -1712,6 +1717,16 @@ def encode_dict(d, encoding='utf-8'): return dict((encode(k), encode(v)) for k, v in d.items()) +def dict_get(d, key_or_keys, default=None, skip_false_values=True): + if isinstance(key_or_keys, (list, tuple)): + for key in key_or_keys: + if key not in d or d[key] is None or skip_false_values and not d[key]: + continue + return d[key] + return default + return d.get(key_or_keys, default) + + def encode_compat_str(string, encoding=preferredencoding(), errors='strict'): return string if isinstance(string, compat_str) else compat_str(string, encoding, errors) @@ -1729,12 +1744,12 @@ def parse_age_limit(s): if s is None: return None m = re.match(r'^(?P\d{1,2})\+?$', s) - return int(m.group('age')) if m else US_RATINGS.get(s, None) + return int(m.group('age')) if m else US_RATINGS.get(s) def strip_jsonp(code): return re.sub( - r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code) + r'(?s)^[a-zA-Z0-9_.]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code) def js_to_json(code): @@ -1810,7 +1825,7 @@ def args_to_str(args): return ' '.join(shlex_quote(a) for a in args) -def error_to_str(err): +def error_to_compat_str(err): err_str = str(err) # On python 2 error byte string must be decoded with proper # encoding rather than ascii @@ -1823,9 +1838,11 @@ def mimetype2ext(mt): _, _, res = mt.rpartition('/') return { - 'x-ms-wmv': 'wmv', - 'x-mp4-fragmented': 'mp4', + '3gpp': '3gp', 'ttml+xml': 'ttml', + 'x-flv': 'flv', + 'x-mp4-fragmented': 'mp4', + 'x-ms-wmv': 'wmv', }.get(res, res) @@ -2010,20 +2027,27 @@ def dfxp2srt(dfxp_data): 'ttaf1': 'http://www.w3.org/2006/10/ttaf1', }) - def parse_node(node): - str_or_empty = functools.partial(str_or_none, default='') + class TTMLPElementParser(object): + out = '' - out = str_or_empty(node.text) + def start(self, tag, attrib): + if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'): + self.out += '\n' - for child in node: - if child.tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'): - out += '\n' + str_or_empty(child.tail) - elif child.tag in (_x('ttml:span'), _x('ttaf1:span'), 'span'): - out += str_or_empty(parse_node(child)) - else: - out += str_or_empty(xml.etree.ElementTree.tostring(child)) + def end(self, tag): + pass - return out + def data(self, data): + self.out += data + + def close(self): + return self.out.strip() + + def parse_node(node): + target = TTMLPElementParser() + parser = xml.etree.ElementTree.XMLParser(target=target) + parser.feed(xml.etree.ElementTree.tostring(node)) + return parser.close() dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8')) out = []