from __future__ import unicode_literals

import calendar
import codecs
import contextlib
import ctypes
import datetime
import email.utils
import errno
import functools
import gzip
import io
import itertools
import json
import locale
import math
import os
import platform
import random
import re
import socket
import ssl
import subprocess
import sys
import tempfile
import traceback
import xml.etree.ElementTree
import zlib

from .compat import (
    compat_HTMLParseError,
    compat_HTMLParser,
    compat_basestring,
    compat_chr,
    compat_cookiejar,
    compat_ctypes_WINFUNCTYPE,
    compat_etree_fromstring,
    compat_expanduser,
    compat_html_entities,
    compat_html_entities_html5,
    compat_http_client,
    compat_kwargs,
    compat_parse_qs,
    compat_shlex_quote,
    compat_str,
    compat_struct_pack,
    compat_urllib_error,
    compat_urllib_parse,
    compat_urllib_parse_urlencode,
    compat_urllib_parse_urlparse,
    compat_urllib_parse_unquote_plus,
    compat_urllib_request,
    compat_urlparse,
    compat_xpath,
)
from .socks import (
    ProxyType,
    sockssocket,
)


def register_socks_protocols():
    # "Register" SOCKS protocols
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in compat_urlparse.uses_netloc:
            compat_urlparse.uses_netloc.append(scheme)


# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))


def random_user_agent():
    _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
    return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)


std_headers = {
    'User-Agent': random_user_agent(),
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


NO_DEFAULT = object()

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}

KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'asf', 'wmv', 'wma',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))

DATE_FORMATS = (
    '%Y/%m/%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
)

DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d/%m/%Y %H:%M:%S',
])

DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m/%d/%Y %H:%M:%S',
])

PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'


def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref


def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non-ASCII characters unless we
        # use a unicode object
        path_basename = lambda f: os.path.basename(fn).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    args = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
        args.update({
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except Exception:
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise


if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] """
        assert re.match(r'^[a-zA-Z_-]+$', key)
        expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
        return node.find(expr)
else:
    def find_xpath_attr(node, xpath, key, val=None):
        for f in node.findall(compat_xpath(xpath)):
            if key not in f.attrib:
                continue
            if val is None or f.attrib.get(key) == val:
                return f
        return None


# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)
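
# Illustrative usage (hedged example; the namespace URI is made up):
#     xpath_with_ns('ns:media/ns:url', {'ns': 'http://example.com/ns/'})
#         -> '{http://example.com/ns/}media/{http://example.com/ns/}url'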


def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    def _find_xpath(xpath):
        return node.find(compat_xpath(xpath))

    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n


def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text


def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = '%s[@%s]' % (xpath, key) if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]
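
# Illustrative usage (hedged example):
#     doc = compat_etree_fromstring('<root><item id="42">hello</item></root>')
#     xpath_text(doc, './/item')        -> 'hello'
#     xpath_attr(doc, './/item', 'id')  -> '42'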


def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html)


def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_by_attribute(attribute, value, html, escape_value=True):
    retval = get_elements_by_attribute(attribute, value, html, escape_value)
    return retval[0] if retval else None


def get_elements_by_class(class_name, html):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_by_attribute(
        'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
        html, escape_value=False)
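
# Illustrative usage (hedged example):
#     get_element_by_class('foo', '<span class="foo bar">nice</span>')  -> 'nice'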


def get_elements_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of the tag with the specified attribute in the passed HTML document"""

    value = re.escape(value) if escape_value else value

    retlist = []
    for m in re.finditer(r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), value), html):
        res = m.group('content')

        if res.startswith('"') or res.startswith("'"):
            res = res[1:-1]

        retlist.append(unescapeHTML(res))

    return retlist


class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""
    def __init__(self):
        self.attrs = {}
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)


def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    parser = HTMLAttributeParser()
    try:
        parser.feed(html_element)
        parser.close()
    # Older Python may throw HTMLParseError in case of malformed HTML
    except compat_HTMLParseError:
        pass
    return parser.attrs


def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Newline vs <br />
    html = html.replace('\n', ' ')
    html = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
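
# Illustrative usage (hedged example):
#     clean_html('<p>first line<br/>second &amp; last</p>')
#         -> 'first line\nsecond & last'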


def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)


def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp


def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept
    if possible.
    """
    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and ord(char) > 127:
            return '_'
        return char

    # Handle timestamps
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(map(replace_insane, s))
    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
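
# Illustrative usage (hedged examples):
#     sanitize_filename('New World record at 0:12:34')
#         -> 'New World record at 0_12_34'
#     sanitize_filename('ÄÖÜäöü', restricted=True)  -> 'AOUaou'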


def sanitize_path(s):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform != 'win32':
        return s
    drive_or_unc, _ = os.path.splitdrive(s)
    if sys.version_info < (2, 7) and not drive_or_unc:
        drive_or_unc, _ = os.path.splitunc(s)
    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    return os.path.join(*sanitized_path)


def sanitize_url(url):
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url.startswith('//'):
        return 'http:%s' % url
    # Fix some common typos seen so far
    COMMON_TYPOS = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in COMMON_TYPOS:
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url
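
# Illustrative usage (hedged examples):
#     sanitize_url('//example.com/watch')      -> 'http://example.com/watch'
#     sanitize_url('httpss://example.com/x')   -> 'https://example.com/x'
#     sanitize_url('rmtp://example.com/live')  -> 'rtmp://example.com/live'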


def sanitized_Request(url, *args, **kwargs):
    return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs)


def expand_path(s):
    """Expand shell variables and ~"""
    return os.path.expandvars(compat_expanduser(s))


def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    res = []
    for el in iterable:
        if el not in res:
            res.append(el)
    return res


def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in compat_html_entities_html5:
        return compat_html_entities_html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/ytdl-org/youtube-dl/issues/7518
        try:
            return compat_chr(int(numstr, base))
        except ValueError:
            pass

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity


def unescapeHTML(s):
    if s is None:
        return None
    assert type(s) == compat_str

    return re.sub(
        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
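
# Illustrative usage (hedged examples):
#     unescapeHTML('&eacute;')  -> 'é'
#     unescapeHTML('&#x2F;')    -> '/'
#     unescapeHTML('&a&quot;')  -> '&a"'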


def get_subprocess_encoding():
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding


def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Pass '' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        return s

    # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
    if sys.platform.startswith('java'):
        return s

    return s.encode(get_subprocess_encoding(), 'ignore')


def decodeFilename(b, for_subprocess=False):
    if sys.version_info >= (3, 0):
        return b
    if not isinstance(b, bytes):
        return b
    return b.decode(get_subprocess_encoding(), 'ignore')


def encodeArgument(s):
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, True)


def decodeArgument(b):
    return decodeFilename(b, True)


def decodeOption(optval):
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval


def formatSeconds(secs):
    if secs > 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs > 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
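
# Illustrative usage (hedged examples):
#     formatSeconds(3661)  -> '1:01:01'
#     formatSeconds(90)    -> '1:30'
#     formatSeconds(7)     -> '7'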


def make_HTTPS_handler(params, **kwargs):
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        return YoutubeDLHTTPSHandler(params, **kwargs)
    else:  # Python < 3.4
        context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)


def bug_reports_message():
    if ytdl_is_updateable():
        update_cmd = 'type youtube-dl -U to update'
    else:
        update_cmd = 'see https://yt-dl.org/update on how to update'
    msg = '; please report this issue on https://yt-dl.org/bug .'
    msg += ' Make sure you are using the latest version; %s.' % update_cmd
    msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
    return msg


class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    pass


class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """

        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True
        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        if self.traceback is None:
            return None
        return ''.join(traceback.format_tb(self.traceback))


class UnsupportedError(ExtractorError):
    def __init__(self, url):
        super(UnsupportedError, self).__init__(
            'Unsupported URL: %s' % url, expected=True)


class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass


class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """
    def __init__(self, msg, countries=None):
        super(GeoRestrictedError, self).__init__(msg, expected=True)
        self.msg = msg
        self.countries = countries


class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info


class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass


class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        super(PostProcessingError, self).__init__(msg)
        self.msg = msg


class MaxDownloadsReached(YoutubeDLError):
    """ --max-downloads limit has been reached. """
    pass


class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass


class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        super(ContentTooShortError, self).__init__(
            'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected)
        )
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected


class XAttrMetadataError(YoutubeDLError):
    def __init__(self, code=None, msg='Unknown error'):
        super(XAttrMetadataError, self).__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT)
                or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'


class XAttrUnavailableError(YoutubeDLError):
    pass


def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/ytdl-org/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs['strict'] = True
    hc = http_class(*args, **compat_kwargs(kwargs))
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise socket.error(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except socket.error as _:
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise socket.error('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6
            def _hc_connect(self, *args, **kwargs):
                sock = _create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            hc.connect = functools.partial(_hc_connect, hc)

    return hc


def handle_youtubedl_headers(headers):
    filtered_headers = headers

    if 'Youtubedl-no-compression' in filtered_headers:
        filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding')
        del filtered_headers['Youtubedl-no-compression']

    return filtered_headers
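
# Illustrative usage (hedged example):
#     handle_youtubedl_headers({'Youtubedl-no-compression': '1',
#                               'Accept-Encoding': 'gzip'})
#         -> {}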


class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = compat_http_client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    def http_request(self, req):
        # According to RFC 3986, URLs cannot contain non-ASCII characters; however, this is
        # not always respected by websites: some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # Per RFC 2616 the default charset is iso-8859-1, which Python 3 respects
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                else:
                    location = location.decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    if sys.version_info < (3, 0):
                        location_escaped = location_escaped.encode('utf-8')
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response


def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection


class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, True),
            req, **kwargs)


class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
    _HTTPONLY_PREFIX = '#HttpOnly_'

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        # Store session cookies with `expires` set to 0 instead of an empty
        # string
        for cookie in self:
            if cookie.expires is None:
                cookie.expires = 0
        compat_cookiejar.MozillaCookieJar.save(self, filename, ignore_discard, ignore_expires)

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file."""
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        cf = io.StringIO()
        with open(filename) as f:
            for line in f:
                if line.startswith(self._HTTPONLY_PREFIX):
                    line = line[len(self._HTTPONLY_PREFIX):]
                cf.write(compat_str(line))
        cf.seek(0)
        self._really_load(cf, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by either `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]). So we need to force the latter to be recognized as session
        # cookies on our own.
        # Session cookies may be important for cookies-based authentication,
        # e.g. usually, when user does not check 'Remember me' check box while
        # logging in on a site, some important cookies are stored as session
        # cookies so that not recognizing them will result in failed login.
        # 1. https://bugs.python.org/issue17164
        for cookie in self:
            # Treat `expires=0` cookies as session cookies
            if cookie.expires == 0:
                cookie.expires = None
                cookie.discard = True


class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/ytdl-org/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response


def extract_timezone(date_str):
    m = re.search(
        r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)
    if not m:
        timezone = datetime.timedelta()
    else:
        date_str = date_str[:-len(m.group('tz'))]
        if not m.group('sign'):
            timezone = datetime.timedelta()
        else:
            sign = 1 if m.group('sign') == '+' else -1
            timezone = datetime.timedelta(
                hours=sign * int(m.group('hours')),
                minutes=sign * int(m.group('minutes')))
    return timezone, date_str


def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """

    if date_str is None:
        return None

    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    try:
        date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
        dt = datetime.datetime.strptime(date_str, date_format) - timezone
        return calendar.timegm(dt.timetuple())
    except ValueError:
        pass
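
# Illustrative usage (hedged examples):
#     parse_iso8601('2014-12-15T14:00:00Z')      -> 1418652000
#     parse_iso8601('2014-12-15T14:00:00+0100')  -> 1418648400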


def date_formats(day_first=True):
    return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST


def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""

    if date_str is None:
        return None
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
    _, date_str = extract_timezone(date_str)

    for expression in date_formats(day_first):
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            pass
    if upload_date is None:
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            try:
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
            except ValueError:
                pass
    if upload_date is not None:
        return compat_str(upload_date)
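
# Illustrative usage (hedged examples):
#     unified_strdate('December 21, 2010')  -> '20101221'
#     unified_strdate('8/7/2009')           -> '20090708'  (day first by default)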


def unified_timestamp(date_str, day_first=True):
    if date_str is None:
        return None

    date_str = re.sub(r'[,|]', '', date_str)

    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if m:
        date_str = m.group(1)

    for expression in date_formats(day_first):
        try:
            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())
        except ValueError:
            pass
    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        return calendar.timegm(timetuple) + pm_delta * 3600


def determine_ext(url, default_ext='unknown_video'):
    if url is None or '.' not in url:
        return default_ext
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    # Try to extract ext from URLs like http://example.com/foo/bar.mp4/?download
    elif guess.rstrip('/') in KNOWN_EXTENSIONS:
        return guess.rstrip('/')
    else:
        return default_ext
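
# Illustrative usage (hedged examples):
#     determine_ext('http://example.com/foo/bar.mp4/?download')  -> 'mp4'
#     determine_ext('http://example.com/play')                   -> 'unknown_video'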


def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)


def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # A bad approximation?
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, '%Y%m%d').date()


def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is not None:
        return '-'.join(match.groups())
    else:
        return date_str


class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        if start is not None:
            self.start = date_from_str(start)
        else:
            self.start = datetime.datetime.min.date()
        if end is not None:
            self.end = date_from_str(end)
        else:
            self.end = datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s", the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
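
# Illustrative usage (hedged examples):
#     '20200115' in DateRange('20200101', '20200131')  -> True
#     '20200201' in DateRange('20200101', '20200131')  -> False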


def platform_name():
    """ Returns the platform name as a compat_str """
    res = platform.platform()
    if isinstance(res, bytes):
        res = res.decode(preferredencoding())

    assert isinstance(res, compat_str)
    return res


def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        ('GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)(('WriteConsoleW', ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = compat_ctypes_WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(('GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        ('GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
                or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        count = min(next_nonbmp_pos(s), 1024)
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True


def write_string(s, out=None, encoding=None):
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):
            return

    if ('b' in getattr(out, 'mode', '')
            or sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        byt = s.encode(encoding or preferredencoding(), 'ignore')
        out.write(byt)
    elif hasattr(out, 'buffer'):
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        byt = s.encode(enc, 'ignore')
        out.buffer.write(byt)
    else:
        out.write(s)
    out.flush()


def bytes_to_intlist(bs):
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3
        return list(bs)
    else:
        return [ord(c) for c in bs]


def intlist_to_bytes(xs):
    if not xs:
        return b''
    return compat_struct_pack('%dB' % len(xs), *xs)


# Cross-platform file locking
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
else:
    # Some platforms, such as Jython, are missing fcntl
    try:
        import fcntl

        def _lock_file(f, exclusive):
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            fcntl.flock(f, fcntl.LOCK_UN)
    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            raise IOError(UNSUPPORTED_MSG)


class locked_file(object):
    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except IOError:
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)


def get_filesystem_encoding():
    encoding = sys.getfilesystemencoding()
    return encoding if encoding is not None else 'utf-8'


def shell_quote(args):
    quoted_args = []
    encoding = get_filesystem_encoding()
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(compat_shlex_quote(a))
    return ' '.join(quoted_args)


def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """

    url, idata = unsmuggle_url(url, {})
    data.update(idata)
    sdata = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return url + '#' + sdata


def unsmuggle_url(smug_url, default=None):
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    data = json.loads(jsond)
    return url, data
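
# Illustrative usage (hedged example):
#     smuggled = smuggle_url('http://example.com/video', {'referer': 'http://example.com'})
#     unsmuggle_url(smuggled)  -> ('http://example.com/video', {'referer': 'http://example.com'})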


def format_bytes(bytes):
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
    suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
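
# Illustrative usage (hedged examples):
#     format_bytes(1024)     -> '1.00KiB'
#     format_bytes(1536000)  -> '1.46MiB'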


def lookup_unit_table(unit_table, s):
    units_re = '|'.join(re.escape(u) for u in unit_table)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
    if not m:
        return None
    num_str = m.group('num').replace(',', '.')
    mult = unit_table[m.group('unit')]
    return int(float(num_str) * mult)


def parse_filesize(s):
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    _UNIT_TABLE = {
        'megabytes': 1000 ** 2,
        'mebibytes': 1024 ** 2,
        'gigabytes': 1000 ** 3,
        'gibibytes': 1024 ** 3,
        'terabytes': 1000 ** 4,
        'tebibytes': 1024 ** 4,
        'petabytes': 1000 ** 5,
        'pebibytes': 1024 ** 5,
        'exabytes': 1000 ** 6,
        'exbibytes': 1024 ** 6,
        'zettabytes': 1000 ** 7,
        'zebibytes': 1024 ** 7,
        'yottabytes': 1000 ** 8,
        'yobibytes': 1024 ** 8,
    }

    return lookup_unit_table(_UNIT_TABLE, s)


def parse_count(s):
    if s is None:
        return None

    s = s.strip()

    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    _UNIT_TABLE = {
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'kk': 1000 ** 2,
        'KK': 1000 ** 2,
    }

    return lookup_unit_table(_UNIT_TABLE, s)


def parse_resolution(s):
    if s is None:
        return {}

    mobj = re.search(r'\b(?P<w>\d+)\s*[xX×]\s*(?P<h>\d+)\b', s)
    if mobj:
        return {
            'width': int(mobj.group('w')),
            'height': int(mobj.group('h')),
        }

    mobj = re.search(r'\b(\d+)[pPiI]\b', s)
    if mobj:
        return {'height': int(mobj.group(1))}

    mobj = re.search(r'\b([48])[kK]\b', s)
    if mobj:
        return {'height': int(mobj.group(1)) * 540}

    return {}
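
# Illustrative usage (hedged examples):
#     parse_resolution('1920x1080')  -> {'width': 1920, 'height': 1080}
#     parse_resolution('720p')       -> {'height': 720}
#     parse_resolution('4k')         -> {'height': 2160}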


def parse_bitrate(s):
    if not isinstance(s, compat_str):
        return
    mobj = re.search(r'\b(\d+)\s*kbps', s)
    if mobj:
        return int(mobj.group(1))


def month_by_name(name, lang='en'):
    """ Return the number of a month by (locale-independently) English name """

    month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])

    try:
        return month_names.index(name) + 1
    except ValueError:
        return None


def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations """

    try:
        return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
    except ValueError:
        return None


def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
        '&amp;',
        xml_str)


def setproctitle(title):
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    except TypeError:
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        return
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
    try:
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this


def remove_start(s, start):
    return s[len(start):] if s is not None and s.startswith(start) else s


def remove_end(s, end):
    return s[:-len(end)] if s is not None and s.endswith(end) else s


def remove_quotes(s):
    if s is None or len(s) < 2:
        return s
    for quote in ('"', "'", ):
        if s[0] == quote and s[-1] == quote:
            return s[1:-1]
    return s


def url_basename(url):
    path = compat_urlparse.urlparse(url).path
    return path.strip('/').split('/')[-1]


def base_url(url):
    return re.match(r'https?://[^?#&]+/', url).group()


def urljoin(base, path):
    if isinstance(path, bytes):
        path = path.decode('utf-8')
    if not isinstance(path, compat_str) or not path:
        return None
    if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
        return path
    if isinstance(base, bytes):
        base = base.decode('utf-8')
    if not isinstance(base, compat_str) or not re.match(
            r'^(?:https?:)?//', base):
        return None
    return compat_urlparse.urljoin(base, path)
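
# Illustrative usage (hedged examples):
#     url_basename('https://example.com/a/b/c.mp4?dl=1')  -> 'c.mp4'
#     urljoin('https://example.com/a/', 'b/c.mp4')        -> 'https://example.com/a/b/c.mp4'
#     urljoin('not a url', 'b/c.mp4')                     -> None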


class HEADRequest(compat_urllib_request.Request):
    def get_method(self):
        return 'HEAD'


class PUTRequest(compat_urllib_request.Request):
    def get_method(self):
        return 'PUT'


def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError):
        return default
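
# Illustrative usage (hedged examples):
#     int_or_none('42')               -> 42
#     int_or_none('n/a')              -> None
#     int_or_none(48000, scale=1000)  -> 48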


def str_or_none(v, default=None):
    return default if v is None else compat_str(v)


def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if not isinstance(int_str, compat_str):
        return int_str
    int_str = re.sub(r'[,\.\+]', '', int_str)
    return int(int_str)


def float_or_none(v, scale=1, invscale=1, default=None):
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        return default


def bool_or_none(v, default=None):
    return v if isinstance(v, bool) else default


def strip_or_none(v, default=None):
    return v.strip() if isinstance(v, compat_str) else default


def url_or_none(url):
    if not url or not isinstance(url, compat_str):
        return None
    url = url.strip()
    return url if re.match(r'^(?:[a-zA-Z][\da-zA-Z.+-]*:)?//', url) else None


def parse_duration(s):
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    days, hours, mins, secs, ms = [None] * 5
    m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', s)
    if m:
        days, hours, mins, secs, ms = m.groups()
    else:
        m = re.match(
            r'''(?ix)(?:P?
                (?:
                    [0-9]+\s*y(?:ears?)?\s*
                )?
                (?:
                    [0-9]+\s*m(?:onths?)?\s*
                )?
                (?:
                    [0-9]+\s*w(?:eeks?)?\s*
                )?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                )?
                T)?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''', s)
        if m:
            days, hours, mins, secs, ms = m.groups()
        else:
            m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
            if m:
                hours, mins = m.groups()

    duration = 0
    if secs:
        duration += float(secs)
    if mins:
        duration += float(mins) * 60
    if hours:
        duration += float(hours) * 60 * 60
    if days:
        duration += float(days) * 24 * 60 * 60
    if ms:
        duration += float(ms)
    return duration
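
# Illustrative usage (hedged examples):
#     parse_duration('1:30')      -> 90.0
#     parse_duration('00:01:01')  -> 61.0
#     parse_duration('3 min')     -> 180.0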


def prepend_extension(filename, ext, expected_real_ext=None):
    name, real_ext = os.path.splitext(filename)
    return (
        '{0}.{1}{2}'.format(name, ext, real_ext)
        if not expected_real_ext or real_ext[1:] == expected_real_ext
        else '{0}.{1}'.format(filename, ext))


def replace_extension(filename, ext, expected_real_ext=None):
    name, real_ext = os.path.splitext(filename)
    return '{0}.{1}'.format(
        name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
        ext)


def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    try:
        subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        return False
    return exe


def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if youtube-dl is run in the background.
        # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
        out, _ = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
    except OSError:
        return False
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)


def detect_exe_version(output, version_re=None, unrecognized='present'):
    assert isinstance(output, compat_str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
    if m:
        return m.group(1)
    else:
        return unrecognized
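
# Illustrative usage (hedged example):
#     detect_exe_version('ffmpeg version 4.2.2 Copyright ...')  -> '4.2.2'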


class PagedList(object):
    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())


class OnDemandPagedList(PagedList):
    def __init__(self, pagefunc, pagesize, use_cache=True):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache
        if use_cache:
            self._cache = {}

    def getslice(self, start=0, end=None):
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = None
            if self._use_cache:
                page_results = self._cache.get(pagenum)
            if page_results is None:
                page_results = list(self._pagefunc(pagenum))
            if self._use_cache:
                self._cache[pagenum] = page_results

            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", i.e. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
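
# Illustrative usage (hedged example): pages of 10 integers each
#     pages = OnDemandPagedList(lambda n: range(n * 10, (n + 1) * 10), 10)
#     pages.getslice(5, 15)  -> [5, 6, 7, 8, 9, 10, 11, 12, 13, 14]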


class InAdvancePagedList(PagedList):
    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        res = []
        start_page = start // self._pagesize
        end_page = (
            self._pagecount if end is None else (end // self._pagesize + 1))
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res


def uppercase_escape(s):
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: unicode_escape(m.group(0))[0],
        s)


def lowercase_escape(s):
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\u[0-9a-fA-F]{4}',
        lambda m: unicode_escape(m.group(0))[0],
        s)


def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    if sys.version_info < (3, 0) and isinstance(s, compat_str):
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")


def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    url_parsed = compat_urllib_parse_urlparse(url)
    return url_parsed._replace(
        netloc=url_parsed.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(url_parsed.path),
        params=escape_rfc3986(url_parsed.params),
        query=escape_rfc3986(url_parsed.query),
        fragment=escape_rfc3986(url_parsed.fragment)
    ).geturl()


def read_batch_urls(batch_fd):
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = '\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]


def urlencode_postdata(*args, **kargs):
    return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')


def update_url_query(url, query):
    if not query:
        return url
    parsed_url = compat_urlparse.urlparse(url)
    qs = compat_parse_qs(parsed_url.query)
    qs.update(query)
    return compat_urlparse.urlunparse(parsed_url._replace(
        query=compat_urllib_parse_urlencode(qs, True)))
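
# Illustrative usage (hedged example):
#     update_url_query('http://example.com/path', {'quality': ['hd']})
#         -> 'http://example.com/path?quality=hd'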


def update_Request(req, url=None, data=None, headers={}, query={}):
    req_headers = req.headers.copy()
    req_headers.update(headers)
    req_data = data or req.data
    req_url = update_url_query(url or req.get_full_url(), query)
    req_get_method = req.get_method()
    if req_get_method == 'HEAD':
        req_type = HEADRequest
    elif req_get_method == 'PUT':
        req_type = PUTRequest
    else:
        req_type = compat_urllib_request.Request
    new_req = req_type(
        req_url, data=req_data, headers=req_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
3837 def _multipart_encode_impl(data, boundary):
3838 content_type = 'multipart/form-data; boundary=%s' % boundary
3841 for k, v in data.items():
3842 out += b'--' + boundary.encode('ascii') + b'\r\n'
3843 if isinstance(k, compat_str):
3844 k = k.encode('utf-8')
3845 if isinstance(v, compat_str):
3846 v = v.encode('utf-8')
3847 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
3848 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
3849 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
3850 if boundary.encode('ascii') in content:
3851 raise ValueError('Boundary overlaps with data')
3854 out += b'--' + boundary.encode('ascii') + b'--\r\n'
3856 return out, content_type
3859 def multipart_encode(data, boundary=None):
3861 Encode a dict to RFC 7578-compliant form-data
3864 A dict where keys and values can be either Unicode or bytes-like
3867 If specified, a unicode object to use as the boundary; otherwise
3868 a random boundary is generated.
3870 Reference: https://tools.ietf.org/html/rfc7578
3872 has_specified_boundary = boundary is not None
3875 if boundary is None:
3876 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
3879 out, content_type = _multipart_encode_impl(data, boundary)
3882 if has_specified_boundary:
3886 return out, content_type
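# Usage sketch (fixed boundary chosen here only to make the result
# deterministic; real callers normally let a random boundary be generated):
#   >>> out, ctype = multipart_encode({b'field': b'value'}, boundary='X')
#   >>> ctype
#   'multipart/form-data; boundary=X'
#   >>> out
#   b'--X\r\nContent-Disposition: form-data; name="field"\r\n\r\nvalue\r\n--X--\r\n'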
3889 def dict_get(d, key_or_keys, default=None, skip_false_values=True):
3890 if isinstance(key_or_keys, (list, tuple)):
3891 for key in key_or_keys:
3892 if key not in d or d[key] is None or skip_false_values and not d[key]:
3896 return d.get(key_or_keys, default)
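# Usage sketch (illustrative dict): None values are always skipped, and
# other falsy values are skipped unless skip_false_values is disabled:
#   >>> d = {'a': None, 'b': '', 'c': 'x'}
#   >>> dict_get(d, ('a', 'b', 'c'))
#   'x'
#   >>> dict_get(d, ('a', 'b'), default='fallback', skip_false_values=False)
#   ''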
3899 def try_get(src, getter, expected_type=None):
3900 if not isinstance(getter, (list, tuple)):
3905 except (AttributeError, KeyError, TypeError, IndexError):
3908 if expected_type is None or isinstance(v, expected_type):
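# Usage sketch (illustrative data): lookup errors yield None instead of
# raising, and a type mismatch is treated like a failed lookup:
#   >>> try_get({'a': [{'b': 42}]}, lambda x: x['a'][0]['b'], int)
#   42
#   >>> try_get({'a': 'str'}, lambda x: x['a'], int) is None
#   True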
3912 def merge_dicts(*dicts):
3914 for a_dict in dicts:
3915 for k, v in a_dict.items():
3919 or (isinstance(v, compat_str) and v
3920 and isinstance(merged[k], compat_str)
3921 and not merged[k])):
3926 def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
3927 return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
3939 TV_PARENTAL_GUIDELINES = {
3949 def parse_age_limit(s):
3951 return s if 0 <= s <= 21 else None
3952 if not isinstance(s, compat_basestring):
3954 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
3956 return int(m.group('age'))
3958 return US_RATINGS[s]
3959 m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
3961 return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
3965 def strip_jsonp(code):
3968 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
3969 (?:\s*&&\s*(?P=func_name))?
3970 \s*\(\s*(?P<callback_data>.*)\);?
3971 \s*?(?://[^\n]*)*$''',
3972 r'\g<callback_data>', code)
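# Usage sketch (illustrative callback name):
#   >>> strip_jsonp('cb({"status": "ok"});')
#   '{"status": "ok"}'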
3975 def js_to_json(code):
3976 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*'
3977 SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
3979 (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
3980 (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
3985 if v in ('true', 'false', 'null'):
3987 elif v.startswith('/*') or v.startswith('//') or v == ',':
3990 if v[0] in ("'", '"'):
3991 v = re.sub(r'(?s)\\.|"', lambda m: {
3996 }.get(m.group(0), m.group(0)), v[1:-1])
3998 for regex, base in INTEGER_TABLE:
3999 im = re.match(regex, v)
4001 i = int(im.group(1), base)
4002 return '"%d":' % i if v.endswith(':') else '%d' % i
4006 return re.sub(r'''(?sx)
4007 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
4008 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
4009 {comment}|,(?={skip}[\]}}])|
4010 (?:(?<![0-9])[eE]|[a-df-zA-DF-Z_])[.a-zA-Z_0-9]*|
4011 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
4013 '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
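# Usage sketch (illustrative snippets): bare keys are quoted, single-quoted
# strings are re-quoted and hexadecimal literals are decimalized:
#   >>> js_to_json("{abc: 'def'}")
#   '{"abc": "def"}'
#   >>> js_to_json('{"x": 0x40}')
#   '{"x": 64}'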
4016 def qualities(quality_ids):
4017 """ Get a numeric quality value out of a list of possible values """
4020 return quality_ids.index(qid)
4026 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
4029 def limit_length(s, length):
4030 """ Add ellipses to overly long strings """
4035 return s[:length - len(ELLIPSES)] + ELLIPSES
4039 def version_tuple(v):
4040 return tuple(int(e) for e in re.split(r'[-.]', v))
4043 def is_outdated_version(version, limit, assume_new=True):
4045 return not assume_new
4047 return version_tuple(version) < version_tuple(limit)
4049 return not assume_new
4052 def ytdl_is_updateable():
4053 """ Returns if youtube-dl can be updated with -U """
4054 from zipimport import zipimporter
4056 return isinstance(globals().get('__loader__'), zipimporter) or hasattr(sys, 'frozen')
4059 def args_to_str(args):
4060 # Get a short string representation for a subprocess command
4061 return ' '.join(compat_shlex_quote(a) for a in args)
4064 def error_to_compat_str(err):
4066 # On Python 2 the error's byte string message must be decoded
4067 # with the proper encoding rather than ASCII
4068 if sys.version_info[0] < 3:
4069 err_str = err_str.decode(preferredencoding())
4073 def mimetype2ext(mt):
4079 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. We use .mp3
4080 # here as it's the most popular one
4081 'audio/mpeg': 'mp3',
4086 _, _, res = mt.rpartition('/')
4087 res = res.split(';')[0].strip().lower()
4091 'smptett+xml': 'tt',
4095 'x-mp4-fragmented': 'mp4',
4096 'x-ms-sami': 'sami',
4099 'x-mpegurl': 'm3u8',
4100 'vnd.apple.mpegurl': 'm3u8',
4104 'vnd.ms-sstr+xml': 'ism',
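# Usage sketch: exact media types are mapped first, then the lowercased
# subtype (with any parameters stripped) is looked up:
#   >>> mimetype2ext('audio/mpeg')
#   'mp3'
#   >>> mimetype2ext('application/x-mpegURL')
#   'm3u8'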
4110 def parse_codecs(codecs_str):
4111 # http://tools.ietf.org/html/rfc6381
4114 split_codecs = list(filter(None, map(
4115 lambda s: s.strip(), codecs_str.strip().strip(',').split(','))))
4116 vcodec, acodec = None, None
4117 for full_codec in split_codecs:
4118 codec = full_codec.split('.')[0]
4119 if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v', 'hvc1', 'av01', 'theora'):
4122 elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
4126 write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)
4127 if not vcodec and not acodec:
4128 if len(split_codecs) == 2:
4130 'vcodec': split_codecs[0],
4131 'acodec': split_codecs[1],
4135 'vcodec': vcodec or 'none',
4136 'acodec': acodec or 'none',
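# Usage sketch (illustrative codecs string; dict repr assumes
# insertion-ordered dicts):
#   >>> parse_codecs('avc1.42E01E, mp4a.40.2')
#   {'vcodec': 'avc1.42E01E', 'acodec': 'mp4a.40.2'}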
4141 def urlhandle_detect_ext(url_handle):
4142 getheader = url_handle.headers.get
4144 cd = getheader('Content-Disposition')
4146 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
4148 e = determine_ext(m.group('filename'), default_ext=None)
4152 return mimetype2ext(getheader('Content-Type'))
4155 def encode_data_uri(data, mime_type):
4156 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
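# Usage sketch:
#   >>> encode_data_uri(b'hello', 'text/plain')
#   'data:text/plain;base64,aGVsbG8='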
4159 def age_restricted(content_limit, age_limit):
4160 """ Returns True iff the content should be blocked """
4162 if age_limit is None: # No limit set
4164 if content_limit is None:
4165 return False # Content available for everyone
4166 return age_limit < content_limit
4169 def is_html(first_bytes):
4170 """ Detect whether a file contains HTML by examining its first bytes. """
4173 (b'\xef\xbb\xbf', 'utf-8'),
4174 (b'\x00\x00\xfe\xff', 'utf-32-be'),
4175 (b'\xff\xfe\x00\x00', 'utf-32-le'),
4176 (b'\xff\xfe', 'utf-16-le'),
4177 (b'\xfe\xff', 'utf-16-be'),
4179 for bom, enc in BOMS:
4180 if first_bytes.startswith(bom):
4181 s = first_bytes[len(bom):].decode(enc, 'replace')
4184 s = first_bytes.decode('utf-8', 'replace')
4186 return re.match(r'^\s*<', s)
4189 def determine_protocol(info_dict):
4190 protocol = info_dict.get('protocol')
4191 if protocol is not None:
4194 url = info_dict['url']
4195 if url.startswith('rtmp'):
4197 elif url.startswith('mms'):
4199 elif url.startswith('rtsp'):
4202 ext = determine_ext(url)
4208 return compat_urllib_parse_urlparse(url).scheme
4211 def render_table(header_row, data):
4212 """ Render a list of rows, each as a list of values """
4213 table = [header_row] + data
4214 max_lens = [max(len(compat_str(v)) for v in col) for col in zip(*table)]
4215 format_str = ' '.join('%-' + compat_str(ml + 1) + 's' for ml in max_lens[:-1]) + '%s'
4216 return '\n'.join(format_str % tuple(row) for row in table)
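# Usage sketch (illustrative rows): every column except the last is
# left-aligned to its widest value plus one space:
#   >>> print(render_table(['id', 'name'], [[1, 'foo'], [22, 'barbaz']]))
#   id name
#   1  foo
#   22 barbaz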
4219 def _match_one(filter_part, dct):
4220 COMPARISON_OPERATORS = {
4228 operator_rex = re.compile(r'''(?x)\s*
4230 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
4232 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
4233 (?P<quote>["\'])(?P<quotedstrval>(?:\\.|(?!(?P=quote)|\\).)+?)(?P=quote)|
4234 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
4237 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
4238 m = operator_rex.search(filter_part)
4240 op = COMPARISON_OPERATORS[m.group('op')]
4241 actual_value = dct.get(m.group('key'))
4242 if (m.group('quotedstrval') is not None
4243 or m.group('strval') is not None
4244 # If the original field is a string and the matching comparison
4245 # value is a number, we should respect the origin of the original
4246 # field and process the comparison value as a string (see
4247 # https://github.com/ytdl-org/youtube-dl/issues/11082).
4248 or actual_value is not None and m.group('intval') is not None
4249 and isinstance(actual_value, compat_str)):
4250 if m.group('op') not in ('=', '!='):
4252 'Operator %s does not support string values!' % m.group('op'))
4253 comparison_value = m.group('quotedstrval') or m.group('strval') or m.group('intval')
4254 quote = m.group('quote')
4255 if quote is not None:
4256 comparison_value = comparison_value.replace(r'\%s' % quote, quote)
4259 comparison_value = int(m.group('intval'))
4261 comparison_value = parse_filesize(m.group('intval'))
4262 if comparison_value is None:
4263 comparison_value = parse_filesize(m.group('intval') + 'B')
4264 if comparison_value is None:
4266 'Invalid integer value %r in filter part %r' % (
4267 m.group('intval'), filter_part))
4268 if actual_value is None:
4269 return m.group('none_inclusive')
4270 return op(actual_value, comparison_value)
4273 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
4274 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
4276 operator_rex = re.compile(r'''(?x)\s*
4277 (?P<op>%s)\s*(?P<key>[a-z_]+)
4279 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
4280 m = operator_rex.search(filter_part)
4282 op = UNARY_OPERATORS[m.group('op')]
4283 actual_value = dct.get(m.group('key'))
4284 return op(actual_value)
4286 raise ValueError('Invalid filter part %r' % filter_part)
4289 def match_str(filter_str, dct):
4290 """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
4293 _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
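# Usage sketch (illustrative fields):
#   >>> match_str('duration > 60 & uploader = foo',
#   ...           {'duration': 90, 'uploader': 'foo'})
#   True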
4296 def match_filter_func(filter_str):
4297 def _match_func(info_dict):
4298 if match_str(filter_str, info_dict):
4301 video_title = info_dict.get('title', info_dict.get('id', 'video'))
4302 return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
4306 def parse_dfxp_time_expr(time_expr):
4310 mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
4312 return float(mobj.group('time_offset'))
4314 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
4316 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
4319 def srt_subtitles_timecode(seconds):
4320 return '%02d:%02d:%02d,%03d' % (seconds / 3600, (seconds % 3600) / 60, seconds % 60, (seconds % 1) * 1000)
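# Usage sketch:
#   >>> srt_subtitles_timecode(3661.5)
#   '01:01:01,500'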
4323 def dfxp2srt(dfxp_data):
4325 @param dfxp_data A bytes-like object containing DFXP data
4326 @returns A unicode object containing converted SRT data
4328 LEGACY_NAMESPACES = (
4329 (b'http://www.w3.org/ns/ttml', [
4330 b'http://www.w3.org/2004/11/ttaf1',
4331 b'http://www.w3.org/2006/04/ttaf1',
4332 b'http://www.w3.org/2006/10/ttaf1',
4334 (b'http://www.w3.org/ns/ttml#styling', [
4335 b'http://www.w3.org/ns/ttml#style',
4339 SUPPORTED_STYLING = [
4348 _x = functools.partial(xpath_with_ns, ns_map={
4349 'xml': 'http://www.w3.org/XML/1998/namespace',
4350 'ttml': 'http://www.w3.org/ns/ttml',
4351 'tts': 'http://www.w3.org/ns/ttml#styling',
4357 class TTMLPElementParser(object):
4359 _unclosed_elements = []
4360 _applied_styles = []
4362 def start(self, tag, attrib):
4363 if tag in (_x('ttml:br'), 'br'):
4366 unclosed_elements = []
4368 element_style_id = attrib.get('style')
4370 style.update(default_style)
4371 if element_style_id:
4372 style.update(styles.get(element_style_id, {}))
4373 for prop in SUPPORTED_STYLING:
4374 prop_val = attrib.get(_x('tts:' + prop))
4376 style[prop] = prop_val
4379 for k, v in sorted(style.items()):
4380 if self._applied_styles and self._applied_styles[-1].get(k) == v:
4383 font += ' color="%s"' % v
4384 elif k == 'fontSize':
4385 font += ' size="%s"' % v
4386 elif k == 'fontFamily':
4387 font += ' face="%s"' % v
4388 elif k == 'fontWeight' and v == 'bold':
4390 unclosed_elements.append('b')
4391 elif k == 'fontStyle' and v == 'italic':
4393 unclosed_elements.append('i')
4394 elif k == 'textDecoration' and v == 'underline':
4396 unclosed_elements.append('u')
4398 self._out += '<font' + font + '>'
4399 unclosed_elements.append('font')
4401 if self._applied_styles:
4402 applied_style.update(self._applied_styles[-1])
4403 applied_style.update(style)
4404 self._applied_styles.append(applied_style)
4405 self._unclosed_elements.append(unclosed_elements)
4408 if tag not in (_x('ttml:br'), 'br'):
4409 unclosed_elements = self._unclosed_elements.pop()
4410 for element in reversed(unclosed_elements):
4411 self._out += '</%s>' % element
4412 if unclosed_elements and self._applied_styles:
4413 self._applied_styles.pop()
4415 def data(self, data):
4419 return self._out.strip()
4421 def parse_node(node):
4422 target = TTMLPElementParser()
4423 parser = xml.etree.ElementTree.XMLParser(target=target)
4424 parser.feed(xml.etree.ElementTree.tostring(node))
4425 return parser.close()
4427 for k, v in LEGACY_NAMESPACES:
4429 dfxp_data = dfxp_data.replace(ns, k)
4431 dfxp = compat_etree_fromstring(dfxp_data)
4433 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
4436 raise ValueError('Invalid dfxp/TTML subtitle')
4440 for style in dfxp.findall(_x('.//ttml:style')):
4441 style_id = style.get('id') or style.get(_x('xml:id'))
4444 parent_style_id = style.get('style')
4446 if parent_style_id not in styles:
4449 styles[style_id] = styles[parent_style_id].copy()
4450 for prop in SUPPORTED_STYLING:
4451 prop_val = style.get(_x('tts:' + prop))
4453 styles.setdefault(style_id, {})[prop] = prop_val
4459 for p in ('body', 'div'):
4460 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
4463 style = styles.get(ele.get('style'))
4466 default_style.update(style)
4468 for para, index in zip(paras, itertools.count(1)):
4469 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
4470 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
4471 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
4472 if begin_time is None:
4477 end_time = begin_time + dur
4478 out.append('%d\n%s --> %s\n%s\n\n' % (
4480 srt_subtitles_timecode(begin_time),
4481 srt_subtitles_timecode(end_time),
4487 def cli_option(params, command_option, param):
4488 param = params.get(param)
4490 param = compat_str(param)
4491 return [command_option, param] if param is not None else []
4494 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
4495 param = params.get(param)
4498 assert isinstance(param, bool)
4500 return [command_option + separator + (true_value if param else false_value)]
4501 return [command_option, true_value if param else false_value]
4504 def cli_valueless_option(params, command_option, param, expected_value=True):
4505 param = params.get(param)
4506 return [command_option] if param == expected_value else []
4509 def cli_configuration_args(params, param, default=[]):
4510 ex_args = params.get(param)
4513 assert isinstance(ex_args, list)
4517 class ISO639Utils(object):
4518 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
4577 'iw': 'heb', # Replaced by he in 1989 revision
4587 'in': 'ind', # Replaced by id in 1989 revision
4702 'ji': 'yid', # Replaced by yi in 1989 revision
4710 def short2long(cls, code):
4711 """Convert language code from ISO 639-1 to ISO 639-2/T"""
4712 return cls._lang_map.get(code[:2])
4715 def long2short(cls, code):
4716 """Convert language code from ISO 639-2/T to ISO 639-1"""
4717 for short_name, long_name in cls._lang_map.items():
4718 if long_name == code:
4722 class ISO3166Utils(object):
4723 # From http://data.okfn.org/data/core/country-list
4725 'AF': 'Afghanistan',
4726 'AX': 'Åland Islands',
4729 'AS': 'American Samoa',
4734 'AG': 'Antigua and Barbuda',
4751 'BO': 'Bolivia, Plurinational State of',
4752 'BQ': 'Bonaire, Sint Eustatius and Saba',
4753 'BA': 'Bosnia and Herzegovina',
4755 'BV': 'Bouvet Island',
4757 'IO': 'British Indian Ocean Territory',
4758 'BN': 'Brunei Darussalam',
4760 'BF': 'Burkina Faso',
4766 'KY': 'Cayman Islands',
4767 'CF': 'Central African Republic',
4771 'CX': 'Christmas Island',
4772 'CC': 'Cocos (Keeling) Islands',
4776 'CD': 'Congo, the Democratic Republic of the',
4777 'CK': 'Cook Islands',
4779 'CI': 'Côte d\'Ivoire',
4784 'CZ': 'Czech Republic',
4788 'DO': 'Dominican Republic',
4791 'SV': 'El Salvador',
4792 'GQ': 'Equatorial Guinea',
4796 'FK': 'Falkland Islands (Malvinas)',
4797 'FO': 'Faroe Islands',
4801 'GF': 'French Guiana',
4802 'PF': 'French Polynesia',
4803 'TF': 'French Southern Territories',
4818 'GW': 'Guinea-Bissau',
4821 'HM': 'Heard Island and McDonald Islands',
4822 'VA': 'Holy See (Vatican City State)',
4829 'IR': 'Iran, Islamic Republic of',
4832 'IM': 'Isle of Man',
4842 'KP': 'Korea, Democratic People\'s Republic of',
4843 'KR': 'Korea, Republic of',
4846 'LA': 'Lao People\'s Democratic Republic',
4852 'LI': 'Liechtenstein',
4856 'MK': 'Macedonia, the Former Yugoslav Republic of',
4863 'MH': 'Marshall Islands',
4869 'FM': 'Micronesia, Federated States of',
4870 'MD': 'Moldova, Republic of',
4881 'NL': 'Netherlands',
4882 'NC': 'New Caledonia',
4883 'NZ': 'New Zealand',
4888 'NF': 'Norfolk Island',
4889 'MP': 'Northern Mariana Islands',
4894 'PS': 'Palestine, State of',
4896 'PG': 'Papua New Guinea',
4899 'PH': 'Philippines',
4903 'PR': 'Puerto Rico',
4907 'RU': 'Russian Federation',
4909 'BL': 'Saint Barthélemy',
4910 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4911 'KN': 'Saint Kitts and Nevis',
4912 'LC': 'Saint Lucia',
4913 'MF': 'Saint Martin (French part)',
4914 'PM': 'Saint Pierre and Miquelon',
4915 'VC': 'Saint Vincent and the Grenadines',
4918 'ST': 'Sao Tome and Principe',
4919 'SA': 'Saudi Arabia',
4923 'SL': 'Sierra Leone',
4925 'SX': 'Sint Maarten (Dutch part)',
4928 'SB': 'Solomon Islands',
4930 'ZA': 'South Africa',
4931 'GS': 'South Georgia and the South Sandwich Islands',
4932 'SS': 'South Sudan',
4937 'SJ': 'Svalbard and Jan Mayen',
4940 'CH': 'Switzerland',
4941 'SY': 'Syrian Arab Republic',
4942 'TW': 'Taiwan, Province of China',
4944 'TZ': 'Tanzania, United Republic of',
4946 'TL': 'Timor-Leste',
4950 'TT': 'Trinidad and Tobago',
4953 'TM': 'Turkmenistan',
4954 'TC': 'Turks and Caicos Islands',
4958 'AE': 'United Arab Emirates',
4959 'GB': 'United Kingdom',
4960 'US': 'United States',
4961 'UM': 'United States Minor Outlying Islands',
4965 'VE': 'Venezuela, Bolivarian Republic of',
4967 'VG': 'Virgin Islands, British',
4968 'VI': 'Virgin Islands, U.S.',
4969 'WF': 'Wallis and Futuna',
4970 'EH': 'Western Sahara',
4977 def short2full(cls, code):
4978 """Convert an ISO 3166-2 country code to the corresponding full name"""
4979 return cls._country_map.get(code.upper())
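# Usage sketch:
#   >>> ISO3166Utils.short2full('gb')
#   'United Kingdom'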
4982 class GeoUtils(object):
4983 # Major IPv4 address blocks per country
4985 'AD': '46.172.224.0/19',
4986 'AE': '94.200.0.0/13',
4987 'AF': '149.54.0.0/17',
4988 'AG': '209.59.64.0/18',
4989 'AI': '204.14.248.0/21',
4990 'AL': '46.99.0.0/16',
4991 'AM': '46.70.0.0/15',
4992 'AO': '105.168.0.0/13',
4993 'AP': '182.50.184.0/21',
4994 'AQ': '23.154.160.0/24',
4995 'AR': '181.0.0.0/12',
4996 'AS': '202.70.112.0/20',
4997 'AT': '77.116.0.0/14',
4998 'AU': '1.128.0.0/11',
4999 'AW': '181.41.0.0/18',
5000 'AX': '185.217.4.0/22',
5001 'AZ': '5.197.0.0/16',
5002 'BA': '31.176.128.0/17',
5003 'BB': '65.48.128.0/17',
5004 'BD': '114.130.0.0/16',
5006 'BF': '102.178.0.0/15',
5007 'BG': '95.42.0.0/15',
5008 'BH': '37.131.0.0/17',
5009 'BI': '154.117.192.0/18',
5010 'BJ': '137.255.0.0/16',
5011 'BL': '185.212.72.0/23',
5012 'BM': '196.12.64.0/18',
5013 'BN': '156.31.0.0/16',
5014 'BO': '161.56.0.0/16',
5015 'BQ': '161.0.80.0/20',
5016 'BR': '191.128.0.0/12',
5017 'BS': '24.51.64.0/18',
5018 'BT': '119.2.96.0/19',
5019 'BW': '168.167.0.0/16',
5020 'BY': '178.120.0.0/13',
5021 'BZ': '179.42.192.0/18',
5022 'CA': '99.224.0.0/11',
5023 'CD': '41.243.0.0/16',
5024 'CF': '197.242.176.0/21',
5025 'CG': '160.113.0.0/16',
5026 'CH': '85.0.0.0/13',
5027 'CI': '102.136.0.0/14',
5028 'CK': '202.65.32.0/19',
5029 'CL': '152.172.0.0/14',
5030 'CM': '102.244.0.0/14',
5031 'CN': '36.128.0.0/10',
5032 'CO': '181.240.0.0/12',
5033 'CR': '201.192.0.0/12',
5034 'CU': '152.206.0.0/15',
5035 'CV': '165.90.96.0/19',
5036 'CW': '190.88.128.0/17',
5037 'CY': '31.153.0.0/16',
5038 'CZ': '88.100.0.0/14',
5040 'DJ': '197.241.0.0/17',
5041 'DK': '87.48.0.0/12',
5042 'DM': '192.243.48.0/20',
5043 'DO': '152.166.0.0/15',
5044 'DZ': '41.96.0.0/12',
5045 'EC': '186.68.0.0/15',
5046 'EE': '90.190.0.0/15',
5047 'EG': '156.160.0.0/11',
5048 'ER': '196.200.96.0/20',
5049 'ES': '88.0.0.0/11',
5050 'ET': '196.188.0.0/14',
5051 'EU': '2.16.0.0/13',
5052 'FI': '91.152.0.0/13',
5053 'FJ': '144.120.0.0/16',
5054 'FK': '80.73.208.0/21',
5055 'FM': '119.252.112.0/20',
5056 'FO': '88.85.32.0/19',
5058 'GA': '41.158.0.0/15',
5060 'GD': '74.122.88.0/21',
5061 'GE': '31.146.0.0/16',
5062 'GF': '161.22.64.0/18',
5063 'GG': '62.68.160.0/19',
5064 'GH': '154.160.0.0/12',
5065 'GI': '95.164.0.0/16',
5066 'GL': '88.83.0.0/19',
5067 'GM': '160.182.0.0/15',
5068 'GN': '197.149.192.0/18',
5069 'GP': '104.250.0.0/19',
5070 'GQ': '105.235.224.0/20',
5071 'GR': '94.64.0.0/13',
5072 'GT': '168.234.0.0/16',
5073 'GU': '168.123.0.0/16',
5074 'GW': '197.214.80.0/20',
5075 'GY': '181.41.64.0/18',
5076 'HK': '113.252.0.0/14',
5077 'HN': '181.210.0.0/16',
5078 'HR': '93.136.0.0/13',
5079 'HT': '148.102.128.0/17',
5080 'HU': '84.0.0.0/14',
5081 'ID': '39.192.0.0/10',
5082 'IE': '87.32.0.0/12',
5083 'IL': '79.176.0.0/13',
5084 'IM': '5.62.80.0/20',
5085 'IN': '117.192.0.0/10',
5086 'IO': '203.83.48.0/21',
5087 'IQ': '37.236.0.0/14',
5088 'IR': '2.176.0.0/12',
5089 'IS': '82.221.0.0/16',
5090 'IT': '79.0.0.0/10',
5091 'JE': '87.244.64.0/18',
5092 'JM': '72.27.0.0/17',
5093 'JO': '176.29.0.0/16',
5094 'JP': '133.0.0.0/8',
5095 'KE': '105.48.0.0/12',
5096 'KG': '158.181.128.0/17',
5097 'KH': '36.37.128.0/17',
5098 'KI': '103.25.140.0/22',
5099 'KM': '197.255.224.0/20',
5100 'KN': '198.167.192.0/19',
5101 'KP': '175.45.176.0/22',
5102 'KR': '175.192.0.0/10',
5103 'KW': '37.36.0.0/14',
5104 'KY': '64.96.0.0/15',
5105 'KZ': '2.72.0.0/13',
5106 'LA': '115.84.64.0/18',
5107 'LB': '178.135.0.0/16',
5108 'LC': '24.92.144.0/20',
5109 'LI': '82.117.0.0/19',
5110 'LK': '112.134.0.0/15',
5111 'LR': '102.183.0.0/16',
5112 'LS': '129.232.0.0/17',
5113 'LT': '78.56.0.0/13',
5114 'LU': '188.42.0.0/16',
5115 'LV': '46.109.0.0/16',
5116 'LY': '41.252.0.0/14',
5117 'MA': '105.128.0.0/11',
5118 'MC': '88.209.64.0/18',
5119 'MD': '37.246.0.0/16',
5120 'ME': '178.175.0.0/17',
5121 'MF': '74.112.232.0/21',
5122 'MG': '154.126.0.0/17',
5123 'MH': '117.103.88.0/21',
5124 'MK': '77.28.0.0/15',
5125 'ML': '154.118.128.0/18',
5126 'MM': '37.111.0.0/17',
5127 'MN': '49.0.128.0/17',
5128 'MO': '60.246.0.0/16',
5129 'MP': '202.88.64.0/20',
5130 'MQ': '109.203.224.0/19',
5131 'MR': '41.188.64.0/18',
5132 'MS': '208.90.112.0/22',
5133 'MT': '46.11.0.0/16',
5134 'MU': '105.16.0.0/12',
5135 'MV': '27.114.128.0/18',
5136 'MW': '102.70.0.0/15',
5137 'MX': '187.192.0.0/11',
5138 'MY': '175.136.0.0/13',
5139 'MZ': '197.218.0.0/15',
5140 'NA': '41.182.0.0/16',
5141 'NC': '101.101.0.0/18',
5142 'NE': '197.214.0.0/18',
5143 'NF': '203.17.240.0/22',
5144 'NG': '105.112.0.0/12',
5145 'NI': '186.76.0.0/15',
5146 'NL': '145.96.0.0/11',
5147 'NO': '84.208.0.0/13',
5148 'NP': '36.252.0.0/15',
5149 'NR': '203.98.224.0/19',
5150 'NU': '49.156.48.0/22',
5151 'NZ': '49.224.0.0/14',
5152 'OM': '5.36.0.0/15',
5153 'PA': '186.72.0.0/15',
5154 'PE': '186.160.0.0/14',
5155 'PF': '123.50.64.0/18',
5156 'PG': '124.240.192.0/19',
5157 'PH': '49.144.0.0/13',
5158 'PK': '39.32.0.0/11',
5159 'PL': '83.0.0.0/11',
5160 'PM': '70.36.0.0/20',
5161 'PR': '66.50.0.0/16',
5162 'PS': '188.161.0.0/16',
5163 'PT': '85.240.0.0/13',
5164 'PW': '202.124.224.0/20',
5165 'PY': '181.120.0.0/14',
5166 'QA': '37.210.0.0/15',
5167 'RE': '102.35.0.0/16',
5168 'RO': '79.112.0.0/13',
5169 'RS': '93.86.0.0/15',
5170 'RU': '5.136.0.0/13',
5171 'RW': '41.186.0.0/16',
5172 'SA': '188.48.0.0/13',
5173 'SB': '202.1.160.0/19',
5174 'SC': '154.192.0.0/11',
5175 'SD': '102.120.0.0/13',
5176 'SE': '78.64.0.0/12',
5177 'SG': '8.128.0.0/10',
5178 'SI': '188.196.0.0/14',
5179 'SK': '78.98.0.0/15',
5180 'SL': '102.143.0.0/17',
5181 'SM': '89.186.32.0/19',
5182 'SN': '41.82.0.0/15',
5183 'SO': '154.115.192.0/18',
5184 'SR': '186.179.128.0/17',
5185 'SS': '105.235.208.0/21',
5186 'ST': '197.159.160.0/19',
5187 'SV': '168.243.0.0/16',
5188 'SX': '190.102.0.0/20',
5190 'SZ': '41.84.224.0/19',
5191 'TC': '65.255.48.0/20',
5192 'TD': '154.68.128.0/19',
5193 'TG': '196.168.0.0/14',
5194 'TH': '171.96.0.0/13',
5195 'TJ': '85.9.128.0/18',
5196 'TK': '27.96.24.0/21',
5197 'TL': '180.189.160.0/20',
5198 'TM': '95.85.96.0/19',
5199 'TN': '197.0.0.0/11',
5200 'TO': '175.176.144.0/21',
5201 'TR': '78.160.0.0/11',
5202 'TT': '186.44.0.0/15',
5203 'TV': '202.2.96.0/19',
5204 'TW': '120.96.0.0/11',
5205 'TZ': '156.156.0.0/14',
5206 'UA': '37.52.0.0/14',
5207 'UG': '102.80.0.0/13',
5209 'UY': '167.56.0.0/13',
5210 'UZ': '84.54.64.0/18',
5211 'VA': '212.77.0.0/19',
5212 'VC': '207.191.240.0/21',
5213 'VE': '186.88.0.0/13',
5214 'VG': '66.81.192.0/20',
5215 'VI': '146.226.0.0/16',
5216 'VN': '14.160.0.0/11',
5217 'VU': '202.80.32.0/20',
5218 'WF': '117.20.32.0/21',
5219 'WS': '202.4.32.0/19',
5220 'YE': '134.35.0.0/16',
5221 'YT': '41.242.116.0/22',
5222 'ZA': '41.0.0.0/11',
5223 'ZM': '102.144.0.0/13',
5224 'ZW': '102.177.192.0/18',
5228 def random_ipv4(cls, code_or_block):
5229 if len(code_or_block) == 2:
5230 block = cls._country_ip_map.get(code_or_block.upper())
5234 block = code_or_block
5235 addr, preflen = block.split('/')
5236 addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
5237 addr_max = addr_min | (0xffffffff >> int(preflen))
5238 return compat_str(socket.inet_ntoa(
5239 compat_struct_pack('!L', random.randint(addr_min, addr_max))))
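# Usage sketch: the address is drawn uniformly from the country's block
# (or from an explicit CIDR block), so only the network prefix is stable:
#   >>> GeoUtils.random_ipv4('JP').startswith('133.')
#   True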
5242 class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
5243 def __init__(self, proxies=None):
5244 # Set default handlers
5245 for type in ('http', 'https'):
5246 setattr(self, '%s_open' % type,
5247 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
5248 meth(r, proxy, type))
5249 compat_urllib_request.ProxyHandler.__init__(self, proxies)
5251 def proxy_open(self, req, proxy, type):
5252 req_proxy = req.headers.get('Ytdl-request-proxy')
5253 if req_proxy is not None:
5255 del req.headers['Ytdl-request-proxy']
5257 if proxy == '__noproxy__':
5258 return None # No Proxy
5259 if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
5260 req.add_header('Ytdl-socks-proxy', proxy)
5261 # youtube-dl's http/https handlers wrap the socket with SOCKS
5263 return compat_urllib_request.ProxyHandler.proxy_open(
5264 self, req, proxy, type)
5267 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
5268 # released into the Public Domain
5269 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
5271 def long_to_bytes(n, blocksize=0):
5272 """long_to_bytes(n:long, blocksize:int) : string
5273 Convert a long integer to a byte string.
5275 If optional blocksize is given and greater than zero, pad the front of the
5276 byte string with binary zeros so that the length is a multiple of
5279 # after much testing, this algorithm was deemed to be the fastest
5283 s = compat_struct_pack('>I', n & 0xffffffff) + s
5285 # strip off leading zeros
5286 for i in range(len(s)):
5287 if s[i] != b'\000'[0]:
5290 # only happens when n == 0
5294 # add back some pad bytes. this could be done more efficiently w.r.t. the
5295 # de-padding being done above, but sigh...
5296 if blocksize > 0 and len(s) % blocksize:
5297 s = (blocksize - len(s) % blocksize) * b'\000' + s
5301 def bytes_to_long(s):
5302 """bytes_to_long(string) : long
5303 Convert a byte string to a long integer.
5305 This is (essentially) the inverse of long_to_bytes().
5310 extra = (4 - length % 4)
5311 s = b'\000' * extra + s
5312 length = length + extra
5313 for i in range(0, length, 4):
5314 acc = (acc << 32) + compat_struct_unpack('>I', s[i:i + 4])[0]
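# Usage sketch: the two helpers are inverses of each other (up to leading
# zero padding):
#   >>> bytes_to_long(b'\x01\x00')
#   256
#   >>> long_to_bytes(256)
#   b'\x01\x00'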
5318 def ohdave_rsa_encrypt(data, exponent, modulus):
5320 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
5323 data: data to encrypt, bytes-like object
5324 exponent, modulus: parameter e and N of RSA algorithm, both integer
5325 Output: hex string of encrypted data
5327 Limitation: supports one block encryption only
5330 payload = int(binascii.hexlify(data[::-1]), 16)
5331 encrypted = pow(payload, exponent, modulus)
5332 return '%x' % encrypted
5335 def pkcs1pad(data, length):
5337 Pad input data using the PKCS#1 v1.5 scheme
5339 @param {int[]} data input data
5340 @param {int} length target length
5341 @returns {int[]} padded data
5343 if len(data) > length - 11:
5344 raise ValueError('Input data too long for PKCS#1 padding')
5346 pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]  # padding bytes must be non-zero (PKCS#1 v1.5)
5347 return [0, 2] + pseudo_random + [0] + data
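# Usage sketch (illustrative input): the result is always `length` ints,
# framed as 0x00 0x02 <non-zero padding> 0x00 <data>:
#   >>> padded = pkcs1pad([0x41, 0x42], 16)
#   >>> len(padded), padded[:2], padded[-3:]
#   (16, [0, 2], [0, 65, 66])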
5350 def encode_base_n(num, n, table=None):
5351 FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
5353 table = FULL_TABLE[:n]
5356 raise ValueError('base %d exceeds table length %d' % (n, len(table)))
5363 ret = table[num % n] + ret
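# Usage sketch: with the default table this behaves like a positional
# base-n encoder:
#   >>> encode_base_n(255, 16)
#   'ff'
#   >>> encode_base_n(0, 30)
#   '0'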
5368 def decode_packed_codes(code):
5369 mobj = re.search(PACKED_CODES_RE, code)
5370 obfuscated_code, base, count, symbols = mobj.groups()
5373 symbols = symbols.split('|')
5378 base_n_count = encode_base_n(count, base)
5379 symbol_table[base_n_count] = symbols[count] or base_n_count
5382 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
5386 def caesar(s, alphabet, shift):
5391 alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
5396 return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
5399 def parse_m3u8_attributes(attrib):
5401 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
5402 if val.startswith('"'):
5408 def urshift(val, n):
5409 return val >> n if val >= 0 else (val + 0x100000000) >> n
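# Usage sketch: mirrors JavaScript's unsigned right shift (>>>) for 32-bit
# inputs, where negative values wrap around:
#   >>> urshift(-1, 28)
#   15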
5412 # Based on png2str() written by @gdkchan and improved by @yokrysty
5413 # Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
5414 def decode_png(png_data):
5415 # Reference: https://www.w3.org/TR/PNG/
5416 header = png_data[8:]
5418 if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
5419 raise IOError('Not a valid PNG file.')
5421 int_map = {1: '>B', 2: '>H', 4: '>I'}
5422 unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]
5427 length = unpack_integer(header[:4])
5430 chunk_type = header[:4]
5433 chunk_data = header[:length]
5434 header = header[length:]
5436 header = header[4:] # Skip CRC
5444 ihdr = chunks[0]['data']
5446 width = unpack_integer(ihdr[:4])
5447 height = unpack_integer(ihdr[4:8])
5451 for chunk in chunks:
5452 if chunk['type'] == b'IDAT':
5453 idat += chunk['data']
5456 raise IOError('Unable to read PNG data.')
5458 decompressed_data = bytearray(zlib.decompress(idat))
5463 def _get_pixel(idx):
5468 for y in range(height):
5469 basePos = y * (1 + stride)
5470 filter_type = decompressed_data[basePos]
5474 pixels.append(current_row)
5476 for x in range(stride):
5477 color = decompressed_data[1 + basePos + x]
5478 basex = y * stride + x
5483 left = _get_pixel(basex - 3)
5485 up = _get_pixel(basex - stride)
5487 if filter_type == 1: # Sub
5488 color = (color + left) & 0xff
5489 elif filter_type == 2: # Up
5490 color = (color + up) & 0xff
5491 elif filter_type == 3: # Average
5492 color = (color + ((left + up) >> 1)) & 0xff
5493 elif filter_type == 4: # Paeth
5499 c = _get_pixel(basex - stride - 3)
5507 if pa <= pb and pa <= pc:
5508 color = (color + a) & 0xff
5510 color = (color + b) & 0xff
5512 color = (color + c) & 0xff
5514 current_row.append(color)
5516 return width, height, pixels
5519 def write_xattr(path, key, value):
5520 # This mess below finds the best xattr tool for the job
5522 # try the pyxattr module...
5525 if hasattr(xattr, 'set'): # pyxattr
5526 # Unicode arguments are not supported in python-pyxattr until
5528 # See https://github.com/ytdl-org/youtube-dl/issues/5498
5529 pyxattr_required_version = '0.5.0'
5530 if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
5531 # TODO: fallback to CLI tools
5532 raise XAttrUnavailableError(
5533 'python-pyxattr is detected but is too old. '
5534 'youtube-dl requires %s or above while your version is %s. '
5535 'Falling back to other xattr implementations' % (
5536 pyxattr_required_version, xattr.__version__))
5538 setxattr = xattr.set
5540 setxattr = xattr.setxattr
5543 setxattr(path, key, value)
5544 except EnvironmentError as e:
5545 raise XAttrMetadataError(e.errno, e.strerror)
5548 if compat_os_name == 'nt':
5549 # Write xattrs to NTFS Alternate Data Streams:
5550 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
5551 assert ':' not in key
5552 assert os.path.exists(path)
5554 ads_fn = path + ':' + key
5556 with open(ads_fn, 'wb') as f:
5558 except EnvironmentError as e:
5559 raise XAttrMetadataError(e.errno, e.strerror)
5561 user_has_setfattr = check_executable('setfattr', ['--version'])
5562 user_has_xattr = check_executable('xattr', ['-h'])
5564 if user_has_setfattr or user_has_xattr:
5566 value = value.decode('utf-8')
5567 if user_has_setfattr:
5568 executable = 'setfattr'
5569 opts = ['-n', key, '-v', value]
5570 elif user_has_xattr:
5571 executable = 'xattr'
5572 opts = ['-w', key, value]
5574 cmd = ([encodeFilename(executable, True)]
5575 + [encodeArgument(o) for o in opts]
5576 + [encodeFilename(path, True)])
5579 p = subprocess.Popen(
5580 cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
5581 except EnvironmentError as e:
5582 raise XAttrMetadataError(e.errno, e.strerror)
5583 stdout, stderr = p.communicate()
5584 stderr = stderr.decode('utf-8', 'replace')
5585 if p.returncode != 0:
5586 raise XAttrMetadataError(p.returncode, stderr)
5589 # On Unix, but pyxattr, setfattr and xattr are all unavailable.
5590 if sys.platform.startswith('linux'):
5591 raise XAttrUnavailableError(
5592 "Couldn't find a tool to set the xattrs. "
5593 "Install either the python 'pyxattr' or 'xattr' "
5594 "modules, or the GNU 'attr' package "
5595 "(which contains the 'setfattr' tool).")
5597 raise XAttrUnavailableError(
5598 "Couldn't find a tool to set the xattrs. "
5599 "Install either the python 'xattr' module, "
5600 "or the 'xattr' binary.")
5603 def random_birthday(year_field, month_field, day_field):
5604 start_date = datetime.date(1950, 1, 1)
5605 end_date = datetime.date(1995, 12, 31)
5606 offset = random.randint(0, (end_date - start_date).days)
5607 random_date = start_date + datetime.timedelta(offset)
5609 year_field: str(random_date.year),
5610 month_field: str(random_date.month),
5611 day_field: str(random_date.day),