4 from __future__ import unicode_literals
34 import xml.etree.ElementTree
41 compat_etree_fromstring,
43 compat_html_entities_html5,
49 compat_socket_create_connection,
55 compat_urllib_parse_urlencode,
56 compat_urllib_parse_urlparse,
57 compat_urllib_parse_unquote_plus,
58 compat_urllib_request,
def register_socks_protocols():
    """Teach urlparse that SOCKS schemes carry a network location.

    In Python < 2.6.5, urlsplit() suffers from bug
    https://bugs.python.org/issue7904: URLs whose protocol is not listed in
    urlparse.uses_netloc are not handled correctly.
    """
    netloc_schemes = compat_urlparse.uses_netloc
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in netloc_schemes:
            netloc_schemes.append(scheme)
78 # This is not clearly defined otherwise
79 compiled_regex_type = type(re.compile(''))
82 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/47.0 (Chrome)',
83 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
84 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
85 'Accept-Encoding': 'gzip, deflate',
86 'Accept-Language': 'en-us,en;q=0.5',
91 'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
# English month names, index 0 = January; used when parsing textual dates.
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']
102 'en': ENGLISH_MONTH_NAMES,
104 'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
105 'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
109 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
110 'flv', 'f4v', 'f4a', 'f4b',
111 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
112 'mkv', 'mka', 'mk3d',
121 'f4f', 'f4m', 'm3u8', 'smil')
# needed for sanitizing filenames in restricted mode
# Maps each accented character to an ASCII replacement: a single letter or a
# digraph such as 'AE'/'oe'/'ss' (the bracketed list entries in the chain).
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))
149 '%Y-%m-%d %H:%M:%S.%f',
152 '%Y-%m-%dT%H:%M:%SZ',
153 '%Y-%m-%dT%H:%M:%S.%fZ',
154 '%Y-%m-%dT%H:%M:%S.%f0Z',
156 '%Y-%m-%dT%H:%M:%S.%f',
159 '%b %d %Y at %H:%M:%S',
162 DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
163 DATE_FORMATS_DAY_FIRST.extend([
172 DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
173 DATE_FORMATS_MONTH_FIRST.extend([
181 PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
184 def preferredencoding():
185 """Get preferred encoding.
187 Returns the best encoding scheme for the system, based on
188 locale.getpreferredencoding() and some further tweaks.
191 pref = locale.getpreferredencoding()
199 def write_json_file(obj, fn):
200 """ Encode obj as JSON and write it to fn, atomically if possible """
202 fn = encodeFilename(fn)
203 if sys.version_info < (3, 0) and sys.platform != 'win32':
204 encoding = get_filesystem_encoding()
205 # os.path.basename returns a bytes object, but NamedTemporaryFile
206 # will fail if the filename contains non ascii characters unless we
207 # use a unicode object
208 path_basename = lambda f: os.path.basename(fn).decode(encoding)
209 # the same for os.path.dirname
210 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
212 path_basename = os.path.basename
213 path_dirname = os.path.dirname
217 'prefix': path_basename(fn) + '.',
218 'dir': path_dirname(fn),
222 # In Python 2.x, json.dump expects a bytestream.
223 # In Python 3.x, it writes to a character stream
224 if sys.version_info < (3, 0):
232 tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
237 if sys.platform == 'win32':
238 # Need to remove existing file on Windows, else os.rename raises
239 # WindowsError or FileExistsError.
244 os.rename(tf.name, fn)
253 if sys.version_info >= (2, 7):
254 def find_xpath_attr(node, xpath, key, val=None):
255 """ Find the xpath xpath[@key=val] """
256 assert re.match(r'^[a-zA-Z_-]+$', key)
257 expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
258 return node.find(expr)
260 def find_xpath_attr(node, xpath, key, val=None):
261 for f in node.findall(compat_xpath(xpath)):
262 if key not in f.attrib:
264 if val is None or f.attrib.get(key) == val:
268 # On python2.6 the xml.etree.ElementTree.Element methods don't support
269 # the namespace parameter
272 def xpath_with_ns(path, ns_map):
273 components = [c.split(':') for c in path.split('/')]
277 replaced.append(c[0])
280 replaced.append('{%s}%s' % (ns_map[ns], tag))
281 return '/'.join(replaced)
284 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
285 def _find_xpath(xpath):
286 return node.find(compat_xpath(xpath))
288 if isinstance(xpath, (str, compat_str)):
289 n = _find_xpath(xpath)
297 if default is not NO_DEFAULT:
300 name = xpath if name is None else name
301 raise ExtractorError('Could not find XML element %s' % name)
307 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
308 n = xpath_element(node, xpath, name, fatal=fatal, default=default)
309 if n is None or n == default:
312 if default is not NO_DEFAULT:
315 name = xpath if name is None else name
316 raise ExtractorError('Could not find XML element\'s text %s' % name)
322 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
323 n = find_xpath_attr(node, xpath, key)
325 if default is not NO_DEFAULT:
328 name = '%s[@%s]' % (xpath, key) if name is None else name
329 raise ExtractorError('Could not find XML attribute %s' % name)
def get_element_by_id(id, html):
    """Return the inner content of the tag whose id attribute equals *id*.

    Thin wrapper over the generic attribute matcher.
    """
    return get_element_by_attribute('id', id, html)
def get_element_by_class(class_name, html):
    """Return the content of the first tag carrying the given class, or None."""
    matches = get_elements_by_class(class_name, html)
    if matches:
        return matches[0]
    return None
def get_element_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of the first tag whose *attribute* matches *value*,
    or None when nothing matches."""
    matches = get_elements_by_attribute(attribute, value, html, escape_value)
    if matches:
        return matches[0]
    return None
def get_elements_by_class(class_name, html):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    # Match the class name as a whole word anywhere inside the (quoted)
    # class attribute value; the pattern is pre-escaped, so escaping is off.
    class_pattern = r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name)
    return get_elements_by_attribute('class', class_pattern, html, escape_value=False)
358 def get_elements_by_attribute(attribute, value, html, escape_value=True):
359 """Return the content of the tag with the specified attribute in the passed HTML document"""
361 value = re.escape(value) if escape_value else value
364 for m in re.finditer(r'''(?xs)
366 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
368 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
372 ''' % (re.escape(attribute), value), html):
373 res = m.group('content')
375 if res.startswith('"') or res.startswith("'"):
378 retlist.append(unescapeHTML(res))
383 class HTMLAttributeParser(compat_HTMLParser):
384 """Trivial HTML parser to gather the attributes for a single element"""
387 compat_HTMLParser.__init__(self)
389 def handle_starttag(self, tag, attrs):
390 self.attrs = dict(attrs)
393 def extract_attributes(html_element):
394 """Given a string for an HTML element such as
396 a="foo" B="bar" c="&98;az" d=boz
397 empty= noval entity="&"
400 Decode and return a dictionary of attributes.
402 'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
403 'empty': '', 'noval': None, 'entity': '&',
404 'sq': '"', 'dq': '\''
406 NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
407 but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
409 parser = HTMLAttributeParser()
410 parser.feed(html_element)
415 def clean_html(html):
416 """Clean an HTML snippet into a readable string"""
418 if html is None: # Convenience for sanitizing descriptions etc.
422 html = html.replace('\n', ' ')
423 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
424 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
426 html = re.sub('<.*?>', '', html)
427 # Replace html entities
428 html = unescapeHTML(html)
432 def sanitize_open(filename, open_mode):
433 """Try to open the given filename, and slightly tweak it if this fails.
435 Attempts to open the given filename. If this fails, it tries to change
436 the filename slightly, step by step, until it's either able to open it
437 or it fails and raises a final exception, like the standard open()
440 It returns the tuple (stream, definitive_file_name).
444 if sys.platform == 'win32':
446 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
447 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
448 stream = open(encodeFilename(filename), open_mode)
449 return (stream, filename)
450 except (IOError, OSError) as err:
451 if err.errno in (errno.EACCES,):
454 # In case of error, try to remove win32 forbidden chars
455 alt_filename = sanitize_path(filename)
456 if alt_filename == filename:
459 # An exception here should be caught in the caller
460 stream = open(encodeFilename(alt_filename), open_mode)
461 return (stream, alt_filename)
464 def timeconvert(timestr):
465 """Convert RFC 2822 defined time string into system timestamp"""
467 timetuple = email.utils.parsedate_tz(timestr)
468 if timetuple is not None:
469 timestamp = email.utils.mktime_tz(timetuple)
473 def sanitize_filename(s, restricted=False, is_id=False):
474 """Sanitizes a string so it could be used as part of a filename.
475 If restricted is set, use a stricter subset of allowed characters.
476 Set is_id if this is not an arbitrary string, but an ID that should be kept
479 def replace_insane(char):
480 if restricted and char in ACCENT_CHARS:
481 return ACCENT_CHARS[char]
482 if char == '?' or ord(char) < 32 or ord(char) == 127:
485 return '' if restricted else '\''
487 return '_-' if restricted else ' -'
488 elif char in '\\/|*<>':
490 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
492 if restricted and ord(char) > 127:
497 s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
498 result = ''.join(map(replace_insane, s))
500 while '__' in result:
501 result = result.replace('__', '_')
502 result = result.strip('_')
503 # Common case of "Foreign band name - English song title"
504 if restricted and result.startswith('-_'):
506 if result.startswith('-'):
507 result = '_' + result[len('-'):]
508 result = result.lstrip('.')
514 def sanitize_path(s):
515 """Sanitizes and normalizes path on Windows"""
516 if sys.platform != 'win32':
518 drive_or_unc, _ = os.path.splitdrive(s)
519 if sys.version_info < (2, 7) and not drive_or_unc:
520 drive_or_unc, _ = os.path.splitunc(s)
521 norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
525 path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
526 for path_part in norm_path]
528 sanitized_path.insert(0, drive_or_unc + os.path.sep)
529 return os.path.join(*sanitized_path)
532 # Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
533 # unwanted failures due to missing protocol
def sanitize_url(url):
    """Prepend 'http:' to protocol-relative URLs ('//host/...').

    Mitigates failures on URLs that omit the scheme; all other URLs are
    returned unchanged.
    """
    if url.startswith('//'):
        return 'http:%s' % url
    return url
def sanitized_Request(url, *args, **kwargs):
    """Build a urllib Request after normalizing protocol-relative URLs."""
    safe_url = sanitize_url(url)
    return compat_urllib_request.Request(safe_url, *args, **kwargs)
542 def orderedSet(iterable):
543 """ Remove all duplicates from the input iterable """
551 def _htmlentity_transform(entity_with_semicolon):
552 """Transforms an HTML entity to a character."""
553 entity = entity_with_semicolon[:-1]
555 # Known non-numeric HTML entity
556 if entity in compat_html_entities.name2codepoint:
557 return compat_chr(compat_html_entities.name2codepoint[entity])
559 # TODO: HTML5 allows entities without a semicolon. For example,
560 # 'Éric' should be decoded as 'Éric'.
561 if entity_with_semicolon in compat_html_entities_html5:
562 return compat_html_entities_html5[entity_with_semicolon]
564 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
566 numstr = mobj.group(1)
567 if numstr.startswith('x'):
569 numstr = '0%s' % numstr
572 # See https://github.com/rg3/youtube-dl/issues/7518
574 return compat_chr(int(numstr, base))
578 # Unknown entity in name, return its literal representation
579 return '&%s;' % entity
585 assert type(s) == compat_str
588 r'&([^;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
591 def get_subprocess_encoding():
592 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
593 # For subprocess calls, encode with locale encoding
594 # Refer to http://stackoverflow.com/a/9951851/35070
595 encoding = preferredencoding()
597 encoding = sys.getfilesystemencoding()
603 def encodeFilename(s, for_subprocess=False):
605 @param s The name of the file
608 assert type(s) == compat_str
610 # Python 3 has a Unicode API
611 if sys.version_info >= (3, 0):
614 # Pass '' directly to use Unicode APIs on Windows 2000 and up
615 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
616 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
617 if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
620 # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
621 if sys.platform.startswith('java'):
624 return s.encode(get_subprocess_encoding(), 'ignore')
627 def decodeFilename(b, for_subprocess=False):
629 if sys.version_info >= (3, 0):
632 if not isinstance(b, bytes):
635 return b.decode(get_subprocess_encoding(), 'ignore')
638 def encodeArgument(s):
639 if not isinstance(s, compat_str):
640 # Legacy code that uses byte strings
641 # Uncomment the following line after fixing all post processors
642 # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
643 s = s.decode('ascii')
644 return encodeFilename(s, True)
647 def decodeArgument(b):
648 return decodeFilename(b, True)
651 def decodeOption(optval):
654 if isinstance(optval, bytes):
655 optval = optval.decode(preferredencoding())
657 assert isinstance(optval, compat_str)
661 def formatSeconds(secs):
663 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
665 return '%d:%02d' % (secs // 60, secs % 60)
670 def make_HTTPS_handler(params, **kwargs):
671 opts_no_check_certificate = params.get('nocheckcertificate', False)
672 if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9
673 context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
674 if opts_no_check_certificate:
675 context.check_hostname = False
676 context.verify_mode = ssl.CERT_NONE
678 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
681 # (create_default_context present but HTTPSHandler has no context=)
684 if sys.version_info < (3, 2):
685 return YoutubeDLHTTPSHandler(params, **kwargs)
687 context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
688 context.verify_mode = (ssl.CERT_NONE
689 if opts_no_check_certificate
690 else ssl.CERT_REQUIRED)
691 context.set_default_verify_paths()
692 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
695 def bug_reports_message():
696 if ytdl_is_updateable():
697 update_cmd = 'type youtube-dl -U to update'
699 update_cmd = 'see https://yt-dl.org/update on how to update'
700 msg = '; please report this issue on https://yt-dl.org/bug .'
701 msg += ' Make sure you are using the latest version; %s.' % update_cmd
702 msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
# Root of the youtube-dl exception hierarchy; all custom errors below derive
# from it so callers can catch every youtube-dl failure with one clause.
class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
711 class ExtractorError(YoutubeDLError):
712 """Error during info extraction."""
714 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
715 """ tb, if given, is the original traceback (so that it can be printed out).
716 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
719 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
721 if video_id is not None:
722 msg = video_id + ': ' + msg
724 msg += ' (caused by %r)' % cause
726 msg += bug_reports_message()
727 super(ExtractorError, self).__init__(msg)
730 self.exc_info = sys.exc_info() # preserve original exception
732 self.video_id = video_id
734 def format_traceback(self):
735 if self.traceback is None:
737 return ''.join(traceback.format_tb(self.traceback))
class UnsupportedError(ExtractorError):
    """Raised when no extractor is able to handle the given URL.

    Marked expected=True: this is a normal user-facing condition, not a bug.
    """

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
class RegexNotFoundError(ExtractorError):
    """Raised when a regular expression expected to match the page did not."""
752 class GeoRestrictedError(ExtractorError):
753 """Geographic restriction Error exception.
755 This exception may be thrown when a video is not available from your
756 geographic location due to geographic restrictions imposed by a website.
758 def __init__(self, msg, countries=None):
759 super(GeoRestrictedError, self).__init__(msg, expected=True)
761 self.countries = countries
764 class DownloadError(YoutubeDLError):
765 """Download Error exception.
767 This exception may be thrown by FileDownloader objects if they are not
768 configured to continue on errors. They will contain the appropriate
772 def __init__(self, msg, exc_info=None):
773 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
774 super(DownloadError, self).__init__(msg)
775 self.exc_info = exc_info
778 class SameFileError(YoutubeDLError):
779 """Same File exception.
781 This exception will be thrown by FileDownloader objects if they detect
782 multiple files would have to be downloaded to the same file on disk.
787 class PostProcessingError(YoutubeDLError):
788 """Post Processing exception.
790 This exception may be raised by PostProcessor's .run() method to
791 indicate an error in the postprocessing task.
794 def __init__(self, msg):
795 super(PostProcessingError, self).__init__(msg)
class MaxDownloadsReached(YoutubeDLError):
    """ --max-downloads limit has been reached. """
804 class UnavailableVideoError(YoutubeDLError):
805 """Unavailable Format exception.
807 This exception will be thrown when a video is requested
808 in a format that is not available for that video.
813 class ContentTooShortError(YoutubeDLError):
814 """Content Too Short exception.
816 This exception may be raised by FileDownloader objects when a file they
817 download is too small for what the server announced first, indicating
818 the connection was probably interrupted.
821 def __init__(self, downloaded, expected):
822 super(ContentTooShortError, self).__init__(
823 'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected)
826 self.downloaded = downloaded
827 self.expected = expected
830 class XAttrMetadataError(YoutubeDLError):
831 def __init__(self, code=None, msg='Unknown error'):
832 super(XAttrMetadataError, self).__init__(msg)
836 # Parsing code and msg
837 if (self.code in (errno.ENOSPC, errno.EDQUOT) or
838 'No space left' in self.msg or 'Disk quota excedded' in self.msg):
839 self.reason = 'NO_SPACE'
840 elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
841 self.reason = 'VALUE_TOO_LONG'
843 self.reason = 'NOT_SUPPORTED'
846 class XAttrUnavailableError(YoutubeDLError):
850 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
851 # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
852 # expected HTTP responses to meet HTTP/1.0 or later (see also
853 # https://github.com/rg3/youtube-dl/issues/6727)
854 if sys.version_info < (3, 0):
855 kwargs[b'strict'] = True
856 hc = http_class(*args, **kwargs)
857 source_address = ydl_handler._params.get('source_address')
858 if source_address is not None:
859 sa = (source_address, 0)
860 if hasattr(hc, 'source_address'): # Python 2.7+
861 hc.source_address = sa
863 def _hc_connect(self, *args, **kwargs):
864 sock = compat_socket_create_connection(
865 (self.host, self.port), self.timeout, sa)
867 self.sock = ssl.wrap_socket(
868 sock, self.key_file, self.cert_file,
869 ssl_version=ssl.PROTOCOL_TLSv1)
872 hc.connect = functools.partial(_hc_connect, hc)
def handle_youtubedl_headers(headers):
    """Process internal youtube-dl marker headers before a real request.

    When the 'Youtubedl-no-compression' marker is present, return a copy of
    *headers* with every Accept-Encoding header and the marker itself
    removed; otherwise return *headers* unchanged.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers
    filtered = dict((k, v) for k, v in headers.items()
                    if k.lower() != 'accept-encoding')
    del filtered['Youtubedl-no-compression']
    return filtered
887 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
888 """Handler for HTTP requests and responses.
890 This class, when installed with an OpenerDirector, automatically adds
891 the standard headers to every HTTP request and handles gzipped and
892 deflated responses from web servers. If compression is to be avoided in
893 a particular request, the original request in the program code only has
894 to include the HTTP header "Youtubedl-no-compression", which will be
895 removed before making the real request.
897 Part of this code was copied from:
899 http://techknack.net/python-urllib2-handlers/
901 Andrew Rowls, the author of that code, agreed to release it to the
905 def __init__(self, params, *args, **kwargs):
906 compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
907 self._params = params
909 def http_open(self, req):
910 conn_class = compat_http_client.HTTPConnection
912 socks_proxy = req.headers.get('Ytdl-socks-proxy')
914 conn_class = make_socks_conn_class(conn_class, socks_proxy)
915 del req.headers['Ytdl-socks-proxy']
917 return self.do_open(functools.partial(
918 _create_http_connection, self, conn_class, False),
924 return zlib.decompress(data, -zlib.MAX_WBITS)
926 return zlib.decompress(data)
929 def addinfourl_wrapper(stream, headers, url, code):
930 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
931 return compat_urllib_request.addinfourl(stream, headers, url, code)
932 ret = compat_urllib_request.addinfourl(stream, headers, url)
936 def http_request(self, req):
937 # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
938 # always respected by websites, some tend to give out URLs with non percent-encoded
939 # non-ASCII characters (see telemb.py, ard.py [#3412])
940 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
941 # To work around aforementioned issue we will replace request's original URL with
942 # percent-encoded one
943 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
944 # the code of this workaround has been moved here from YoutubeDL.urlopen()
945 url = req.get_full_url()
946 url_escaped = escape_url(url)
948 # Substitute URL if any change after escaping
949 if url != url_escaped:
950 req = update_Request(req, url=url_escaped)
952 for h, v in std_headers.items():
953 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
954 # The dict keys are capitalized because of this bug by urllib
955 if h.capitalize() not in req.headers:
958 req.headers = handle_youtubedl_headers(req.headers)
960 if sys.version_info < (2, 7) and '#' in req.get_full_url():
961 # Python 2.6 is brain-dead when it comes to fragments
962 req._Request__original = req._Request__original.partition('#')[0]
963 req._Request__r_type = req._Request__r_type.partition('#')[0]
967 def http_response(self, req, resp):
970 if resp.headers.get('Content-encoding', '') == 'gzip':
971 content = resp.read()
972 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
974 uncompressed = io.BytesIO(gz.read())
975 except IOError as original_ioerror:
976 # There may be junk add the end of the file
977 # See http://stackoverflow.com/q/4928560/35070 for details
978 for i in range(1, 1024):
980 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
981 uncompressed = io.BytesIO(gz.read())
986 raise original_ioerror
987 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
988 resp.msg = old_resp.msg
989 del resp.headers['Content-encoding']
991 if resp.headers.get('Content-encoding', '') == 'deflate':
992 gz = io.BytesIO(self.deflate(resp.read()))
993 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
994 resp.msg = old_resp.msg
995 del resp.headers['Content-encoding']
996 # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
997 # https://github.com/rg3/youtube-dl/issues/6457).
998 if 300 <= resp.code < 400:
999 location = resp.headers.get('Location')
1001 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
1002 if sys.version_info >= (3, 0):
1003 location = location.encode('iso-8859-1').decode('utf-8')
1005 location = location.decode('utf-8')
1006 location_escaped = escape_url(location)
1007 if location != location_escaped:
1008 del resp.headers['Location']
1009 if sys.version_info < (3, 0):
1010 location_escaped = location_escaped.encode('utf-8')
1011 resp.headers['Location'] = location_escaped
1014 https_request = http_request
1015 https_response = http_response
1018 def make_socks_conn_class(base_class, socks_proxy):
1019 assert issubclass(base_class, (
1020 compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))
1022 url_components = compat_urlparse.urlparse(socks_proxy)
1023 if url_components.scheme.lower() == 'socks5':
1024 socks_type = ProxyType.SOCKS5
1025 elif url_components.scheme.lower() in ('socks', 'socks4'):
1026 socks_type = ProxyType.SOCKS4
1027 elif url_components.scheme.lower() == 'socks4a':
1028 socks_type = ProxyType.SOCKS4A
1030 def unquote_if_non_empty(s):
1033 return compat_urllib_parse_unquote_plus(s)
1037 url_components.hostname, url_components.port or 1080,
1039 unquote_if_non_empty(url_components.username),
1040 unquote_if_non_empty(url_components.password),
1043 class SocksConnection(base_class):
1045 self.sock = sockssocket()
1046 self.sock.setproxy(*proxy_args)
1047 if type(self.timeout) in (int, float):
1048 self.sock.settimeout(self.timeout)
1049 self.sock.connect((self.host, self.port))
1051 if isinstance(self, compat_http_client.HTTPSConnection):
1052 if hasattr(self, '_context'): # Python > 2.6
1053 self.sock = self._context.wrap_socket(
1054 self.sock, server_hostname=self.host)
1056 self.sock = ssl.wrap_socket(self.sock)
1058 return SocksConnection
1061 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
1062 def __init__(self, params, https_conn_class=None, *args, **kwargs):
1063 compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
1064 self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
1065 self._params = params
1067 def https_open(self, req):
1069 conn_class = self._https_conn_class
1071 if hasattr(self, '_context'): # python > 2.6
1072 kwargs['context'] = self._context
1073 if hasattr(self, '_check_hostname'): # python 3.x
1074 kwargs['check_hostname'] = self._check_hostname
1076 socks_proxy = req.headers.get('Ytdl-socks-proxy')
1078 conn_class = make_socks_conn_class(conn_class, socks_proxy)
1079 del req.headers['Ytdl-socks-proxy']
1081 return self.do_open(functools.partial(
1082 _create_http_connection, self, conn_class, True),
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor with an optional cookiejar argument.

    Delegates all real work to the stdlib HTTPCookieProcessor; parent methods
    are called explicitly (not via super()) — presumably for Python 2
    old-style-class compatibility — TODO confirm.
    """

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # NOTE(review): the workaround below is currently disabled (commented
        # out); responses are handed to the parent untouched.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    # HTTPS traffic reuses the HTTP cookie logic unchanged.
    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
1110 def extract_timezone(date_str):
1112 r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
1115 timezone = datetime.timedelta()
1117 date_str = date_str[:-len(m.group('tz'))]
1118 if not m.group('sign'):
1119 timezone = datetime.timedelta()
1121 sign = 1 if m.group('sign') == '+' else -1
1122 timezone = datetime.timedelta(
1123 hours=sign * int(m.group('hours')),
1124 minutes=sign * int(m.group('minutes')))
1125 return timezone, date_str
1128 def parse_iso8601(date_str, delimiter='T', timezone=None):
1129 """ Return a UNIX timestamp from the given date """
1131 if date_str is None:
1134 date_str = re.sub(r'\.[0-9]+', '', date_str)
1136 if timezone is None:
1137 timezone, date_str = extract_timezone(date_str)
1140 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
1141 dt = datetime.datetime.strptime(date_str, date_format) - timezone
1142 return calendar.timegm(dt.timetuple())
def date_formats(day_first=True):
    """Return the strptime format list matching the day/month ordering."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1151 def unified_strdate(date_str, day_first=True):
1152 """Return a string with the date in the format YYYYMMDD"""
1154 if date_str is None:
1158 date_str = date_str.replace(',', ' ')
1159 # Remove AM/PM + timezone
1160 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1161 _, date_str = extract_timezone(date_str)
1163 for expression in date_formats(day_first):
1165 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
1168 if upload_date is None:
1169 timetuple = email.utils.parsedate_tz(date_str)
1172 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
1175 if upload_date is not None:
1176 return compat_str(upload_date)
1179 def unified_timestamp(date_str, day_first=True):
1180 if date_str is None:
1183 date_str = date_str.replace(',', ' ')
1185 pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
1186 timezone, date_str = extract_timezone(date_str)
1188 # Remove AM/PM + timezone
1189 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1191 for expression in date_formats(day_first):
1193 dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
1194 return calendar.timegm(dt.timetuple())
1197 timetuple = email.utils.parsedate_tz(date_str)
1199 return calendar.timegm(timetuple) + pm_delta * 3600
1202 def determine_ext(url, default_ext='unknown_video'):
1205 guess = url.partition('?')[0].rpartition('.')[2]
1206 if re.match(r'^[A-Za-z0-9]+$', guess):
1208 # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
1209 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
1210 return guess.rstrip('/')
def subtitles_filename(filename, sub_lang, sub_format):
    """Derive a subtitle file name: insert the language code before a new
    subtitle extension, replacing the media extension (if any)."""
    stem = filename.rsplit('.', 1)[0]
    return '%s.%s.%s' % (stem, sub_lang, sub_format)
1219 def date_from_str(date_str):
1221 Return a datetime object from a string in the format YYYYMMDD or
1222 (now|today)[+-][0-9](day|week|month|year)(s)?"""
1223 today = datetime.date.today()
1224 if date_str in ('now', 'today'):
1226 if date_str == 'yesterday':
1227 return today - datetime.timedelta(days=1)
1228 match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
1229 if match is not None:
1230 sign = match.group('sign')
1231 time = int(match.group('time'))
1234 unit = match.group('unit')
1235 # A bad approximation?
1239 elif unit == 'year':
1243 delta = datetime.timedelta(**{unit: time})
1244 return today + delta
1245 return datetime.datetime.strptime(date_str, '%Y%m%d').date()
1248 def hyphenate_date(date_str):
1250 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1251 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1252 if match is not None:
1253 return '-'.join(match.groups())
1258 class DateRange(object):
1259 """Represents a time interval between two dates"""
1261 def __init__(self, start=None, end=None):
1262 """start and end must be strings in the format accepted by date"""
1263 if start is not None:
1264 self.start = date_from_str(start)
1266 self.start = datetime.datetime.min.date()
1268 self.end = date_from_str(end)
1270 self.end = datetime.datetime.max.date()
1271 if self.start > self.end:
1272 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
1276 """Returns a range that only contains the given day"""
1277 return cls(day, day)
1279 def __contains__(self, date):
1280 """Check if the date is in the range"""
1281 if not isinstance(date, datetime.date):
1282 date = date_from_str(date)
1283 return self.start <= date <= self.end
1286 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
1289 def platform_name():
1290 """ Returns the platform name as a compat_str """
1291 res = platform.platform()
1292 if isinstance(res, bytes):
1293 res = res.decode(preferredencoding())
1295 assert isinstance(res, compat_str)
1299 def _windows_write_string(s, out):
1300 """ Returns True if the string was written using special methods,
1301 False if it has yet to be written out."""
1302 # Adapted from http://stackoverflow.com/a/3259271/35070
1305 import ctypes.wintypes
1313 fileno = out.fileno()
1314 except AttributeError:
1315 # If the output stream doesn't have a fileno, it's virtual
1317 except io.UnsupportedOperation:
1318 # Some strange Windows pseudo files?
1320 if fileno not in WIN_OUTPUT_IDS:
1323 GetStdHandle = ctypes.WINFUNCTYPE(
1324 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
1325 (b'GetStdHandle', ctypes.windll.kernel32))
1326 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
1328 WriteConsoleW = ctypes.WINFUNCTYPE(
1329 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
1330 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
1331 ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
1332 written = ctypes.wintypes.DWORD(0)
1334 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
1335 FILE_TYPE_CHAR = 0x0002
1336 FILE_TYPE_REMOTE = 0x8000
1337 GetConsoleMode = ctypes.WINFUNCTYPE(
1338 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
1339 ctypes.POINTER(ctypes.wintypes.DWORD))(
1340 (b'GetConsoleMode', ctypes.windll.kernel32))
1341 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
1343 def not_a_console(handle):
1344 if handle == INVALID_HANDLE_VALUE or handle is None:
1346 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
1347 GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
1349 if not_a_console(h):
1352 def next_nonbmp_pos(s):
1354 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
1355 except StopIteration:
1359 count = min(next_nonbmp_pos(s), 1024)
1361 ret = WriteConsoleW(
1362 h, s, count if count else 2, ctypes.byref(written), None)
1364 raise OSError('Failed to write string')
1365 if not count: # We just wrote a non-BMP character
1366 assert written.value == 2
1369 assert written.value > 0
1370 s = s[written.value:]
1374 def write_string(s, out=None, encoding=None):
1377 assert type(s) == compat_str
1379 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
1380 if _windows_write_string(s, out):
1383 if ('b' in getattr(out, 'mode', '') or
1384 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
1385 byt = s.encode(encoding or preferredencoding(), 'ignore')
1387 elif hasattr(out, 'buffer'):
1388 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1389 byt = s.encode(enc, 'ignore')
1390 out.buffer.write(byt)
1396 def bytes_to_intlist(bs):
1399 if isinstance(bs[0], int): # Python 3
1402 return [ord(c) for c in bs]
1405 def intlist_to_bytes(xs):
1408 return compat_struct_pack('%dB' % len(xs), *xs)
1411 # Cross-platform file locking
1412 if sys.platform == 'win32':
1413 import ctypes.wintypes
1416 class OVERLAPPED(ctypes.Structure):
1418 ('Internal', ctypes.wintypes.LPVOID),
1419 ('InternalHigh', ctypes.wintypes.LPVOID),
1420 ('Offset', ctypes.wintypes.DWORD),
1421 ('OffsetHigh', ctypes.wintypes.DWORD),
1422 ('hEvent', ctypes.wintypes.HANDLE),
1425 kernel32 = ctypes.windll.kernel32
1426 LockFileEx = kernel32.LockFileEx
1427 LockFileEx.argtypes = [
1428 ctypes.wintypes.HANDLE, # hFile
1429 ctypes.wintypes.DWORD, # dwFlags
1430 ctypes.wintypes.DWORD, # dwReserved
1431 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1432 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1433 ctypes.POINTER(OVERLAPPED) # Overlapped
1435 LockFileEx.restype = ctypes.wintypes.BOOL
1436 UnlockFileEx = kernel32.UnlockFileEx
1437 UnlockFileEx.argtypes = [
1438 ctypes.wintypes.HANDLE, # hFile
1439 ctypes.wintypes.DWORD, # dwReserved
1440 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1441 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1442 ctypes.POINTER(OVERLAPPED) # Overlapped
1444 UnlockFileEx.restype = ctypes.wintypes.BOOL
1445 whole_low = 0xffffffff
1446 whole_high = 0x7fffffff
1448 def _lock_file(f, exclusive):
1449 overlapped = OVERLAPPED()
1450 overlapped.Offset = 0
1451 overlapped.OffsetHigh = 0
1452 overlapped.hEvent = 0
1453 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1454 handle = msvcrt.get_osfhandle(f.fileno())
1455 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
1456 whole_low, whole_high, f._lock_file_overlapped_p):
1457 raise OSError('Locking file failed: %r' % ctypes.FormatError())
1459 def _unlock_file(f):
1460 assert f._lock_file_overlapped_p
1461 handle = msvcrt.get_osfhandle(f.fileno())
1462 if not UnlockFileEx(handle, 0,
1463 whole_low, whole_high, f._lock_file_overlapped_p):
1464 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1467 # Some platforms, such as Jython, is missing fcntl
1471 def _lock_file(f, exclusive):
1472 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1474 def _unlock_file(f):
1475 fcntl.flock(f, fcntl.LOCK_UN)
1477 UNSUPPORTED_MSG = 'file locking is not supported on this platform'
1479 def _lock_file(f, exclusive):
1480 raise IOError(UNSUPPORTED_MSG)
1482 def _unlock_file(f):
1483 raise IOError(UNSUPPORTED_MSG)
1486 class locked_file(object):
1487 def __init__(self, filename, mode, encoding=None):
1488 assert mode in ['r', 'a', 'w']
1489 self.f = io.open(filename, mode, encoding=encoding)
1492 def __enter__(self):
1493 exclusive = self.mode != 'r'
1495 _lock_file(self.f, exclusive)
1501 def __exit__(self, etype, value, traceback):
1503 _unlock_file(self.f)
1510 def write(self, *args):
1511 return self.f.write(*args)
1513 def read(self, *args):
1514 return self.f.read(*args)
def get_filesystem_encoding():
    """Return the filesystem encoding, falling back to utf-8 when it is unknown."""
    enc = sys.getfilesystemencoding()
    if enc is None:
        enc = 'utf-8'
    return enc
1522 def shell_quote(args):
1524 encoding = get_filesystem_encoding()
1526 if isinstance(a, bytes):
1527 # We may get a filename encoded with 'encodeFilename'
1528 a = a.decode(encoding)
1529 quoted_args.append(pipes.quote(a))
1530 return ' '.join(quoted_args)
1533 def smuggle_url(url, data):
1534 """ Pass additional data in a URL for internal use. """
1536 url, idata = unsmuggle_url(url, {})
1538 sdata = compat_urllib_parse_urlencode(
1539 {'__youtubedl_smuggle': json.dumps(data)})
1540 return url + '#' + sdata
1543 def unsmuggle_url(smug_url, default=None):
1544 if '#__youtubedl_smuggle' not in smug_url:
1545 return smug_url, default
1546 url, _, sdata = smug_url.rpartition('#')
1547 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
1548 data = json.loads(jsond)
1552 def format_bytes(bytes):
1555 if type(bytes) is str:
1556 bytes = float(bytes)
1560 exponent = int(math.log(bytes, 1024.0))
1561 suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
1562 converted = float(bytes) / float(1024 ** exponent)
1563 return '%.2f%s' % (converted, suffix)
1566 def lookup_unit_table(unit_table, s):
1567 units_re = '|'.join(re.escape(u) for u in unit_table)
1569 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
1572 num_str = m.group('num').replace(',', '.')
1573 mult = unit_table[m.group('unit')]
1574 return int(float(num_str) * mult)
1577 def parse_filesize(s):
1581 # The lower-case forms are of course incorrect and unofficial,
1582 # but we support those too
1599 'megabytes': 1000 ** 2,
1600 'mebibytes': 1024 ** 2,
1606 'gigabytes': 1000 ** 3,
1607 'gibibytes': 1024 ** 3,
1613 'terabytes': 1000 ** 4,
1614 'tebibytes': 1024 ** 4,
1620 'petabytes': 1000 ** 5,
1621 'pebibytes': 1024 ** 5,
1627 'exabytes': 1000 ** 6,
1628 'exbibytes': 1024 ** 6,
1634 'zettabytes': 1000 ** 7,
1635 'zebibytes': 1024 ** 7,
1641 'yottabytes': 1000 ** 8,
1642 'yobibytes': 1024 ** 8,
1645 return lookup_unit_table(_UNIT_TABLE, s)
1654 if re.match(r'^[\d,.]+$', s):
1655 return str_to_int(s)
1666 return lookup_unit_table(_UNIT_TABLE, s)
1669 def month_by_name(name, lang='en'):
1670 """ Return the number of a month by (locale-independently) English name """
1672 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
1675 return month_names.index(name) + 1
1680 def month_by_abbreviation(abbrev):
1681 """ Return the number of a month by (locale-independently) English
1685 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
1690 def fix_xml_ampersands(xml_str):
1691 """Replace all the '&' by '&' in XML"""
1693 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1698 def setproctitle(title):
1699 assert isinstance(title, compat_str)
1701 # ctypes in Jython is not complete
1702 # http://bugs.jython.org/issue2148
1703 if sys.platform.startswith('java'):
1707 libc = ctypes.cdll.LoadLibrary('libc.so.6')
1711 # LoadLibrary in Windows Python 2.7.13 only expects
1712 # a bytestring, but since unicode_literals turns
1713 # every string into a unicode string, it fails.
1715 title_bytes = title.encode('utf-8')
1716 buf = ctypes.create_string_buffer(len(title_bytes))
1717 buf.value = title_bytes
1719 libc.prctl(15, buf, 0, 0, 0)
1720 except AttributeError:
1721 return # Strange libc, just skip this
def remove_start(s, start):
    """Strip the prefix `start` from s when present; a None s passes through unchanged."""
    if s is not None and s.startswith(start):
        return s[len(start):]
    return s
def remove_end(s, end):
    """Strip the suffix `end` from s when present; a None s passes through unchanged.

    An empty `end` is treated as a no-op. The previous slice-based one-liner
    returned '' in that case, because s[:-0] is the empty slice — an edge-case
    bug and an inconsistency with remove_start.
    """
    if s is None or not end or not s.endswith(end):
        return s
    return s[:-len(end)]
1732 def remove_quotes(s):
1733 if s is None or len(s) < 2:
1735 for quote in ('"', "'", ):
1736 if s[0] == quote and s[-1] == quote:
def url_basename(url):
    """Return the last component of the URL's path (query and fragment excluded)."""
    path = compat_urlparse.urlparse(url).path
    # rpartition('/')[2] yields everything after the last slash,
    # or the whole (stripped) path when no slash remains.
    return path.strip('/').rpartition('/')[2]
1747 return re.match(r'https?://[^?#&]+/', url).group()
1750 def urljoin(base, path):
1751 if not isinstance(path, compat_str) or not path:
1753 if re.match(r'^(?:https?:)?//', path):
1755 if not isinstance(base, compat_str) or not re.match(r'^(?:https?:)?//', base):
1757 return compat_urlparse.urljoin(base, path)
1760 class HEADRequest(compat_urllib_request.Request):
1761 def get_method(self):
1765 class PUTRequest(compat_urllib_request.Request):
1766 def get_method(self):
1770 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
1773 v = getattr(v, get_attr, None)
1779 return int(v) * invscale // scale
def str_or_none(v, default=None):
    """Convert v to compat_str, returning `default` when v is None."""
    if v is None:
        return default
    return compat_str(v)
1788 def str_to_int(int_str):
1789 """ A more relaxed version of int_or_none """
1792 int_str = re.sub(r'[,\.\+]', '', int_str)
1796 def float_or_none(v, scale=1, invscale=1, default=None):
1800 return float(v) * invscale / scale
def strip_or_none(v, default=None):
    """Return v.strip() for a string, or `default` (None by default) when v is None.

    The `default` parameter is a backward-compatible generalization mirroring
    str_or_none, so callers can choose a fallback other than None.
    """
    return default if v is None else v.strip()
1809 def parse_duration(s):
1810 if not isinstance(s, compat_basestring):
1815 days, hours, mins, secs, ms = [None] * 5
1816 m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', s)
1818 days, hours, mins, secs, ms = m.groups()
1823 (?P<days>[0-9]+)\s*d(?:ays?)?\s*
1826 (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
1829 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
1832 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
1835 days, hours, mins, secs, ms = m.groups()
1837 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
1839 hours, mins = m.groups()
1845 duration += float(secs)
1847 duration += float(mins) * 60
1849 duration += float(hours) * 60 * 60
1851 duration += float(days) * 24 * 60 * 60
1853 duration += float(ms)
1857 def prepend_extension(filename, ext, expected_real_ext=None):
1858 name, real_ext = os.path.splitext(filename)
1860 '{0}.{1}{2}'.format(name, ext, real_ext)
1861 if not expected_real_ext or real_ext[1:] == expected_real_ext
1862 else '{0}.{1}'.format(filename, ext))
1865 def replace_extension(filename, ext, expected_real_ext=None):
1866 name, real_ext = os.path.splitext(filename)
1867 return '{0}.{1}'.format(
1868 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
1872 def check_executable(exe, args=[]):
1873 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1874 args can be a list of arguments for a short output (like -version) """
1876 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1882 def get_exe_version(exe, args=['--version'],
1883 version_re=None, unrecognized='present'):
1884 """ Returns the version of the specified executable,
1885 or False if the executable is not present """
1887 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
1888 # SIGTTOU if youtube-dl is run in the background.
1889 # See https://github.com/rg3/youtube-dl/issues/955#issuecomment-209789656
1890 out, _ = subprocess.Popen(
1891 [encodeArgument(exe)] + args,
1892 stdin=subprocess.PIPE,
1893 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
1896 if isinstance(out, bytes): # Python 2.x
1897 out = out.decode('ascii', 'ignore')
1898 return detect_exe_version(out, version_re, unrecognized)
1901 def detect_exe_version(output, version_re=None, unrecognized='present'):
1902 assert isinstance(output, compat_str)
1903 if version_re is None:
1904 version_re = r'version\s+([-0-9._a-zA-Z]+)'
1905 m = re.search(version_re, output)
1912 class PagedList(object):
1914 # This is only useful for tests
1915 return len(self.getslice())
1918 class OnDemandPagedList(PagedList):
1919 def __init__(self, pagefunc, pagesize, use_cache=False):
1920 self._pagefunc = pagefunc
1921 self._pagesize = pagesize
1922 self._use_cache = use_cache
1926 def getslice(self, start=0, end=None):
1928 for pagenum in itertools.count(start // self._pagesize):
1929 firstid = pagenum * self._pagesize
1930 nextfirstid = pagenum * self._pagesize + self._pagesize
1931 if start >= nextfirstid:
1936 page_results = self._cache.get(pagenum)
1937 if page_results is None:
1938 page_results = list(self._pagefunc(pagenum))
1940 self._cache[pagenum] = page_results
1943 start % self._pagesize
1944 if firstid <= start < nextfirstid
1948 ((end - 1) % self._pagesize) + 1
1949 if (end is not None and firstid <= end <= nextfirstid)
1952 if startv != 0 or endv is not None:
1953 page_results = page_results[startv:endv]
1954 res.extend(page_results)
1956 # A little optimization - if current page is not "full", ie. does
1957 # not contain page_size videos then we can assume that this page
1958 # is the last one - there are no more ids on further pages -
1959 # i.e. no need to query again.
1960 if len(page_results) + startv < self._pagesize:
1963 # If we got the whole page, but the next page is not interesting,
1964 # break out early as well
1965 if end == nextfirstid:
1970 class InAdvancePagedList(PagedList):
1971 def __init__(self, pagefunc, pagecount, pagesize):
1972 self._pagefunc = pagefunc
1973 self._pagecount = pagecount
1974 self._pagesize = pagesize
1976 def getslice(self, start=0, end=None):
1978 start_page = start // self._pagesize
1980 self._pagecount if end is None else (end // self._pagesize + 1))
1981 skip_elems = start - start_page * self._pagesize
1982 only_more = None if end is None else end - start
1983 for pagenum in range(start_page, end_page):
1984 page = list(self._pagefunc(pagenum))
1986 page = page[skip_elems:]
1988 if only_more is not None:
1989 if len(page) < only_more:
1990 only_more -= len(page)
1992 page = page[:only_more]
1999 def uppercase_escape(s):
2000 unicode_escape = codecs.getdecoder('unicode_escape')
2002 r'\\U[0-9a-fA-F]{8}',
2003 lambda m: unicode_escape(m.group(0))[0],
2007 def lowercase_escape(s):
2008 unicode_escape = codecs.getdecoder('unicode_escape')
2010 r'\\u[0-9a-fA-F]{4}',
2011 lambda m: unicode_escape(m.group(0))[0],
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986."""
    # Python 2's quote() needs a byte string, so encode unicode input first.
    need_bytes = sys.version_info < (3, 0) and isinstance(s, compat_str)
    if need_bytes:
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
2022 def escape_url(url):
2023 """Escape URL as suggested by RFC 3986"""
2024 url_parsed = compat_urllib_parse_urlparse(url)
2025 return url_parsed._replace(
2026 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
2027 path=escape_rfc3986(url_parsed.path),
2028 params=escape_rfc3986(url_parsed.params),
2029 query=escape_rfc3986(url_parsed.query),
2030 fragment=escape_rfc3986(url_parsed.fragment)
2034 def read_batch_urls(batch_fd):
2036 if not isinstance(url, compat_str):
2037 url = url.decode('utf-8', 'replace')
2038 BOM_UTF8 = '\xef\xbb\xbf'
2039 if url.startswith(BOM_UTF8):
2040 url = url[len(BOM_UTF8):]
2042 if url.startswith(('#', ';', ']')):
2046 with contextlib.closing(batch_fd) as fd:
2047 return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes suitable for a request body."""
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
2054 def update_url_query(url, query):
2057 parsed_url = compat_urlparse.urlparse(url)
2058 qs = compat_parse_qs(parsed_url.query)
2060 return compat_urlparse.urlunparse(parsed_url._replace(
2061 query=compat_urllib_parse_urlencode(qs, True)))
2064 def update_Request(req, url=None, data=None, headers={}, query={}):
2065 req_headers = req.headers.copy()
2066 req_headers.update(headers)
2067 req_data = data or req.data
2068 req_url = update_url_query(url or req.get_full_url(), query)
2069 req_get_method = req.get_method()
2070 if req_get_method == 'HEAD':
2071 req_type = HEADRequest
2072 elif req_get_method == 'PUT':
2073 req_type = PUTRequest
2075 req_type = compat_urllib_request.Request
2077 req_url, data=req_data, headers=req_headers,
2078 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
2079 if hasattr(req, 'timeout'):
2080 new_req.timeout = req.timeout
2084 def dict_get(d, key_or_keys, default=None, skip_false_values=True):
2085 if isinstance(key_or_keys, (list, tuple)):
2086 for key in key_or_keys:
2087 if key not in d or d[key] is None or skip_false_values and not d[key]:
2091 return d.get(key_or_keys, default)
2094 def try_get(src, getter, expected_type=None):
2097 except (AttributeError, KeyError, TypeError, IndexError):
2100 if expected_type is None or isinstance(v, expected_type):
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Coerce `string` to compat_str, decoding byte strings with the given encoding."""
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
2117 TV_PARENTAL_GUIDELINES = {
2127 def parse_age_limit(s):
2129 return s if 0 <= s <= 21 else None
2130 if not isinstance(s, compat_basestring):
2132 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
2134 return int(m.group('age'))
2136 return US_RATINGS[s]
2137 return TV_PARENTAL_GUIDELINES.get(s)
2140 def strip_jsonp(code):
2142 r'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
2145 def js_to_json(code):
2146 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*'
2147 SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
2149 (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
2150 (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
2155 if v in ('true', 'false', 'null'):
2157 elif v.startswith('/*') or v.startswith('//') or v == ',':
2160 if v[0] in ("'", '"'):
2161 v = re.sub(r'(?s)\\.|"', lambda m: {
2166 }.get(m.group(0), m.group(0)), v[1:-1])
2168 for regex, base in INTEGER_TABLE:
2169 im = re.match(regex, v)
2171 i = int(im.group(1), base)
2172 return '"%d":' % i if v.endswith(':') else '%d' % i
2176 return re.sub(r'''(?sx)
2177 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
2178 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
2179 {comment}|,(?={skip}[\]}}])|
2180 [a-zA-Z_][.a-zA-Z_0-9]*|
2181 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
2183 '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
2186 def qualities(quality_ids):
2187 """ Get a numeric quality value out of a list of possible values """
2190 return quality_ids.index(qid)
2196 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
2199 def limit_length(s, length):
2200 """ Add ellipses to overly long strings """
2205 return s[:length - len(ELLIPSES)] + ELLIPSES
def version_tuple(v):
    """Parse a version string with '.' and/or '-' separators into a tuple of ints."""
    return tuple(map(int, re.split(r'[-.]', v)))
2213 def is_outdated_version(version, limit, assume_new=True):
2215 return not assume_new
2217 return version_tuple(version) < version_tuple(limit)
2219 return not assume_new
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # Updatable only when running from a zip bundle or a frozen executable.
    loader = globals().get('__loader__')
    return isinstance(loader, zipimporter) or hasattr(sys, 'frozen')
def args_to_str(args):
    """Get a short, shell-quoted string representation of a subprocess command."""
    quoted = [compat_shlex_quote(arg) for arg in args]
    return ' '.join(quoted)
2234 def error_to_compat_str(err):
2236 # On python 2 error byte string must be decoded with proper
2237 # encoding rather than ascii
2238 if sys.version_info[0] < 3:
2239 err_str = err_str.decode(preferredencoding())
2243 def mimetype2ext(mt):
2249 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
2250 # it's the most popular one
2251 'audio/mpeg': 'mp3',
2256 _, _, res = mt.rpartition('/')
2257 res = res.split(';')[0].strip().lower()
2261 'smptett+xml': 'tt',
2267 'x-mp4-fragmented': 'mp4',
2270 'x-mpegurl': 'm3u8',
2271 'vnd.apple.mpegurl': 'm3u8',
2276 'vnd.ms-sstr+xml': 'ism',
2281 def parse_codecs(codecs_str):
2282 # http://tools.ietf.org/html/rfc6381
2285 splited_codecs = list(filter(None, map(
2286 lambda str: str.strip(), codecs_str.strip().strip(',').split(','))))
2287 vcodec, acodec = None, None
2288 for full_codec in splited_codecs:
2289 codec = full_codec.split('.')[0]
2290 if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v'):
2293 elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3'):
2297 write_string('WARNING: Unknown codec %s' % full_codec, sys.stderr)
2298 if not vcodec and not acodec:
2299 if len(splited_codecs) == 2:
2304 elif len(splited_codecs) == 1:
2311 'vcodec': vcodec or 'none',
2312 'acodec': acodec or 'none',
2317 def urlhandle_detect_ext(url_handle):
2318 getheader = url_handle.headers.get
2320 cd = getheader('Content-Disposition')
2322 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
2324 e = determine_ext(m.group('filename'), default_ext=None)
2328 return mimetype2ext(getheader('Content-Type'))
def encode_data_uri(data, mime_type):
    """Encode raw bytes as a base64 `data:` URI with the given MIME type."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
2335 def age_restricted(content_limit, age_limit):
2336 """ Returns True iff the content should be blocked """
2338 if age_limit is None: # No limit set
2340 if content_limit is None:
2341 return False # Content available for everyone
2342 return age_limit < content_limit
2345 def is_html(first_bytes):
2346 """ Detect whether a file contains HTML by examining its first bytes. """
2349 (b'\xef\xbb\xbf', 'utf-8'),
2350 (b'\x00\x00\xfe\xff', 'utf-32-be'),
2351 (b'\xff\xfe\x00\x00', 'utf-32-le'),
2352 (b'\xff\xfe', 'utf-16-le'),
2353 (b'\xfe\xff', 'utf-16-be'),
2355 for bom, enc in BOMS:
2356 if first_bytes.startswith(bom):
2357 s = first_bytes[len(bom):].decode(enc, 'replace')
2360 s = first_bytes.decode('utf-8', 'replace')
2362 return re.match(r'^\s*<', s)
2365 def determine_protocol(info_dict):
2366 protocol = info_dict.get('protocol')
2367 if protocol is not None:
2370 url = info_dict['url']
2371 if url.startswith('rtmp'):
2373 elif url.startswith('mms'):
2375 elif url.startswith('rtsp'):
2378 ext = determine_ext(url)
2384 return compat_urllib_parse_urlparse(url).scheme
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    rows = [header_row] + data
    # Width of each column is the widest cell (as text) in that column.
    widths = []
    for column in zip(*rows):
        widths.append(max(len(compat_str(cell)) for cell in column))
    # Left-pad every column but the last to its width + 1.
    fmt = ' '.join('%-' + compat_str(w + 1) + 's' for w in widths[:-1]) + '%s'
    return '\n'.join(fmt % tuple(row) for row in rows)
2395 def _match_one(filter_part, dct):
2396 COMPARISON_OPERATORS = {
2404 operator_rex = re.compile(r'''(?x)\s*
2406 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
2408 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
2409 (?P<quote>["\'])(?P<quotedstrval>(?:\\.|(?!(?P=quote)|\\).)+?)(?P=quote)|
2410 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
2413 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
2414 m = operator_rex.search(filter_part)
2416 op = COMPARISON_OPERATORS[m.group('op')]
2417 actual_value = dct.get(m.group('key'))
2418 if (m.group('quotedstrval') is not None or
2419 m.group('strval') is not None or
2420 # If the original field is a string and matching comparisonvalue is
2421 # a number we should respect the origin of the original field
2422 # and process comparison value as a string (see
2423 # https://github.com/rg3/youtube-dl/issues/11082).
2424 actual_value is not None and m.group('intval') is not None and
2425 isinstance(actual_value, compat_str)):
2426 if m.group('op') not in ('=', '!='):
2428 'Operator %s does not support string values!' % m.group('op'))
2429 comparison_value = m.group('quotedstrval') or m.group('strval') or m.group('intval')
2430 quote = m.group('quote')
2431 if quote is not None:
2432 comparison_value = comparison_value.replace(r'\%s' % quote, quote)
2435 comparison_value = int(m.group('intval'))
2437 comparison_value = parse_filesize(m.group('intval'))
2438 if comparison_value is None:
2439 comparison_value = parse_filesize(m.group('intval') + 'B')
2440 if comparison_value is None:
2442 'Invalid integer value %r in filter part %r' % (
2443 m.group('intval'), filter_part))
2444 if actual_value is None:
2445 return m.group('none_inclusive')
2446 return op(actual_value, comparison_value)
2449 '': lambda v: v is not None,
2450 '!': lambda v: v is None,
2452 operator_rex = re.compile(r'''(?x)\s*
2453 (?P<op>%s)\s*(?P<key>[a-z_]+)
2455 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
2456 m = operator_rex.search(filter_part)
2458 op = UNARY_OPERATORS[m.group('op')]
2459 actual_value = dct.get(m.group('key'))
2460 return op(actual_value)
2462 raise ValueError('Invalid filter part %r' % filter_part)
2465 def match_str(filter_str, dct):
2466 """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
2469 _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
2472 def match_filter_func(filter_str):
2473 def _match_func(info_dict):
2474 if match_str(filter_str, info_dict):
2477 video_title = info_dict.get('title', info_dict.get('id', 'video'))
2478 return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
2482 def parse_dfxp_time_expr(time_expr):
2486 mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
2488 return float(mobj.group('time_offset'))
2490 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
2492 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
def srt_subtitles_timecode(seconds):
    """Format a second offset as an SRT timecode (HH:MM:SS,mmm)."""
    hrs = seconds / 3600
    mins = (seconds % 3600) / 60
    secs = seconds % 60
    msecs = (seconds % 1) * 1000
    # %d truncates the float components, matching integer-division semantics.
    return '%02d:%02d:%02d,%03d' % (hrs, mins, secs, msecs)
2499 def dfxp2srt(dfxp_data):
2500 _x = functools.partial(xpath_with_ns, ns_map={
2501 'ttml': 'http://www.w3.org/ns/ttml',
2502 'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
2503 'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',
2506 class TTMLPElementParser(object):
2509 def start(self, tag, attrib):
2510 if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
2516 def data(self, data):
2520 return self.out.strip()
2522 def parse_node(node):
2523 target = TTMLPElementParser()
2524 parser = xml.etree.ElementTree.XMLParser(target=target)
2525 parser.feed(xml.etree.ElementTree.tostring(node))
2526 return parser.close()
2528 dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
2530 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall(_x('.//ttaf1_0604:p')) or dfxp.findall('.//p')
2533 raise ValueError('Invalid dfxp/TTML subtitle')
2535 for para, index in zip(paras, itertools.count(1)):
2536 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
2537 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
2538 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
2539 if begin_time is None:
2544 end_time = begin_time + dur
2545 out.append('%d\n%s --> %s\n%s\n\n' % (
2547 srt_subtitles_timecode(begin_time),
2548 srt_subtitles_timecode(end_time),
2554 def cli_option(params, command_option, param):
2555 param = params.get(param)
2557 param = compat_str(param)
2558 return [command_option, param] if param is not None else []
2561 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
2562 param = params.get(param)
2563 assert isinstance(param, bool)
2565 return [command_option + separator + (true_value if param else false_value)]
2566 return [command_option, true_value if param else false_value]
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return [command_option] when params[param] equals expected_value, else []."""
    value = params.get(param)
    if value == expected_value:
        return [command_option]
    return []
2574 def cli_configuration_args(params, param, default=[]):
2575 ex_args = params.get(param)
2578 assert isinstance(ex_args, list)
2582 class ISO639Utils(object):
2583 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
2772 def short2long(cls, code):
2773 """Convert language code from ISO 639-1 to ISO 639-2/T"""
2774 return cls._lang_map.get(code[:2])
2777 def long2short(cls, code):
2778 """Convert language code from ISO 639-2/T to ISO 639-1"""
2779 for short_name, long_name in cls._lang_map.items():
2780 if long_name == code:
2784 class ISO3166Utils(object):
2785 # From http://data.okfn.org/data/core/country-list
2787 'AF': 'Afghanistan',
2788 'AX': 'Åland Islands',
2791 'AS': 'American Samoa',
2796 'AG': 'Antigua and Barbuda',
2813 'BO': 'Bolivia, Plurinational State of',
2814 'BQ': 'Bonaire, Sint Eustatius and Saba',
2815 'BA': 'Bosnia and Herzegovina',
2817 'BV': 'Bouvet Island',
2819 'IO': 'British Indian Ocean Territory',
2820 'BN': 'Brunei Darussalam',
2822 'BF': 'Burkina Faso',
2828 'KY': 'Cayman Islands',
2829 'CF': 'Central African Republic',
2833 'CX': 'Christmas Island',
2834 'CC': 'Cocos (Keeling) Islands',
2838 'CD': 'Congo, the Democratic Republic of the',
2839 'CK': 'Cook Islands',
2841 'CI': 'Côte d\'Ivoire',
2846 'CZ': 'Czech Republic',
2850 'DO': 'Dominican Republic',
2853 'SV': 'El Salvador',
2854 'GQ': 'Equatorial Guinea',
2858 'FK': 'Falkland Islands (Malvinas)',
2859 'FO': 'Faroe Islands',
2863 'GF': 'French Guiana',
2864 'PF': 'French Polynesia',
2865 'TF': 'French Southern Territories',
2880 'GW': 'Guinea-Bissau',
2883 'HM': 'Heard Island and McDonald Islands',
2884 'VA': 'Holy See (Vatican City State)',
2891 'IR': 'Iran, Islamic Republic of',
2894 'IM': 'Isle of Man',
2904 'KP': 'Korea, Democratic People\'s Republic of',
2905 'KR': 'Korea, Republic of',
2908 'LA': 'Lao People\'s Democratic Republic',
2914 'LI': 'Liechtenstein',
2918 'MK': 'Macedonia, the Former Yugoslav Republic of',
2925 'MH': 'Marshall Islands',
2931 'FM': 'Micronesia, Federated States of',
2932 'MD': 'Moldova, Republic of',
2943 'NL': 'Netherlands',
2944 'NC': 'New Caledonia',
2945 'NZ': 'New Zealand',
2950 'NF': 'Norfolk Island',
2951 'MP': 'Northern Mariana Islands',
2956 'PS': 'Palestine, State of',
2958 'PG': 'Papua New Guinea',
2961 'PH': 'Philippines',
2965 'PR': 'Puerto Rico',
2969 'RU': 'Russian Federation',
2971 'BL': 'Saint Barthélemy',
2972 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
2973 'KN': 'Saint Kitts and Nevis',
2974 'LC': 'Saint Lucia',
2975 'MF': 'Saint Martin (French part)',
2976 'PM': 'Saint Pierre and Miquelon',
2977 'VC': 'Saint Vincent and the Grenadines',
2980 'ST': 'Sao Tome and Principe',
2981 'SA': 'Saudi Arabia',
2985 'SL': 'Sierra Leone',
2987 'SX': 'Sint Maarten (Dutch part)',
2990 'SB': 'Solomon Islands',
2992 'ZA': 'South Africa',
2993 'GS': 'South Georgia and the South Sandwich Islands',
2994 'SS': 'South Sudan',
2999 'SJ': 'Svalbard and Jan Mayen',
3002 'CH': 'Switzerland',
3003 'SY': 'Syrian Arab Republic',
3004 'TW': 'Taiwan, Province of China',
3006 'TZ': 'Tanzania, United Republic of',
3008 'TL': 'Timor-Leste',
3012 'TT': 'Trinidad and Tobago',
3015 'TM': 'Turkmenistan',
3016 'TC': 'Turks and Caicos Islands',
3020 'AE': 'United Arab Emirates',
3021 'GB': 'United Kingdom',
3022 'US': 'United States',
3023 'UM': 'United States Minor Outlying Islands',
3027 'VE': 'Venezuela, Bolivarian Republic of',
3029 'VG': 'Virgin Islands, British',
3030 'VI': 'Virgin Islands, U.S.',
3031 'WF': 'Wallis and Futuna',
3032 'EH': 'Western Sahara',
def short2full(cls, code):
    """Convert an ISO 3166-2 country code to the corresponding full name"""
    # Lookup is case-insensitive: keys in _country_map are upper-case,
    # so normalize first; unknown codes yield None.
    normalized = code.upper()
    return cls._country_map.get(normalized)
3044 class GeoUtils(object):
3045 # Major IPv4 address blocks per country
3047 'AD': '85.94.160.0/19',
3048 'AE': '94.200.0.0/13',
3049 'AF': '149.54.0.0/17',
3050 'AG': '209.59.64.0/18',
3051 'AI': '204.14.248.0/21',
3052 'AL': '46.99.0.0/16',
3053 'AM': '46.70.0.0/15',
3054 'AO': '105.168.0.0/13',
3055 'AP': '159.117.192.0/21',
3056 'AR': '181.0.0.0/12',
3057 'AS': '202.70.112.0/20',
3058 'AT': '84.112.0.0/13',
3059 'AU': '1.128.0.0/11',
3060 'AW': '181.41.0.0/18',
3061 'AZ': '5.191.0.0/16',
3062 'BA': '31.176.128.0/17',
3063 'BB': '65.48.128.0/17',
3064 'BD': '114.130.0.0/16',
3066 'BF': '129.45.128.0/17',
3067 'BG': '95.42.0.0/15',
3068 'BH': '37.131.0.0/17',
3069 'BI': '154.117.192.0/18',
3070 'BJ': '137.255.0.0/16',
3071 'BL': '192.131.134.0/24',
3072 'BM': '196.12.64.0/18',
3073 'BN': '156.31.0.0/16',
3074 'BO': '161.56.0.0/16',
3075 'BQ': '161.0.80.0/20',
3076 'BR': '152.240.0.0/12',
3077 'BS': '24.51.64.0/18',
3078 'BT': '119.2.96.0/19',
3079 'BW': '168.167.0.0/16',
3080 'BY': '178.120.0.0/13',
3081 'BZ': '179.42.192.0/18',
3082 'CA': '99.224.0.0/11',
3083 'CD': '41.243.0.0/16',
3084 'CF': '196.32.200.0/21',
3085 'CG': '197.214.128.0/17',
3086 'CH': '85.0.0.0/13',
3087 'CI': '154.232.0.0/14',
3088 'CK': '202.65.32.0/19',
3089 'CL': '152.172.0.0/14',
3090 'CM': '165.210.0.0/15',
3091 'CN': '36.128.0.0/10',
3092 'CO': '181.240.0.0/12',
3093 'CR': '201.192.0.0/12',
3094 'CU': '152.206.0.0/15',
3095 'CV': '165.90.96.0/19',
3096 'CW': '190.88.128.0/17',
3097 'CY': '46.198.0.0/15',
3098 'CZ': '88.100.0.0/14',
3100 'DJ': '197.241.0.0/17',
3101 'DK': '87.48.0.0/12',
3102 'DM': '192.243.48.0/20',
3103 'DO': '152.166.0.0/15',
3104 'DZ': '41.96.0.0/12',
3105 'EC': '186.68.0.0/15',
3106 'EE': '90.190.0.0/15',
3107 'EG': '156.160.0.0/11',
3108 'ER': '196.200.96.0/20',
3109 'ES': '88.0.0.0/11',
3110 'ET': '196.188.0.0/14',
3111 'EU': '2.16.0.0/13',
3112 'FI': '91.152.0.0/13',
3113 'FJ': '144.120.0.0/16',
3114 'FM': '119.252.112.0/20',
3115 'FO': '88.85.32.0/19',
3117 'GA': '41.158.0.0/15',
3119 'GD': '74.122.88.0/21',
3120 'GE': '31.146.0.0/16',
3121 'GF': '161.22.64.0/18',
3122 'GG': '62.68.160.0/19',
3123 'GH': '45.208.0.0/14',
3124 'GI': '85.115.128.0/19',
3125 'GL': '88.83.0.0/19',
3126 'GM': '160.182.0.0/15',
3127 'GN': '197.149.192.0/18',
3128 'GP': '104.250.0.0/19',
3129 'GQ': '105.235.224.0/20',
3130 'GR': '94.64.0.0/13',
3131 'GT': '168.234.0.0/16',
3132 'GU': '168.123.0.0/16',
3133 'GW': '197.214.80.0/20',
3134 'GY': '181.41.64.0/18',
3135 'HK': '113.252.0.0/14',
3136 'HN': '181.210.0.0/16',
3137 'HR': '93.136.0.0/13',
3138 'HT': '148.102.128.0/17',
3139 'HU': '84.0.0.0/14',
3140 'ID': '39.192.0.0/10',
3141 'IE': '87.32.0.0/12',
3142 'IL': '79.176.0.0/13',
3143 'IM': '5.62.80.0/20',
3144 'IN': '117.192.0.0/10',
3145 'IO': '203.83.48.0/21',
3146 'IQ': '37.236.0.0/14',
3147 'IR': '2.176.0.0/12',
3148 'IS': '82.221.0.0/16',
3149 'IT': '79.0.0.0/10',
3150 'JE': '87.244.64.0/18',
3151 'JM': '72.27.0.0/17',
3152 'JO': '176.29.0.0/16',
3153 'JP': '126.0.0.0/8',
3154 'KE': '105.48.0.0/12',
3155 'KG': '158.181.128.0/17',
3156 'KH': '36.37.128.0/17',
3157 'KI': '103.25.140.0/22',
3158 'KM': '197.255.224.0/20',
3159 'KN': '198.32.32.0/19',
3160 'KP': '175.45.176.0/22',
3161 'KR': '175.192.0.0/10',
3162 'KW': '37.36.0.0/14',
3163 'KY': '64.96.0.0/15',
3164 'KZ': '2.72.0.0/13',
3165 'LA': '115.84.64.0/18',
3166 'LB': '178.135.0.0/16',
3167 'LC': '192.147.231.0/24',
3168 'LI': '82.117.0.0/19',
3169 'LK': '112.134.0.0/15',
3170 'LR': '41.86.0.0/19',
3171 'LS': '129.232.0.0/17',
3172 'LT': '78.56.0.0/13',
3173 'LU': '188.42.0.0/16',
3174 'LV': '46.109.0.0/16',
3175 'LY': '41.252.0.0/14',
3176 'MA': '105.128.0.0/11',
3177 'MC': '88.209.64.0/18',
3178 'MD': '37.246.0.0/16',
3179 'ME': '178.175.0.0/17',
3180 'MF': '74.112.232.0/21',
3181 'MG': '154.126.0.0/17',
3182 'MH': '117.103.88.0/21',
3183 'MK': '77.28.0.0/15',
3184 'ML': '154.118.128.0/18',
3185 'MM': '37.111.0.0/17',
3186 'MN': '49.0.128.0/17',
3187 'MO': '60.246.0.0/16',
3188 'MP': '202.88.64.0/20',
3189 'MQ': '109.203.224.0/19',
3190 'MR': '41.188.64.0/18',
3191 'MS': '208.90.112.0/22',
3192 'MT': '46.11.0.0/16',
3193 'MU': '105.16.0.0/12',
3194 'MV': '27.114.128.0/18',
3195 'MW': '105.234.0.0/16',
3196 'MX': '187.192.0.0/11',
3197 'MY': '175.136.0.0/13',
3198 'MZ': '197.218.0.0/15',
3199 'NA': '41.182.0.0/16',
3200 'NC': '101.101.0.0/18',
3201 'NE': '197.214.0.0/18',
3202 'NF': '203.17.240.0/22',
3203 'NG': '105.112.0.0/12',
3204 'NI': '186.76.0.0/15',
3205 'NL': '145.96.0.0/11',
3206 'NO': '84.208.0.0/13',
3207 'NP': '36.252.0.0/15',
3208 'NR': '203.98.224.0/19',
3209 'NU': '49.156.48.0/22',
3210 'NZ': '49.224.0.0/14',
3211 'OM': '5.36.0.0/15',
3212 'PA': '186.72.0.0/15',
3213 'PE': '186.160.0.0/14',
3214 'PF': '123.50.64.0/18',
3215 'PG': '124.240.192.0/19',
3216 'PH': '49.144.0.0/13',
3217 'PK': '39.32.0.0/11',
3218 'PL': '83.0.0.0/11',
3219 'PM': '70.36.0.0/20',
3220 'PR': '66.50.0.0/16',
3221 'PS': '188.161.0.0/16',
3222 'PT': '85.240.0.0/13',
3223 'PW': '202.124.224.0/20',
3224 'PY': '181.120.0.0/14',
3225 'QA': '37.210.0.0/15',
3226 'RE': '139.26.0.0/16',
3227 'RO': '79.112.0.0/13',
3228 'RS': '178.220.0.0/14',
3229 'RU': '5.136.0.0/13',
3230 'RW': '105.178.0.0/15',
3231 'SA': '188.48.0.0/13',
3232 'SB': '202.1.160.0/19',
3233 'SC': '154.192.0.0/11',
3234 'SD': '154.96.0.0/13',
3235 'SE': '78.64.0.0/12',
3236 'SG': '152.56.0.0/14',
3237 'SI': '188.196.0.0/14',
3238 'SK': '78.98.0.0/15',
3239 'SL': '197.215.0.0/17',
3240 'SM': '89.186.32.0/19',
3241 'SN': '41.82.0.0/15',
3242 'SO': '197.220.64.0/19',
3243 'SR': '186.179.128.0/17',
3244 'SS': '105.235.208.0/21',
3245 'ST': '197.159.160.0/19',
3246 'SV': '168.243.0.0/16',
3247 'SX': '190.102.0.0/20',
3249 'SZ': '41.84.224.0/19',
3250 'TC': '65.255.48.0/20',
3251 'TD': '154.68.128.0/19',
3252 'TG': '196.168.0.0/14',
3253 'TH': '171.96.0.0/13',
3254 'TJ': '85.9.128.0/18',
3255 'TK': '27.96.24.0/21',
3256 'TL': '180.189.160.0/20',
3257 'TM': '95.85.96.0/19',
3258 'TN': '197.0.0.0/11',
3259 'TO': '175.176.144.0/21',
3260 'TR': '78.160.0.0/11',
3261 'TT': '186.44.0.0/15',
3262 'TV': '202.2.96.0/19',
3263 'TW': '120.96.0.0/11',
3264 'TZ': '156.156.0.0/14',
3265 'UA': '93.72.0.0/13',
3266 'UG': '154.224.0.0/13',
3268 'UY': '167.56.0.0/13',
3269 'UZ': '82.215.64.0/18',
3270 'VA': '212.77.0.0/19',
3271 'VC': '24.92.144.0/20',
3272 'VE': '186.88.0.0/13',
3273 'VG': '172.103.64.0/18',
3274 'VI': '146.226.0.0/16',
3275 'VN': '14.160.0.0/11',
3276 'VU': '202.80.32.0/20',
3277 'WF': '117.20.32.0/21',
3278 'WS': '202.4.32.0/19',
3279 'YE': '134.35.0.0/16',
3280 'YT': '41.242.116.0/22',
3281 'ZA': '41.0.0.0/11',
3282 'ZM': '165.56.0.0/13',
3283 'ZW': '41.85.192.0/19',
3287 def random_ipv4(cls, code):
3288 block = cls._country_ip_map.get(code.upper())
3291 addr, preflen = block.split('/')
3292 addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
3293 addr_max = addr_min | (0xffffffff >> int(preflen))
3294 return compat_str(socket.inet_ntoa(
3295 compat_struct_pack('!L', random.randint(addr_min, addr_max))))
3298 class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
def __init__(self, proxies=None):
    """Install default http/https open handlers that delegate to proxy_open.

    proxies: optional proxy mapping, passed through to the base ProxyHandler.
    """
    # Register a default handler per scheme; each forwards to proxy_open.
    # The scheme and the bound method are captured via default arguments so
    # the lambdas do not suffer from the late-binding-closure pitfall inside
    # the loop.  The loop variable is named 'scheme' (not 'type') to avoid
    # shadowing the builtin.
    for scheme in ('http', 'https'):
        setattr(self, '%s_open' % scheme,
                lambda r, proxy='__noproxy__', type=scheme, meth=self.proxy_open:
                    meth(r, proxy, type))
    # An initializer should not return a value; invoke the base initializer
    # plainly instead of 'return base.__init__(...)'.
    compat_urllib_request.ProxyHandler.__init__(self, proxies)
3307 def proxy_open(self, req, proxy, type):
3308 req_proxy = req.headers.get('Ytdl-request-proxy')
3309 if req_proxy is not None:
3311 del req.headers['Ytdl-request-proxy']
3313 if proxy == '__noproxy__':
3314 return None # No Proxy
3315 if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
3316 req.add_header('Ytdl-socks-proxy', proxy)
3317 # youtube-dl's http/https handlers do wrapping the socket with socks
3319 return compat_urllib_request.ProxyHandler.proxy_open(
3320 self, req, proxy, type)
3323 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
3324 # released into Public Domain
3325 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
3327 def long_to_bytes(n, blocksize=0):
3328 """long_to_bytes(n:long, blocksize:int) : string
3329 Convert a long integer to a byte string.
3331 If optional blocksize is given and greater than zero, pad the front of the
3332 byte string with binary zeros so that the length is a multiple of
3335 # after much testing, this algorithm was deemed to be the fastest
3339 s = compat_struct_pack('>I', n & 0xffffffff) + s
3341 # strip off leading zeros
3342 for i in range(len(s)):
3343 if s[i] != b'\000'[0]:
3346 # only happens when n == 0
3350 # add back some pad bytes. this could be done more efficiently w.r.t. the
3351 # de-padding being done above, but sigh...
3352 if blocksize > 0 and len(s) % blocksize:
3353 s = (blocksize - len(s) % blocksize) * b'\000' + s
3357 def bytes_to_long(s):
3358 """bytes_to_long(string) : long
3359 Convert a byte string to a long integer.
3361 This is (essentially) the inverse of long_to_bytes().
3366 extra = (4 - length % 4)
3367 s = b'\000' * extra + s
3368 length = length + extra
3369 for i in range(0, length, 4):
3370 acc = (acc << 32) + compat_struct_unpack('>I', s[i:i + 4])[0]
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''
    # Reversing the bytes and reading the hex digits big-endian interprets
    # the original data as a little-endian integer.
    reversed_hex = binascii.hexlify(data[::-1])
    payload = int(reversed_hex, 16)
    # Textbook RSA: payload ** exponent mod modulus, rendered as lowercase hex.
    return '%x' % pow(payload, exponent, modulus)
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    @param {int[]} data input data
    @param {int} length target length
    @returns {int[]} padded data
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # PKCS#1 v1.5 (RFC 2313, section 8.1) requires the padding string PS to
    # consist of *nonzero* pseudo-random octets: a zero byte would be taken
    # for the 0x00 separator and truncate the message on decryption.  Draw
    # from 1..255 rather than 0..254.
    pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
    # Encryption block layout: 00 | 02 | PS | 00 | data
    return [0, 2] + pseudo_random + [0] + data
3406 def encode_base_n(num, n, table=None):
3407 FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
3409 table = FULL_TABLE[:n]
3412 raise ValueError('base %d exceeds table length %d' % (n, len(table)))
3419 ret = table[num % n] + ret
3424 def decode_packed_codes(code):
3425 mobj = re.search(PACKED_CODES_RE, code)
3426 obfucasted_code, base, count, symbols = mobj.groups()
3429 symbols = symbols.split('|')
3434 base_n_count = encode_base_n(count, base)
3435 symbol_table[base_n_count] = symbols[count] or base_n_count
3438 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
3442 def parse_m3u8_attributes(attrib):
3444 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
3445 if val.startswith('"'):
def urshift(val, n):
    """Logical (unsigned) right shift of a 32-bit value by n bits."""
    # Python's >> is arithmetic; map a negative value onto its unsigned
    # 32-bit representation first so the shift fills with zeros.
    if val < 0:
        val += 0x100000000
    return val >> n
3455 # Based on png2str() written by @gdkchan and improved by @yokrysty
3456 # Originally posted at https://github.com/rg3/youtube-dl/issues/9706
3457 def decode_png(png_data):
3458 # Reference: https://www.w3.org/TR/PNG/
3459 header = png_data[8:]
3461 if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
3462 raise IOError('Not a valid PNG file.')
3464 int_map = {1: '>B', 2: '>H', 4: '>I'}
3465 unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]
3470 length = unpack_integer(header[:4])
3473 chunk_type = header[:4]
3476 chunk_data = header[:length]
3477 header = header[length:]
3479 header = header[4:] # Skip CRC
3487 ihdr = chunks[0]['data']
3489 width = unpack_integer(ihdr[:4])
3490 height = unpack_integer(ihdr[4:8])
3494 for chunk in chunks:
3495 if chunk['type'] == b'IDAT':
3496 idat += chunk['data']
3499 raise IOError('Unable to read PNG data.')
3501 decompressed_data = bytearray(zlib.decompress(idat))
3506 def _get_pixel(idx):
3511 for y in range(height):
3512 basePos = y * (1 + stride)
3513 filter_type = decompressed_data[basePos]
3517 pixels.append(current_row)
3519 for x in range(stride):
3520 color = decompressed_data[1 + basePos + x]
3521 basex = y * stride + x
3526 left = _get_pixel(basex - 3)
3528 up = _get_pixel(basex - stride)
3530 if filter_type == 1: # Sub
3531 color = (color + left) & 0xff
3532 elif filter_type == 2: # Up
3533 color = (color + up) & 0xff
3534 elif filter_type == 3: # Average
3535 color = (color + ((left + up) >> 1)) & 0xff
3536 elif filter_type == 4: # Paeth
3542 c = _get_pixel(basex - stride - 3)
3550 if pa <= pb and pa <= pc:
3551 color = (color + a) & 0xff
3553 color = (color + b) & 0xff
3555 color = (color + c) & 0xff
3557 current_row.append(color)
3559 return width, height, pixels
3562 def write_xattr(path, key, value):
3563 # This mess below finds the best xattr tool for the job
3565 # try the pyxattr module...
3568 if hasattr(xattr, 'set'): # pyxattr
3569 # Unicode arguments are not supported in python-pyxattr until
3571 # See https://github.com/rg3/youtube-dl/issues/5498
3572 pyxattr_required_version = '0.5.0'
3573 if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
3574 # TODO: fallback to CLI tools
3575 raise XAttrUnavailableError(
3576 'python-pyxattr is detected but is too old. '
3577 'youtube-dl requires %s or above while your version is %s. '
3578 'Falling back to other xattr implementations' % (
3579 pyxattr_required_version, xattr.__version__))
3581 setxattr = xattr.set
3583 setxattr = xattr.setxattr
3586 setxattr(path, key, value)
3587 except EnvironmentError as e:
3588 raise XAttrMetadataError(e.errno, e.strerror)
3591 if compat_os_name == 'nt':
3592 # Write xattrs to NTFS Alternate Data Streams:
3593 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
3594 assert ':' not in key
3595 assert os.path.exists(path)
3597 ads_fn = path + ':' + key
3599 with open(ads_fn, 'wb') as f:
3601 except EnvironmentError as e:
3602 raise XAttrMetadataError(e.errno, e.strerror)
3604 user_has_setfattr = check_executable('setfattr', ['--version'])
3605 user_has_xattr = check_executable('xattr', ['-h'])
3607 if user_has_setfattr or user_has_xattr:
3609 value = value.decode('utf-8')
3610 if user_has_setfattr:
3611 executable = 'setfattr'
3612 opts = ['-n', key, '-v', value]
3613 elif user_has_xattr:
3614 executable = 'xattr'
3615 opts = ['-w', key, value]
3617 cmd = ([encodeFilename(executable, True)] +
3618 [encodeArgument(o) for o in opts] +
3619 [encodeFilename(path, True)])
3622 p = subprocess.Popen(
3623 cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
3624 except EnvironmentError as e:
3625 raise XAttrMetadataError(e.errno, e.strerror)
3626 stdout, stderr = p.communicate()
3627 stderr = stderr.decode('utf-8', 'replace')
3628 if p.returncode != 0:
3629 raise XAttrMetadataError(p.returncode, stderr)
3632 # On Unix, and can't find pyxattr, setfattr, or xattr.
3633 if sys.platform.startswith('linux'):
3634 raise XAttrUnavailableError(
3635 "Couldn't find a tool to set the xattrs. "
3636 "Install either the python 'pyxattr' or 'xattr' "
3637 "modules, or the GNU 'attr' package "
3638 "(which contains the 'setfattr' tool).")
3640 raise XAttrUnavailableError(
3641 "Couldn't find a tool to set the xattrs. "
3642 "Install either the python 'xattr' module, "
3643 "or the 'xattr' binary.")