4 from __future__ import unicode_literals
33 import xml.etree.ElementTree
40 compat_etree_fromstring,
42 compat_html_entities_html5,
48 compat_socket_create_connection,
54 compat_urllib_parse_urlencode,
55 compat_urllib_parse_urlparse,
56 compat_urllib_parse_unquote_plus,
57 compat_urllib_request,
def register_socks_protocols():
    """Make urlparse treat SOCKS schemes as netloc-carrying protocols.

    In Python < 2.6.5 urlsplit() suffers from
    https://bugs.python.org/issue7904: URLs whose scheme is absent from
    urlparse.uses_netloc are not handled correctly, so the SOCKS schemes
    are appended there once.
    """
    registered = compat_urlparse.uses_netloc
    for proto in ('socks', 'socks4', 'socks4a', 'socks5'):
        if proto not in registered:
            registered.append(proto)
77 # This is not clearly defined otherwise
78 compiled_regex_type = type(re.compile(''))
81 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/47.0 (Chrome)',
82 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
83 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
84 'Accept-Encoding': 'gzip, deflate',
85 'Accept-Language': 'en-us,en;q=0.5',
90 'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
# English month names in calendar order (index 0 == January).
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']
101 'en': ENGLISH_MONTH_NAMES,
103 'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
104 'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
108 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
109 'flv', 'f4v', 'f4a', 'f4b',
110 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
111 'mkv', 'mka', 'mk3d',
120 'f4f', 'f4m', 'm3u8', 'smil')
122 # needed for sanitizing filenames in restricted mode
123 ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
124 itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
125 'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))
148 '%Y-%m-%d %H:%M:%S.%f',
151 '%Y-%m-%dT%H:%M:%SZ',
152 '%Y-%m-%dT%H:%M:%S.%fZ',
153 '%Y-%m-%dT%H:%M:%S.%f0Z',
155 '%Y-%m-%dT%H:%M:%S.%f',
158 '%b %d %Y at %H:%M:%S',
161 DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
162 DATE_FORMATS_DAY_FIRST.extend([
171 DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
172 DATE_FORMATS_MONTH_FIRST.extend([
180 PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
183 def preferredencoding():
184 """Get preferred encoding.
186 Returns the best encoding scheme for the system, based on
187 locale.getpreferredencoding() and some further tweaks.
190 pref = locale.getpreferredencoding()
198 def write_json_file(obj, fn):
199 """ Encode obj as JSON and write it to fn, atomically if possible """
201 fn = encodeFilename(fn)
202 if sys.version_info < (3, 0) and sys.platform != 'win32':
203 encoding = get_filesystem_encoding()
204 # os.path.basename returns a bytes object, but NamedTemporaryFile
205 # will fail if the filename contains non ascii characters unless we
206 # use a unicode object
207 path_basename = lambda f: os.path.basename(fn).decode(encoding)
208 # the same for os.path.dirname
209 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
211 path_basename = os.path.basename
212 path_dirname = os.path.dirname
216 'prefix': path_basename(fn) + '.',
217 'dir': path_dirname(fn),
221 # In Python 2.x, json.dump expects a bytestream.
222 # In Python 3.x, it writes to a character stream
223 if sys.version_info < (3, 0):
231 tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
236 if sys.platform == 'win32':
237 # Need to remove existing file on Windows, else os.rename raises
238 # WindowsError or FileExistsError.
243 os.rename(tf.name, fn)
252 if sys.version_info >= (2, 7):
253 def find_xpath_attr(node, xpath, key, val=None):
254 """ Find the xpath xpath[@key=val] """
255 assert re.match(r'^[a-zA-Z_-]+$', key)
256 expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
257 return node.find(expr)
259 def find_xpath_attr(node, xpath, key, val=None):
260 for f in node.findall(compat_xpath(xpath)):
261 if key not in f.attrib:
263 if val is None or f.attrib.get(key) == val:
267 # On python2.6 the xml.etree.ElementTree.Element methods don't support
268 # the namespace parameter
271 def xpath_with_ns(path, ns_map):
272 components = [c.split(':') for c in path.split('/')]
276 replaced.append(c[0])
279 replaced.append('{%s}%s' % (ns_map[ns], tag))
280 return '/'.join(replaced)
283 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
284 def _find_xpath(xpath):
285 return node.find(compat_xpath(xpath))
287 if isinstance(xpath, (str, compat_str)):
288 n = _find_xpath(xpath)
296 if default is not NO_DEFAULT:
299 name = xpath if name is None else name
300 raise ExtractorError('Could not find XML element %s' % name)
306 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
307 n = xpath_element(node, xpath, name, fatal=fatal, default=default)
308 if n is None or n == default:
311 if default is not NO_DEFAULT:
314 name = xpath if name is None else name
315 raise ExtractorError('Could not find XML element\'s text %s' % name)
321 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
322 n = find_xpath_attr(node, xpath, key)
324 if default is not NO_DEFAULT:
327 name = '%s[@%s]' % (xpath, key) if name is None else name
328 raise ExtractorError('Could not find XML attribute %s' % name)
def get_element_by_id(id, html):
    """Return the content of the tag whose id attribute equals *id*.

    Returns None when no such tag exists in *html*.
    """
    return get_element_by_attribute('id', id, html)
def get_element_by_class(class_name, html):
    """Return the content of the first tag with class *class_name*, or None."""
    matches = get_elements_by_class(class_name, html)
    if not matches:
        return None
    return matches[0]
def get_element_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of the first tag whose *attribute* matches *value*.

    Returns None when nothing matches; *escape_value* is forwarded to
    get_elements_by_attribute.
    """
    found = get_elements_by_attribute(attribute, value, html, escape_value)
    if not found:
        return None
    return found[0]
def get_elements_by_class(class_name, html):
    """Return the contents of all tags with class *class_name*, as a list."""
    # Match the class name as a whole word anywhere inside the quoted
    # class attribute value; the value is passed pre-escaped, so
    # escape_value=False.
    value_re = r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name)
    return get_elements_by_attribute('class', value_re, html, escape_value=False)
357 def get_elements_by_attribute(attribute, value, html, escape_value=True):
358 """Return the content of the tag with the specified attribute in the passed HTML document"""
360 value = re.escape(value) if escape_value else value
363 for m in re.finditer(r'''(?xs)
365 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
367 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
371 ''' % (re.escape(attribute), value), html):
372 res = m.group('content')
374 if res.startswith('"') or res.startswith("'"):
377 retlist.append(unescapeHTML(res))
382 class HTMLAttributeParser(compat_HTMLParser):
383 """Trivial HTML parser to gather the attributes for a single element"""
386 compat_HTMLParser.__init__(self)
388 def handle_starttag(self, tag, attrs):
389 self.attrs = dict(attrs)
392 def extract_attributes(html_element):
393 """Given a string for an HTML element such as
395 a="foo" B="bar" c="&98;az" d=boz
396 empty= noval entity="&"
399 Decode and return a dictionary of attributes.
401 'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
402 'empty': '', 'noval': None, 'entity': '&',
403 'sq': '"', 'dq': '\''
405 NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
406 but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
408 parser = HTMLAttributeParser()
409 parser.feed(html_element)
414 def clean_html(html):
415 """Clean an HTML snippet into a readable string"""
417 if html is None: # Convenience for sanitizing descriptions etc.
421 html = html.replace('\n', ' ')
422 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
423 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
425 html = re.sub('<.*?>', '', html)
426 # Replace html entities
427 html = unescapeHTML(html)
431 def sanitize_open(filename, open_mode):
432 """Try to open the given filename, and slightly tweak it if this fails.
434 Attempts to open the given filename. If this fails, it tries to change
435 the filename slightly, step by step, until it's either able to open it
436 or it fails and raises a final exception, like the standard open()
439 It returns the tuple (stream, definitive_file_name).
443 if sys.platform == 'win32':
445 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
446 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
447 stream = open(encodeFilename(filename), open_mode)
448 return (stream, filename)
449 except (IOError, OSError) as err:
450 if err.errno in (errno.EACCES,):
453 # In case of error, try to remove win32 forbidden chars
454 alt_filename = sanitize_path(filename)
455 if alt_filename == filename:
458 # An exception here should be caught in the caller
459 stream = open(encodeFilename(alt_filename), open_mode)
460 return (stream, alt_filename)
463 def timeconvert(timestr):
464 """Convert RFC 2822 defined time string into system timestamp"""
466 timetuple = email.utils.parsedate_tz(timestr)
467 if timetuple is not None:
468 timestamp = email.utils.mktime_tz(timetuple)
472 def sanitize_filename(s, restricted=False, is_id=False):
473 """Sanitizes a string so it could be used as part of a filename.
474 If restricted is set, use a stricter subset of allowed characters.
475 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
477 def replace_insane(char):
478 if restricted and char in ACCENT_CHARS:
479 return ACCENT_CHARS[char]
480 if char == '?' or ord(char) < 32 or ord(char) == 127:
483 return '' if restricted else '\''
485 return '_-' if restricted else ' -'
486 elif char in '\\/|*<>':
488 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
490 if restricted and ord(char) > 127:
495 s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
496 result = ''.join(map(replace_insane, s))
498 while '__' in result:
499 result = result.replace('__', '_')
500 result = result.strip('_')
501 # Common case of "Foreign band name - English song title"
502 if restricted and result.startswith('-_'):
504 if result.startswith('-'):
505 result = '_' + result[len('-'):]
506 result = result.lstrip('.')
512 def sanitize_path(s):
513 """Sanitizes and normalizes path on Windows"""
514 if sys.platform != 'win32':
516 drive_or_unc, _ = os.path.splitdrive(s)
517 if sys.version_info < (2, 7) and not drive_or_unc:
518 drive_or_unc, _ = os.path.splitunc(s)
519 norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
523 path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
524 for path_part in norm_path]
526 sanitized_path.insert(0, drive_or_unc + os.path.sep)
527 return os.path.join(*sanitized_path)
530 # Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
531 # unwanted failures due to missing protocol
def sanitize_url(url):
    """Prepend the http: scheme to scheme-relative ('//host/...') URLs.

    Mitigates failures caused by URLs that omit the protocol entirely;
    any other URL is returned unchanged.
    """
    if url.startswith('//'):
        return 'http:%s' % url
    return url
def sanitized_Request(url, *args, **kwargs):
    # Build a urllib Request, first running the URL through sanitize_url()
    # so scheme-relative URLs get an explicit http: scheme.
    return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs)
540 def orderedSet(iterable):
541 """ Remove all duplicates from the input iterable """
549 def _htmlentity_transform(entity_with_semicolon):
550 """Transforms an HTML entity to a character."""
551 entity = entity_with_semicolon[:-1]
553 # Known non-numeric HTML entity
554 if entity in compat_html_entities.name2codepoint:
555 return compat_chr(compat_html_entities.name2codepoint[entity])
557 # TODO: HTML5 allows entities without a semicolon. For example,
558 # 'Éric' should be decoded as 'Éric'.
559 if entity_with_semicolon in compat_html_entities_html5:
560 return compat_html_entities_html5[entity_with_semicolon]
562 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
564 numstr = mobj.group(1)
565 if numstr.startswith('x'):
567 numstr = '0%s' % numstr
570 # See https://github.com/rg3/youtube-dl/issues/7518
572 return compat_chr(int(numstr, base))
576 # Unknown entity in name, return its literal representation
577 return '&%s;' % entity
583 assert type(s) == compat_str
586 r'&([^;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
589 def get_subprocess_encoding():
590 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
591 # For subprocess calls, encode with locale encoding
592 # Refer to http://stackoverflow.com/a/9951851/35070
593 encoding = preferredencoding()
595 encoding = sys.getfilesystemencoding()
601 def encodeFilename(s, for_subprocess=False):
603 @param s The name of the file
606 assert type(s) == compat_str
608 # Python 3 has a Unicode API
609 if sys.version_info >= (3, 0):
612 # Pass '' directly to use Unicode APIs on Windows 2000 and up
613 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
614 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
615 if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
618 # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
619 if sys.platform.startswith('java'):
622 return s.encode(get_subprocess_encoding(), 'ignore')
625 def decodeFilename(b, for_subprocess=False):
627 if sys.version_info >= (3, 0):
630 if not isinstance(b, bytes):
633 return b.decode(get_subprocess_encoding(), 'ignore')
def encodeArgument(s):
    """Encode the text string s for passing to a subprocess.

    Delegates to encodeFilename(s, True), i.e. with for_subprocess=True.
    """
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, True)
def decodeArgument(b):
    # Inverse of encodeArgument: decode a subprocess argument via
    # decodeFilename with for_subprocess=True.
    return decodeFilename(b, True)
649 def decodeOption(optval):
652 if isinstance(optval, bytes):
653 optval = optval.decode(preferredencoding())
655 assert isinstance(optval, compat_str)
659 def formatSeconds(secs):
661 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
663 return '%d:%02d' % (secs // 60, secs % 60)
668 def make_HTTPS_handler(params, **kwargs):
669 opts_no_check_certificate = params.get('nocheckcertificate', False)
670 if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9
671 context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
672 if opts_no_check_certificate:
673 context.check_hostname = False
674 context.verify_mode = ssl.CERT_NONE
676 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
679 # (create_default_context present but HTTPSHandler has no context=)
682 if sys.version_info < (3, 2):
683 return YoutubeDLHTTPSHandler(params, **kwargs)
685 context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
686 context.verify_mode = (ssl.CERT_NONE
687 if opts_no_check_certificate
688 else ssl.CERT_REQUIRED)
689 context.set_default_verify_paths()
690 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
693 def bug_reports_message():
694 if ytdl_is_updateable():
695 update_cmd = 'type youtube-dl -U to update'
697 update_cmd = 'see https://yt-dl.org/update on how to update'
698 msg = '; please report this issue on https://yt-dl.org/bug .'
699 msg += ' Make sure you are using the latest version; %s.' % update_cmd
700 msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
704 class YoutubeDLError(Exception):
705 """Base exception for YoutubeDL errors."""
709 class ExtractorError(YoutubeDLError):
710 """Error during info extraction."""
712 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
713 """ tb, if given, is the original traceback (so that it can be printed out).
714 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
717 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
719 if video_id is not None:
720 msg = video_id + ': ' + msg
722 msg += ' (caused by %r)' % cause
724 msg += bug_reports_message()
725 super(ExtractorError, self).__init__(msg)
728 self.exc_info = sys.exc_info() # preserve original exception
730 self.video_id = video_id
732 def format_traceback(self):
733 if self.traceback is None:
735 return ''.join(traceback.format_tb(self.traceback))
class UnsupportedError(ExtractorError):
    """ExtractorError raised for URLs that are not supported.

    Marked expected=True, so it is reported as a normal error rather
    than a bug in youtube-dl.
    """
    def __init__(self, url):
        super(UnsupportedError, self).__init__(
            'Unsupported URL: %s' % url, expected=True)
745 class RegexNotFoundError(ExtractorError):
746 """Error when a regex didn't match"""
750 class DownloadError(YoutubeDLError):
751 """Download Error exception.
753 This exception may be thrown by FileDownloader objects if they are not
754 configured to continue on errors. They will contain the appropriate
758 def __init__(self, msg, exc_info=None):
759 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
760 super(DownloadError, self).__init__(msg)
761 self.exc_info = exc_info
764 class SameFileError(YoutubeDLError):
765 """Same File exception.
767 This exception will be thrown by FileDownloader objects if they detect
768 multiple files would have to be downloaded to the same file on disk.
773 class PostProcessingError(YoutubeDLError):
774 """Post Processing exception.
776 This exception may be raised by PostProcessor's .run() method to
777 indicate an error in the postprocessing task.
780 def __init__(self, msg):
781 super(PostProcessingError, self).__init__(msg)
785 class MaxDownloadsReached(YoutubeDLError):
786 """ --max-downloads limit has been reached. """
790 class UnavailableVideoError(YoutubeDLError):
791 """Unavailable Format exception.
793 This exception will be thrown when a video is requested
794 in a format that is not available for that video.
799 class ContentTooShortError(YoutubeDLError):
800 """Content Too Short exception.
802 This exception may be raised by FileDownloader objects when a file they
803 download is too small for what the server announced first, indicating
804 the connection was probably interrupted.
807 def __init__(self, downloaded, expected):
808 super(ContentTooShortError, self).__init__(
809 'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected)
812 self.downloaded = downloaded
813 self.expected = expected
816 class XAttrMetadataError(YoutubeDLError):
817 def __init__(self, code=None, msg='Unknown error'):
818 super(XAttrMetadataError, self).__init__(msg)
822 # Parsing code and msg
823 if (self.code in (errno.ENOSPC, errno.EDQUOT) or
824 'No space left' in self.msg or 'Disk quota excedded' in self.msg):
825 self.reason = 'NO_SPACE'
826 elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
827 self.reason = 'VALUE_TOO_LONG'
829 self.reason = 'NOT_SUPPORTED'
832 class XAttrUnavailableError(YoutubeDLError):
836 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
837 # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
838 # expected HTTP responses to meet HTTP/1.0 or later (see also
839 # https://github.com/rg3/youtube-dl/issues/6727)
840 if sys.version_info < (3, 0):
841 kwargs[b'strict'] = True
842 hc = http_class(*args, **kwargs)
843 source_address = ydl_handler._params.get('source_address')
844 if source_address is not None:
845 sa = (source_address, 0)
846 if hasattr(hc, 'source_address'): # Python 2.7+
847 hc.source_address = sa
849 def _hc_connect(self, *args, **kwargs):
850 sock = compat_socket_create_connection(
851 (self.host, self.port), self.timeout, sa)
853 self.sock = ssl.wrap_socket(
854 sock, self.key_file, self.cert_file,
855 ssl_version=ssl.PROTOCOL_TLSv1)
858 hc.connect = functools.partial(_hc_connect, hc)
def handle_youtubedl_headers(headers):
    """Process internal Youtubedl-* pseudo headers before a real request.

    The presence of 'Youtubedl-no-compression' removes any
    Accept-Encoding header; the pseudo header itself is dropped as well.
    A headers dict without it is returned unchanged.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers
    cleaned = dict((name, value) for name, value in headers.items()
                   if name.lower() != 'accept-encoding')
    del cleaned['Youtubedl-no-compression']
    return cleaned
873 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
874 """Handler for HTTP requests and responses.
876 This class, when installed with an OpenerDirector, automatically adds
877 the standard headers to every HTTP request and handles gzipped and
878 deflated responses from web servers. If compression is to be avoided in
879 a particular request, the original request in the program code only has
880 to include the HTTP header "Youtubedl-no-compression", which will be
881 removed before making the real request.
883 Part of this code was copied from:
885 http://techknack.net/python-urllib2-handlers/
887 Andrew Rowls, the author of that code, agreed to release it to the
    def __init__(self, params, *args, **kwargs):
        # Standard HTTPHandler initialization, plus a reference to the
        # YoutubeDL params dict for later use by the handler.
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params
895 def http_open(self, req):
896 conn_class = compat_http_client.HTTPConnection
898 socks_proxy = req.headers.get('Ytdl-socks-proxy')
900 conn_class = make_socks_conn_class(conn_class, socks_proxy)
901 del req.headers['Ytdl-socks-proxy']
903 return self.do_open(functools.partial(
904 _create_http_connection, self, conn_class, False),
910 return zlib.decompress(data, -zlib.MAX_WBITS)
912 return zlib.decompress(data)
915 def addinfourl_wrapper(stream, headers, url, code):
916 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
917 return compat_urllib_request.addinfourl(stream, headers, url, code)
918 ret = compat_urllib_request.addinfourl(stream, headers, url)
922 def http_request(self, req):
923 # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
924 # always respected by websites, some tend to give out URLs with non percent-encoded
925 # non-ASCII characters (see telemb.py, ard.py [#3412])
926 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
927 # To work around aforementioned issue we will replace request's original URL with
928 # percent-encoded one
929 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
930 # the code of this workaround has been moved here from YoutubeDL.urlopen()
931 url = req.get_full_url()
932 url_escaped = escape_url(url)
934 # Substitute URL if any change after escaping
935 if url != url_escaped:
936 req = update_Request(req, url=url_escaped)
938 for h, v in std_headers.items():
939 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
940 # The dict keys are capitalized because of this bug by urllib
941 if h.capitalize() not in req.headers:
944 req.headers = handle_youtubedl_headers(req.headers)
946 if sys.version_info < (2, 7) and '#' in req.get_full_url():
947 # Python 2.6 is brain-dead when it comes to fragments
948 req._Request__original = req._Request__original.partition('#')[0]
949 req._Request__r_type = req._Request__r_type.partition('#')[0]
953 def http_response(self, req, resp):
956 if resp.headers.get('Content-encoding', '') == 'gzip':
957 content = resp.read()
958 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
960 uncompressed = io.BytesIO(gz.read())
961 except IOError as original_ioerror:
962 # There may be junk add the end of the file
963 # See http://stackoverflow.com/q/4928560/35070 for details
964 for i in range(1, 1024):
966 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
967 uncompressed = io.BytesIO(gz.read())
972 raise original_ioerror
973 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
974 resp.msg = old_resp.msg
975 del resp.headers['Content-encoding']
977 if resp.headers.get('Content-encoding', '') == 'deflate':
978 gz = io.BytesIO(self.deflate(resp.read()))
979 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
980 resp.msg = old_resp.msg
981 del resp.headers['Content-encoding']
982 # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
983 # https://github.com/rg3/youtube-dl/issues/6457).
984 if 300 <= resp.code < 400:
985 location = resp.headers.get('Location')
987 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
988 if sys.version_info >= (3, 0):
989 location = location.encode('iso-8859-1').decode('utf-8')
991 location = location.decode('utf-8')
992 location_escaped = escape_url(location)
993 if location != location_escaped:
994 del resp.headers['Location']
995 if sys.version_info < (3, 0):
996 location_escaped = location_escaped.encode('utf-8')
997 resp.headers['Location'] = location_escaped
1000 https_request = http_request
1001 https_response = http_response
1004 def make_socks_conn_class(base_class, socks_proxy):
1005 assert issubclass(base_class, (
1006 compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))
1008 url_components = compat_urlparse.urlparse(socks_proxy)
1009 if url_components.scheme.lower() == 'socks5':
1010 socks_type = ProxyType.SOCKS5
1011 elif url_components.scheme.lower() in ('socks', 'socks4'):
1012 socks_type = ProxyType.SOCKS4
1013 elif url_components.scheme.lower() == 'socks4a':
1014 socks_type = ProxyType.SOCKS4A
1016 def unquote_if_non_empty(s):
1019 return compat_urllib_parse_unquote_plus(s)
1023 url_components.hostname, url_components.port or 1080,
1025 unquote_if_non_empty(url_components.username),
1026 unquote_if_non_empty(url_components.password),
1029 class SocksConnection(base_class):
1031 self.sock = sockssocket()
1032 self.sock.setproxy(*proxy_args)
1033 if type(self.timeout) in (int, float):
1034 self.sock.settimeout(self.timeout)
1035 self.sock.connect((self.host, self.port))
1037 if isinstance(self, compat_http_client.HTTPSConnection):
1038 if hasattr(self, '_context'): # Python > 2.6
1039 self.sock = self._context.wrap_socket(
1040 self.sock, server_hostname=self.host)
1042 self.sock = ssl.wrap_socket(self.sock)
1044 return SocksConnection
1047 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        # Remember which HTTPSConnection class to use (defaulting to the
        # compat one) and keep the YoutubeDL params dict for later use.
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params
1053 def https_open(self, req):
1055 conn_class = self._https_conn_class
1057 if hasattr(self, '_context'): # python > 2.6
1058 kwargs['context'] = self._context
1059 if hasattr(self, '_check_hostname'): # python 3.x
1060 kwargs['check_hostname'] = self._check_hostname
1062 socks_proxy = req.headers.get('Ytdl-socks-proxy')
1064 conn_class = make_socks_conn_class(conn_class, socks_proxy)
1065 del req.headers['Ytdl-socks-proxy']
1067 return self.do_open(functools.partial(
1068 _create_http_connection, self, conn_class, True),
1072 class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    def __init__(self, cookiejar=None):
        # Plain delegation to HTTPCookieProcessor; cookiejar may be None.
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)
1076 def http_response(self, request, response):
1077 # Python 2 will choke on next HTTP request in row if there are non-ASCII
1078 # characters in Set-Cookie HTTP header of last response (see
1079 # https://github.com/rg3/youtube-dl/issues/6769).
1080 # In order to at least prevent crashing we will percent encode Set-Cookie
1081 # header before HTTPCookieProcessor starts processing it.
1082 # if sys.version_info < (3, 0) and response.headers:
1083 # for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
1084 # set_cookie = response.headers.get(set_cookie_header)
1086 # set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
1087 # if set_cookie != set_cookie_escaped:
1088 # del response.headers[set_cookie_header]
1089 # response.headers[set_cookie_header] = set_cookie_escaped
1090 return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)
1092 https_request = compat_urllib_request.HTTPCookieProcessor.http_request
1093 https_response = http_response
1096 def extract_timezone(date_str):
1098 r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
1101 timezone = datetime.timedelta()
1103 date_str = date_str[:-len(m.group('tz'))]
1104 if not m.group('sign'):
1105 timezone = datetime.timedelta()
1107 sign = 1 if m.group('sign') == '+' else -1
1108 timezone = datetime.timedelta(
1109 hours=sign * int(m.group('hours')),
1110 minutes=sign * int(m.group('minutes')))
1111 return timezone, date_str
1114 def parse_iso8601(date_str, delimiter='T', timezone=None):
1115 """ Return a UNIX timestamp from the given date """
1117 if date_str is None:
1120 date_str = re.sub(r'\.[0-9]+', '', date_str)
1122 if timezone is None:
1123 timezone, date_str = extract_timezone(date_str)
1126 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
1127 dt = datetime.datetime.strptime(date_str, date_format) - timezone
1128 return calendar.timegm(dt.timetuple())
def date_formats(day_first=True):
    """Return the candidate date-format strings to try when parsing.

    Picks the day-first variants when *day_first* is true, otherwise the
    month-first variants.
    """
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1137 def unified_strdate(date_str, day_first=True):
1138 """Return a string with the date in the format YYYYMMDD"""
1140 if date_str is None:
1144 date_str = date_str.replace(',', ' ')
1145 # Remove AM/PM + timezone
1146 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1147 _, date_str = extract_timezone(date_str)
1149 for expression in date_formats(day_first):
1151 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
1154 if upload_date is None:
1155 timetuple = email.utils.parsedate_tz(date_str)
1158 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
1161 if upload_date is not None:
1162 return compat_str(upload_date)
1165 def unified_timestamp(date_str, day_first=True):
1166 if date_str is None:
1169 date_str = date_str.replace(',', ' ')
1171 pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
1172 timezone, date_str = extract_timezone(date_str)
1174 # Remove AM/PM + timezone
1175 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1177 for expression in date_formats(day_first):
1179 dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
1180 return calendar.timegm(dt.timetuple())
1183 timetuple = email.utils.parsedate_tz(date_str)
1185 return calendar.timegm(timetuple) + pm_delta * 3600
1188 def determine_ext(url, default_ext='unknown_video'):
1191 guess = url.partition('?')[0].rpartition('.')[2]
1192 if re.match(r'^[A-Za-z0-9]+$', guess):
1194 # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
1195 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
1196 return guess.rstrip('/')
def subtitles_filename(filename, sub_lang, sub_format):
    """Build a subtitle file name: media basename + '.<lang>.<format>'.

    Only the last extension of *filename* is stripped before appending
    the language and subtitle-format suffixes.
    """
    base = filename.rsplit('.', 1)[0]
    return '%s.%s.%s' % (base, sub_lang, sub_format)
1205 def date_from_str(date_str):
1207 Return a datetime object from a string in the format YYYYMMDD or
1208 (now|today)[+-][0-9](day|week|month|year)(s)?"""
1209 today = datetime.date.today()
1210 if date_str in ('now', 'today'):
1212 if date_str == 'yesterday':
1213 return today - datetime.timedelta(days=1)
1214 match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
1215 if match is not None:
1216 sign = match.group('sign')
1217 time = int(match.group('time'))
1220 unit = match.group('unit')
1221 # A bad approximation?
1225 elif unit == 'year':
1229 delta = datetime.timedelta(**{unit: time})
1230 return today + delta
1231 return datetime.datetime.strptime(date_str, '%Y%m%d').date()
1234 def hyphenate_date(date_str):
1236 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1237 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1238 if match is not None:
1239 return '-'.join(match.groups())
class DateRange(object):
    """Represents a time interval between two dates"""
    # NOTE(review): this excerpt elides several lines of the class
    # (else-branches of __init__, the day() classmethod header, __str__ header).

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        if start is not None:
            self.start = date_from_str(start)
            # Unbounded endpoints default to the min/max representable dates
            self.start = datetime.datetime.min.date()
            self.end = date_from_str(end)
            self.end = datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
def platform_name():
    """ Returns the platform name as a compat_str """
    res = platform.platform()
    if isinstance(res, bytes):
        # Python 2 may hand back a byte string; decode with the locale encoding
        res = res.decode(preferredencoding())

    assert isinstance(res, compat_str)
    # The computed name must actually be returned to callers
    return res
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070
    # NOTE(review): this excerpt elides several lines (try/return statements,
    # the WIN_OUTPUT_IDS table and the write loop header).
    import ctypes.wintypes

        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
    if fileno not in WIN_OUTPUT_IDS:

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b'GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b'GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # A handle is "not a console" when GetFileType/GetConsoleMode say so
        if handle == INVALID_HANDLE_VALUE or handle is None:
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):

    def next_nonbmp_pos(s):
            # WriteConsoleW cannot write astral-plane chars in one go
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:

        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            assert written.value > 0
            s = s[written.value:]
def write_string(s, out=None, encoding=None):
    # Write the unicode string s to stream out; several lines (default
    # stream, returns, the final plain-write branch) are elided here.
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):

    if ('b' in getattr(out, 'mode', '') or
            sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        byt = s.encode(encoding or preferredencoding(), 'ignore')
    elif hasattr(out, 'buffer'):
        # Text stream with an underlying binary buffer: encode explicitly
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        byt = s.encode(enc, 'ignore')
        out.buffer.write(byt)
def bytes_to_intlist(bs):
    """Convert a byte string into a list of integer byte values."""
    if not bs:
        # Guard the bs[0] probe below against empty input
        return []
    if isinstance(bs[0], int):  # Python 3
        return list(bs)
    else:
        # Python 2: indexing a byte string yields 1-char strings
        return [ord(c) for c in bs]
def intlist_to_bytes(xs):
    # Pack a list of 0-255 ints back into a byte string (empty-input guard
    # elided in this excerpt).
    return compat_struct_pack('%dB' % len(xs), *xs)
# Cross-platform file locking
# NOTE(review): this excerpt elides several lines (the OVERLAPPED _fields_
# header, list closers, and the try/else headers of the non-Windows branch).
if sys.platform == 'win32':
    import ctypes.wintypes

    class OVERLAPPED(ctypes.Structure):
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Lock the whole file: low/high DWORDs of the byte count
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # 0x2 = LOCKFILE_EXCLUSIVE_LOCK
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

    # Some platforms, such as Jython, is missing fcntl
        def _lock_file(f, exclusive):
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            fcntl.flock(f, fcntl.LOCK_UN)

        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            raise IOError(UNSUPPORTED_MSG)
class locked_file(object):
    # File wrapper that holds an advisory lock for the duration of the
    # context manager (some lines elided in this excerpt).
    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)

    def __enter__(self):
        # Writers take the exclusive lock, readers the shared one
        exclusive = self.mode != 'r'
        _lock_file(self.f, exclusive)

    def __exit__(self, etype, value, traceback):
        _unlock_file(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
def get_filesystem_encoding():
    """Return the filesystem encoding, falling back to UTF-8 when unknown."""
    enc = sys.getfilesystemencoding()
    if enc is None:
        return 'utf-8'
    return enc
def shell_quote(args):
    # Quote each argument for safe display as a shell command line
    # (accumulator/loop headers elided in this excerpt).
    encoding = get_filesystem_encoding()
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(pipes.quote(a))
    return ' '.join(quoted_args)
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # Merge with any data already smuggled into the URL
    url, idata = unsmuggle_url(url, {})
    sdata = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return url + '#' + sdata
def unsmuggle_url(smug_url, default=None):
    # Inverse of smuggle_url(): returns (url, data) or (url, default)
    # (final return elided in this excerpt).
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    data = json.loads(jsond)
def format_bytes(bytes):
    # Render a byte count human-readably, e.g. 1048576 -> '1.00MiB'
    # (None/zero guards elided in this excerpt).
    if type(bytes) is str:
        bytes = float(bytes)
    exponent = int(math.log(bytes, 1024.0))
    suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
def lookup_unit_table(unit_table, s):
    # Parse '<number> <unit>' from s using unit_table and return the value
    # in base units (the re.search call header is elided in this excerpt).
    units_re = '|'.join(re.escape(u) for u in unit_table)
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
    # Tolerate a comma as the decimal mark
    num_str = m.group('num').replace(',', '.')
    mult = unit_table[m.group('unit')]
    return int(float(num_str) * mult)
def parse_filesize(s):
    # Parse a human file size ('5 MiB', '2.3Gb') into bytes.
    # NOTE(review): most of the _UNIT_TABLE entries and the None guard are
    # elided in this excerpt.
    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
        'megabytes': 1000 ** 2,
        'mebibytes': 1024 ** 2,
        'gigabytes': 1000 ** 3,
        'gibibytes': 1024 ** 3,
        'terabytes': 1000 ** 4,
        'tebibytes': 1024 ** 4,
        'petabytes': 1000 ** 5,
        'pebibytes': 1024 ** 5,
        'exabytes': 1000 ** 6,
        'exbibytes': 1024 ** 6,
        'zettabytes': 1000 ** 7,
        'zebibytes': 1024 ** 7,
        'yottabytes': 1000 ** 8,
        'yobibytes': 1024 ** 8,

    return lookup_unit_table(_UNIT_TABLE, s)
    # NOTE(review): this fragment belongs to parse_count(); its def line and
    # abbreviation unit table ('k', 'm', ...) are elided in this excerpt.
    if re.match(r'^[\d,.]+$', s):
        # Plain number, possibly with thousands separators
        return str_to_int(s)

    return lookup_unit_table(_UNIT_TABLE, s)
def month_by_name(name, lang='en'):
    """ Return the number of a month by (locale-independently) English name """

    month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
    # 1-based month number; the surrounding try/except is elided here
        return month_names.index(name) + 1
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
    abbreviation """
    # 1-based; the surrounding try/except is elided in this excerpt
    return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    # Only bare ampersands are escaped; existing entities are left alone
    # (the re.sub call header and replacement are elided in this excerpt).
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
def setproctitle(title):
    # Rename the current process via prctl(PR_SET_NAME) on Linux
    # (try headers and returns elided in this excerpt).
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):

        libc = ctypes.cdll.LoadLibrary('libc.so.6')
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        title_bytes = title.encode('utf-8')
        buf = ctypes.create_string_buffer(len(title_bytes))
        buf.value = title_bytes
        # 15 == PR_SET_NAME
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
def remove_start(s, start):
    """Strip *start* from the beginning of *s* when present; None passes through."""
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
def remove_end(s, end):
    """Strip *end* from the end of *s* when present; None passes through.

    Guards against an empty *end*: the old one-liner used ``s[:-len(end)]``,
    and ``s[:-0]`` is ``s[:0]``, which wrongly wiped the whole string.
    """
    if s is not None and end and s.endswith(end):
        return s[:-len(end)]
    return s
def remove_quotes(s):
    # Strip one matching pair of surrounding quotes, if present
    # (return statements elided in this excerpt).
    if s is None or len(s) < 2:
    for quote in ('"', "'", ):
        if s[0] == quote and s[-1] == quote:
def url_basename(url):
    """Return the last path component of *url*, ignoring query and fragment."""
    parsed_path = compat_urlparse.urlparse(url).path
    components = parsed_path.strip('/').split('/')
    return components[-1]
    # NOTE(review): this line belongs to base_url(); its def line is elided
    # in this excerpt. It keeps everything up to and including the last '/'.
    return re.match(r'https?://[^?#&]+/', url).group()
def urljoin(base, path):
    # Join base and path into an absolute URL, tolerating bad inputs
    # (return statements of the guard branches elided in this excerpt).
    if not isinstance(path, compat_str) or not path:
    if re.match(r'^(?:https?:)?//', path):
    if not isinstance(base, compat_str) or not re.match(r'^(?:https?:)?//', base):
    return compat_urlparse.urljoin(base, path)
class HEADRequest(compat_urllib_request.Request):
    # Request subclass forcing the HEAD HTTP method (return elided here).
    def get_method(self):
class PUTRequest(compat_urllib_request.Request):
    # Request subclass forcing the PUT HTTP method (return elided here).
    def get_method(self):
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    # Coerce v to an int with optional scaling; the None/empty guards and
    # try/except wrapper are elided in this excerpt.
        v = getattr(v, get_attr, None)
    return int(v) * invscale // scale
def str_or_none(v, default=None):
    """Coerce *v* to compat_str, returning *default* when v is None."""
    if v is None:
        return default
    return compat_str(v)
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    # Strip thousands separators and plus signs before converting
    # (None guard and final int() conversion elided in this excerpt).
    int_str = re.sub(r'[,\.\+]', '', int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    # Coerce v to a float with optional scaling; the None guard and
    # try/except wrapper are elided in this excerpt.
    return float(v) * invscale / scale
def strip_or_none(v):
    """Whitespace-strip *v*, passing None through unchanged."""
    if v is None:
        return None
    return v.strip()
def parse_duration(s):
    # Parse a duration string ('1:23:45', '3 min 5 s', '90.5') into seconds.
    # NOTE(review): this excerpt elides several lines (regex branch headers,
    # the duration accumulator and the final return).
    if not isinstance(s, compat_basestring):

    days, hours, mins, secs, ms = [None] * 5
    m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', s)
        days, hours, mins, secs, ms = m.groups()
                (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
                (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
        days, hours, mins, secs, ms = m.groups()
        m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
            hours, mins = m.groups()
        duration += float(secs)
        duration += float(mins) * 60
        duration += float(hours) * 60 * 60
        duration += float(days) * 24 * 60 * 60
        duration += float(ms)
def prepend_extension(filename, ext, expected_real_ext=None):
    # Insert ext before the real extension, e.g. a.mp4 -> a.temp.mp4
    # (the 'return (' header is elided in this excerpt).
    name, real_ext = os.path.splitext(filename)
        '{0}.{1}{2}'.format(name, ext, real_ext)
        if not expected_real_ext or real_ext[1:] == expected_real_ext
        else '{0}.{1}'.format(filename, ext))
def replace_extension(filename, ext, expected_real_ext=None):
    # Swap the file extension, but only when the current one matches
    # expected_real_ext if that is given (closing arg elided in this excerpt).
    name, real_ext = os.path.splitext(filename)
    return '{0}.{1}'.format(
        name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    # The try/except OSError wrapper and returns are elided in this excerpt.
    subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    # The try/except OSError around Popen is elided in this excerpt.
    # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
    # SIGTTOU if youtube-dl is run in the background.
    # See https://github.com/rg3/youtube-dl/issues/955#issuecomment-209789656
    out, _ = subprocess.Popen(
        [encodeArgument(exe)] + args,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
def detect_exe_version(output, version_re=None, unrecognized='present'):
    # Extract a version number from an executable's --version output
    # (the match/unrecognized returns are elided in this excerpt).
    assert isinstance(output, compat_str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
class PagedList(object):
    # Abstract base for paginated result lists; the getslice() stub and the
    # __len__ def line are elided in this excerpt.
        # This is only useful for tests
        return len(self.getslice())
class OnDemandPagedList(PagedList):
    # Fetches pages lazily via pagefunc(pagenum); optionally caches pages.
    # NOTE(review): several lines (cache init, continue/break statements,
    # startv/endv assignments, the result accumulator) are elided here.
    def __init__(self, pagefunc, pagesize, use_cache=False):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache

    def getslice(self, start=0, end=None):
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:

            page_results = self._cache.get(pagenum)
            if page_results is None:
                page_results = list(self._pagefunc(pagenum))
                self._cache[pagenum] = page_results

                start % self._pagesize
                if firstid <= start < nextfirstid

                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
class InAdvancePagedList(PagedList):
    # Paged list where the total page count is known up front.
    # NOTE(review): some lines (result accumulator, end_page assignment
    # header, returns) are elided in this excerpt.
    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        start_page = start // self._pagesize
            self._pagecount if end is None else (end // self._pagesize + 1))
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
                page = page[skip_elems:]
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                    page = page[:only_more]
def uppercase_escape(s):
    # Decode \UXXXXXXXX escape sequences in s
    # (the re.sub call header and final argument are elided in this excerpt).
    unicode_escape = codecs.getdecoder('unicode_escape')
        r'\\U[0-9a-fA-F]{8}',
        lambda m: unicode_escape(m.group(0))[0],
def lowercase_escape(s):
    # Decode \uXXXX escape sequences in s
    # (the re.sub call header and final argument are elided in this excerpt).
    unicode_escape = codecs.getdecoder('unicode_escape')
        r'\\u[0-9a-fA-F]{4}',
        lambda m: unicode_escape(m.group(0))[0],
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # Python 2's quote() chokes on unicode input, so pre-encode it there
    needs_bytes = sys.version_info < (3, 0) and isinstance(s, compat_str)
    data = s.encode('utf-8') if needs_bytes else s
    return compat_urllib_parse.quote(data, b"%/;:@&=+$,!~*'()?#[]")
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    # Netloc is IDNA-encoded; all other components are percent-escaped
    # (the closing ').geturl()' is elided in this excerpt).
    url_parsed = compat_urllib_parse_urlparse(url)
    return url_parsed._replace(
        netloc=url_parsed.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(url_parsed.path),
        params=escape_rfc3986(url_parsed.params),
        query=escape_rfc3986(url_parsed.query),
        fragment=escape_rfc3986(url_parsed.fragment)
def read_batch_urls(batch_fd):
    # Read URLs from a batch file object, skipping comments and a UTF-8 BOM
    # (the inner fixup() def line and some returns are elided in this excerpt).
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = '\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        # Lines starting with '#', ';' or ']' are treated as comments
        if url.startswith(('#', ';', ']')):

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes."""
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
def update_url_query(url, query):
    # Merge *query* into the URL's existing query string
    # (the qs.update(query) line is elided in this excerpt).
    parsed_url = compat_urlparse.urlparse(url)
    qs = compat_parse_qs(parsed_url.query)
    return compat_urlparse.urlunparse(parsed_url._replace(
        query=compat_urllib_parse_urlencode(qs, True)))
def update_Request(req, url=None, data=None, headers={}, query={}):
    # Clone a urllib Request with updated url/data/headers/query, keeping the
    # original HTTP method (else: and the new_req constructor header are
    # elided in this excerpt).
    req_headers = req.headers.copy()
    req_headers.update(headers)
    req_data = data or req.data
    req_url = update_url_query(url or req.get_full_url(), query)
    req_get_method = req.get_method()
    if req_get_method == 'HEAD':
        req_type = HEADRequest
    elif req_get_method == 'PUT':
        req_type = PUTRequest
        req_type = compat_urllib_request.Request
        req_url, data=req_data, headers=req_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    # Fetch the first usable value among key_or_keys from d
    # (the continue/return of the loop body are elided in this excerpt).
    if isinstance(key_or_keys, (list, tuple)):
        for key in key_or_keys:
            if key not in d or d[key] is None or skip_false_values and not d[key]:
    return d.get(key_or_keys, default)
def try_get(src, getter, expected_type=None):
    # Apply getter(src), swallowing common lookup errors and filtering the
    # result by expected_type (try header and returns elided in this excerpt).
    except (AttributeError, KeyError, TypeError, IndexError):
    if expected_type is None or isinstance(v, expected_type):
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return *string* as compat_str, decoding byte strings with *encoding*."""
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
# Maps TV parental guideline labels to numeric age limits
# (entries elided in this excerpt)
TV_PARENTAL_GUIDELINES = {


def parse_age_limit(s):
    # Normalize an age limit (int, 'NN+', US rating, TV guideline) to an int.
    # NOTE(review): the int guard, match check and try/except are elided here.
        return s if 0 <= s <= 21 else None
    if not isinstance(s, compat_basestring):
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
        return int(m.group('age'))
        return US_RATINGS[s]
    return TV_PARENTAL_GUIDELINES.get(s)
def strip_jsonp(code):
    # Strip a JSONP callback wrapper, leaving the bare JSON payload
    # (the re.sub call header is elided in this excerpt).
        r'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
def js_to_json(code):
    # Convert JavaScript object notation to strict JSON.
    # NOTE(review): parts of this function (INTEGER_TABLE header, the
    # fix_kv() def line, escape map entries, returns) are elided here.
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*'
    SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
        (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
        (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),

        if v in ('true', 'false', 'null'):
        elif v.startswith('/*') or v.startswith('//') or v == ',':

        if v[0] in ("'", '"'):
            v = re.sub(r'(?s)\\.|"', lambda m: {
            }.get(m.group(0), m.group(0)), v[1:-1])

        for regex, base in INTEGER_TABLE:
            im = re.match(regex, v)
                i = int(im.group(1), base)
                # Object keys keep their trailing colon
                return '"%d":' % i if v.endswith(':') else '%d' % i

    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        {comment}|,(?={skip}[\]}}])|
        [a-zA-Z_][.a-zA-Z_0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
        '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    # The inner q(qid) def and its try/except are elided in this excerpt.
        return quality_ids.index(qid)
# Default output filename template: "<title>-<id>.<ext>"
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    # The None guard, ELLIPSES constant and length check are elided here.
    return s[:length - len(ELLIPSES)] + ELLIPSES
def version_tuple(v):
    """Split a dotted/dashed version string into a tuple of ints."""
    return tuple(map(int, re.split(r'[-.]', v)))
def is_outdated_version(version, limit, assume_new=True):
    # Compare dotted versions; fall back to `not assume_new` when parsing
    # fails (guard/try headers elided in this excerpt).
        return not assume_new
        return version_tuple(version) < version_tuple(limit)
        return not assume_new
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    running_from_zip = isinstance(globals().get('__loader__'), zipimporter)
    frozen_build = hasattr(sys, 'frozen')
    return running_from_zip or frozen_build
def args_to_str(args):
    # Get a short string representation for a subprocess command
    quoted = [compat_shlex_quote(a) for a in args]
    return ' '.join(quoted)
def error_to_compat_str(err):
    # Stringify an exception safely across Python versions
    # (the str(err) assignment and return are elided in this excerpt).
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    if sys.version_info[0] < 3:
        err_str = err_str.decode(preferredencoding())
def mimetype2ext(mt):
    # Map a MIME type to a file extension.
    # NOTE(review): the None guard, table headers/closers and the final
    # return are elided in this excerpt.
    # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
    # it's the most popular one
        'audio/mpeg': 'mp3',

    _, _, res = mt.rpartition('/')
    # Drop any MIME parameters, e.g. 'text/html; charset=utf-8'
    res = res.split(';')[0].strip().lower()

        'smptett+xml': 'tt',
        'x-mp4-fragmented': 'mp4',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
        'vnd.ms-sstr+xml': 'ism',
def parse_codecs(codecs_str):
    # Split an RFC 6381 codecs string into vcodec/acodec fields.
    # NOTE(review): the empty-input guard, codec assignments and fallback
    # branches are elided in this excerpt.
    # http://tools.ietf.org/html/rfc6381
    splited_codecs = list(filter(None, map(
        lambda str: str.strip(), codecs_str.strip().strip(',').split(','))))
    vcodec, acodec = None, None
    for full_codec in splited_codecs:
        codec = full_codec.split('.')[0]
        if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v'):
        elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3'):
            write_string('WARNING: Unknown codec %s' % full_codec, sys.stderr)
    if not vcodec and not acodec:
        if len(splited_codecs) == 2:
        elif len(splited_codecs) == 1:
        'vcodec': vcodec or 'none',
        'acodec': acodec or 'none',
def urlhandle_detect_ext(url_handle):
    # Guess the file extension from HTTP response headers: prefer the
    # Content-Disposition filename, fall back to the Content-Type
    # (branch headers elided in this excerpt).
    getheader = url_handle.headers.get

    cd = getheader('Content-Disposition')
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
            e = determine_ext(m.group('filename'), default_ext=None)

    return mimetype2ext(getheader('Content-Type'))
def encode_data_uri(data, mime_type):
    """Pack *data* (bytes) into a base64 data: URI with the given MIME type."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    # The return of the no-age-limit branch is elided in this excerpt.
    if age_limit is None:  # No limit set
    if content_limit is None:
        return False  # Content available for everyone
    return age_limit < content_limit
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    # The BOMS list header and the loop break/else are elided in this excerpt.
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    for bom, enc in BOMS:
        if bom is found, decode the rest with that encoding
        if first_bytes.startswith(bom):
            s = first_bytes[len(bom):].decode(enc, 'replace')
        s = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', s)
def determine_protocol(info_dict):
    # Derive the download protocol from the explicit field, then the URL
    # scheme/prefix, then the extension (returns elided in this excerpt).
    protocol = info_dict.get('protocol')
    if protocol is not None:

    url = info_dict['url']
    if url.startswith('rtmp'):
    elif url.startswith('mms'):
    elif url.startswith('rtsp'):

    ext = determine_ext(url)

    return compat_urllib_parse_urlparse(url).scheme
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    rows = [header_row] + data
    # Widest cell per column determines that column's padding
    widths = [max(len(compat_str(cell)) for cell in column) for column in zip(*rows)]
    # Left-align every column but the last, padded to width + 1
    fmt = ' '.join('%-' + compat_str(width + 1) + 's' for width in widths[:-1]) + '%s'
    return '\n'.join(fmt % tuple(row) for row in rows)
def _match_one(filter_part, dct):
    # Evaluate a single '<key><op><value>' or '<op><key>' filter expression
    # against dct. NOTE(review): the operator tables' entries, several if/try
    # headers and raise statements are elided in this excerpt.
    COMPARISON_OPERATORS = {
    operator_rex = re.compile(r'''(?x)\s*
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
            (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
            (?P<quote>["\'])(?P<quotedstrval>(?:\\.|(?!(?P=quote)|\\).)+?)(?P=quote)|
            (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
        op = COMPARISON_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        if (m.group('quotedstrval') is not None or
            m.group('strval') is not None or
            # If the original field is a string and matching comparisonvalue is
            # a number we should respect the origin of the original field
            # and process comparison value as a string (see
            # https://github.com/rg3/youtube-dl/issues/11082).
            actual_value is not None and m.group('intval') is not None and
                isinstance(actual_value, compat_str)):
            if m.group('op') not in ('=', '!='):
                    'Operator %s does not support string values!' % m.group('op'))
            comparison_value = m.group('quotedstrval') or m.group('strval') or m.group('intval')
            quote = m.group('quote')
            if quote is not None:
                # Un-escape quotes inside the quoted value
                comparison_value = comparison_value.replace(r'\%s' % quote, quote)
                comparison_value = int(m.group('intval'))
                # Values like '500KiB' fall back to filesize parsing
                comparison_value = parse_filesize(m.group('intval'))
                if comparison_value is None:
                    comparison_value = parse_filesize(m.group('intval') + 'B')
                if comparison_value is None:
                        'Invalid integer value %r in filter part %r' % (
                            m.group('intval'), filter_part))
        if actual_value is None:
            return m.group('none_inclusive')
        return op(actual_value, comparison_value)

        '': lambda v: v is not None,
        '!': lambda v: v is None,
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
        op = UNARY_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
    # Parts joined with '&' must all match (the all( header is elided here).
        _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
def match_filter_func(filter_str):
    # Build a --match-filter callback: None means keep the video, a string
    # is the skip reason (the None return and else branch are elided here).
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
def parse_dfxp_time_expr(time_expr):
    # Parse TTML time expressions: '12.345s' offsets or 'HH:MM:SS(.mmm)'
    # clock values (guard and match checks elided in this excerpt).
    mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
        return float(mobj.group('time_offset'))

    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
        return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
def srt_subtitles_timecode(seconds):
    """Format a duration in seconds as an SRT timecode: HH:MM:SS,mmm."""
    hours = int(seconds / 3600)
    minutes = int((seconds % 3600) / 60)
    secs = int(seconds % 60)
    millis = int((seconds % 1) * 1000)
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
def dfxp2srt(dfxp_data):
    # Convert DFXP/TTML subtitle markup to SRT text.
    # NOTE(review): parts of this function (parser __init__/end/close, the
    # out accumulator, timing fallbacks and the final return) are elided here.
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
        'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',

    class TTMLPElementParser(object):
        def start(self, tag, attrib):
            # <br> elements become newlines in the output text
            if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):

        def data(self, data):

            return self.out.strip()

    def parse_node(node):
        # Re-serialize the node and run it through the text-extracting parser
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall(_x('.//ttaf1_0604:p')) or dfxp.findall('.//p')
        raise ValueError('Invalid dfxp/TTML subtitle')

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
def cli_option(params, command_option, param):
    # Return [command_option, value] for params[param], or [] when unset
    # (the truthiness guard before compat_str is elided in this excerpt).
    param = params.get(param)
        param = compat_str(param)
    return [command_option, param] if param is not None else []
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    # Map a boolean param to its CLI representation, joined or as two args
    # (the separator guard is elided in this excerpt).
    param = params.get(param)
    assert isinstance(param, bool)
        return [command_option + separator + (true_value if param else false_value)]
    return [command_option, true_value if param else false_value]
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Emit [command_option] when params[param] equals expected_value, else []."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
def cli_configuration_args(params, param, default=[]):
    # Return the extra CLI args configured under *param*
    # (the None guard and final return are elided in this excerpt).
    ex_args = params.get(param)
    assert isinstance(ex_args, list)
class ISO639Utils(object):
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    # NOTE(review): the _lang_map table, the @classmethod decorators and the
    # return of long2short are elided in this excerpt.
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        return cls._lang_map.get(code[:2])

    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        for short_name, long_name in cls._lang_map.items():
            if long_name == code:
class ISO3166Utils(object):
    # Maps ISO 3166-1 alpha-2 codes to full country names.
    # NOTE(review): the _country_map header, many entries, the closing brace
    # and the @classmethod decorator are elided in this excerpt.
    # From http://data.okfn.org/data/core/country-list
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AS': 'American Samoa',
        'AG': 'Antigua and Barbuda',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BV': 'Bouvet Island',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BF': 'Burkina Faso',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CI': 'Côte d\'Ivoire',
        'CZ': 'Czech Republic',
        'DO': 'Dominican Republic',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GW': 'Guinea-Bissau',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'IR': 'Iran, Islamic Republic of',
        'IM': 'Isle of Man',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'LA': 'Lao People\'s Democratic Republic',
        'LI': 'Liechtenstein',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MH': 'Marshall Islands',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'PS': 'Palestine, State of',
        'PG': 'Papua New Guinea',
        'PH': 'Philippines',
        'PR': 'Puerto Rico',
        'RU': 'Russian Federation',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SL': 'Sierra Leone',
        'SX': 'Sint Maarten (Dutch part)',
        'SB': 'Solomon Islands',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'SJ': 'Svalbard and Jan Mayen',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TZ': 'Tanzania, United Republic of',
        'TL': 'Timor-Leste',
        'TT': 'Trinidad and Tobago',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',

    def short2full(cls, code):
        """Convert an ISO 3166-2 country code to the corresponding full name"""
        return cls._country_map.get(code.upper())
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler variant that supports a per-request 'Ytdl-request-proxy'
    header, a '__noproxy__' sentinel (use no proxy at all), and SOCKS proxy
    URLs, which are forwarded to the http/https handlers via the
    'Ytdl-socks-proxy' header instead of being opened here.
    """
    def __init__(self, proxies=None):
        # Set default handlers
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    # Bind type/meth as default args so each lambda captures the
                    # current loop value (avoids the late-binding-closure pitfall).
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        # Per-request proxy override; the internal header must not leak out
        # onto the wire, so it is removed here.
        # NOTE(review): the header value presumably replaces `proxy` — confirm
        # against the full implementation.
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # youtube-dl's http/https handlers take care of wrapping the
            # socket with SOCKS themselves
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''
    # The bytes are reversed first, i.e. the message integer is read in
    # little-endian order, then encrypted as message^exponent mod modulus.
    message = int(binascii.hexlify(data[::-1]), 16)
    ciphertext = pow(message, exponent, modulus)
    return format(ciphertext, 'x')
def encode_base_n(num, n, table=None):
    """Encode the non-negative integer num in base n.

    table supplies the digit characters; when omitted (or empty), the first
    n characters of the standard 0-9a-zA-Z alphabet are used.  Raises
    ValueError when the requested base exceeds the table length.
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    # Collect digits least-significant first, then reverse once at the end.
    digits = []
    while num:
        num, remainder = divmod(num, n)
        digits.append(table[remainder])
    return ''.join(reversed(digits))
def decode_packed_codes(code):
    """Decode JavaScript obfuscated with the common 'packed' encoder.

    The packed payload carries the obfuscated source, a numeric base, a
    symbol count and a '|'-separated word list; every base-N token in the
    source is replaced by the corresponding word.
    """
    mobj = re.search(PACKED_CODES_RE, code)
    obfucasted_code, base, count, symbols = mobj.groups()
    base = int(base)
    count = int(count)
    symbols = symbols.split('|')

    # Build token -> replacement map; an empty word list entry means the
    # token stands for itself.
    symbol_table = {}
    for idx in range(count - 1, -1, -1):
        token = encode_base_n(idx, base)
        symbol_table[token] = symbols[idx] or token

    return re.sub(
        r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
        obfucasted_code)
def parse_m3u8_attributes(attrib):
    """Parse an M3U8 attribute list (KEY=VALUE,KEY="VALUE",...) into a dict.

    Quoted values have their surrounding double quotes stripped; commas
    inside quoted values are preserved.
    """
    def _unquote(v):
        # Strip the surrounding double quotes of a quoted attribute value.
        return v[1:-1] if v.startswith('"') else v

    return dict(
        (key, _unquote(val))
        for key, val in re.findall(
            r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib))
def urshift(val, n):
    """Unsigned 32-bit right shift: Python equivalent of JavaScript's val >>> n."""
    # Negative values are first mapped to their unsigned 32-bit representation.
    if val < 0:
        val += 0x100000000
    return val >> n
# Based on png2str() written by @gdkchan and improved by @yokrysty
# Originally posted at https://github.com/rg3/youtube-dl/issues/9706
def decode_png(png_data):
    """Decode a PNG byte string into (width, height, pixels).

    pixels is a list of rows; each row is a flat list of decoded channel
    bytes.  Raises IOError when the data is not a valid PNG or carries no
    image data.
    """
    # Reference: https://www.w3.org/TR/PNG/
    header = png_data[8:]  # strip the fixed 8-byte PNG signature

    # A valid file starts with the magic signature followed by an IHDR chunk.
    if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
        raise IOError('Not a valid PNG file.')

    # Big-endian unsigned integer readers, keyed by byte width.
    int_map = {1: '>B', 2: '>H', 4: '>I'}
    unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]

        # Chunk layout: 4-byte length, 4-byte type, payload, 4-byte CRC.
        length = unpack_integer(header[:4])

        chunk_type = header[:4]

        chunk_data = header[:length]
        header = header[length:]

        header = header[4:]  # Skip CRC

    # The first chunk (IHDR, validated above) holds the image dimensions.
    ihdr = chunks[0]['data']

    width = unpack_integer(ihdr[:4])
    height = unpack_integer(ihdr[4:8])

    # Concatenate all IDAT payloads: together they form one zlib stream.
    for chunk in chunks:
        if chunk['type'] == b'IDAT':
            idat += chunk['data']

        raise IOError('Unable to read PNG data.')

    decompressed_data = bytearray(zlib.decompress(idat))

    # Fetch an already-decoded byte by flat (row-major) index.
    def _get_pixel(idx):

    # Each scanline is prefixed by one filter-type byte, hence the
    # (1 + stride) row pitch in the decompressed stream.
    for y in range(height):
        basePos = y * (1 + stride)
        filter_type = decompressed_data[basePos]

        pixels.append(current_row)

        for x in range(stride):
            color = decompressed_data[1 + basePos + x]
            basex = y * stride + x
            # Neighbouring bytes used by the filters; the -3 offset implies
            # 3 bytes per pixel (RGB) — confirm against the stride setup.
                left = _get_pixel(basex - 3)
                up = _get_pixel(basex - stride)

            # Undo the per-scanline PNG filter (types 1-4; 0 = None).
            if filter_type == 1:  # Sub
                color = (color + left) & 0xff
            elif filter_type == 2:  # Up
                color = (color + up) & 0xff
            elif filter_type == 3:  # Average
                color = (color + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                c = _get_pixel(basex - stride - 3)

                # Pick whichever of left/up/upper-left predicts best.
                if pa <= pb and pa <= pc:
                    color = (color + a) & 0xff
                    color = (color + b) & 0xff
                    color = (color + c) & 0xff

            current_row.append(color)

    return width, height, pixels
def write_xattr(path, key, value):
    """Write the extended attribute `key` = `value` (bytes) on file `path`.

    Falls back through several implementations: the pyxattr/xattr Python
    modules, NTFS Alternate Data Streams on Windows, and finally the
    setfattr/xattr command-line tools.  Raises XAttrMetadataError when a
    write fails and XAttrUnavailableError when no usable implementation
    exists.
    """
    # This mess below finds the best xattr tool for the job
        # try the pyxattr module...

        if hasattr(xattr, 'set'):  # pyxattr
            # Unicode arguments are not supported in python-pyxattr until
            # See https://github.com/rg3/youtube-dl/issues/5498
            pyxattr_required_version = '0.5.0'
            if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
                # TODO: fallback to CLI tools
                raise XAttrUnavailableError(
                    'python-pyxattr is detected but is too old. '
                    'youtube-dl requires %s or above while your version is %s. '
                    'Falling back to other xattr implementations' % (
                        pyxattr_required_version, xattr.__version__))

            setxattr = xattr.set
            # The plain 'xattr' module exposes setxattr instead of set.
            setxattr = xattr.setxattr

            # Normalize OS errors into the project's own exception type.
            setxattr(path, key, value)
        except EnvironmentError as e:
            raise XAttrMetadataError(e.errno, e.strerror)

        if compat_os_name == 'nt':
            # Write xattrs to NTFS Alternate Data Streams:
            # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
            assert ':' not in key
            assert os.path.exists(path)

            # An ADS is addressed as '<path>:<stream name>'.
            ads_fn = path + ':' + key
                with open(ads_fn, 'wb') as f:
            except EnvironmentError as e:
                raise XAttrMetadataError(e.errno, e.strerror)
            # CLI fallback: prefer setfattr (GNU attr), then the xattr binary.
            user_has_setfattr = check_executable('setfattr', ['--version'])
            user_has_xattr = check_executable('xattr', ['-h'])

            if user_has_setfattr or user_has_xattr:
                # The CLI tools take the value as text, hence the decode.
                value = value.decode('utf-8')
                if user_has_setfattr:
                    executable = 'setfattr'
                    opts = ['-n', key, '-v', value]
                elif user_has_xattr:
                    executable = 'xattr'
                    opts = ['-w', key, value]

                cmd = ([encodeFilename(executable, True)] +
                       [encodeArgument(o) for o in opts] +
                       [encodeFilename(path, True)])

                    p = subprocess.Popen(
                        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
                except EnvironmentError as e:
                    raise XAttrMetadataError(e.errno, e.strerror)
                stdout, stderr = p.communicate()
                stderr = stderr.decode('utf-8', 'replace')
                if p.returncode != 0:
                    raise XAttrMetadataError(p.returncode, stderr)

                # On Unix, and can't find pyxattr, setfattr, or xattr.
                if sys.platform.startswith('linux'):
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'pyxattr' or 'xattr' "
                        "modules, or the GNU 'attr' package "
                        "(which contains the 'setfattr' tool).")
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'xattr' module, "
                        "or the 'xattr' binary.")