4 from __future__ import unicode_literals
33 import xml.etree.ElementTree
40 compat_etree_fromstring,
42 compat_html_entities_html5,
48 compat_socket_create_connection,
54 compat_urllib_parse_urlencode,
55 compat_urllib_parse_urlparse,
56 compat_urllib_parse_unquote_plus,
57 compat_urllib_request,
def register_socks_protocols():
    """Teach urlparse that socks* URL schemes carry a netloc.

    In Python < 2.6.5, urlsplit() suffers from bug
    https://bugs.python.org/issue7904: URLs with protocols not listed in
    urlparse.uses_netloc are not handled correctly.
    """
    known = compat_urlparse.uses_netloc
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in known:
            known.append(scheme)
# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))

# NOTE(review): the `std_headers = {` opener appears elided in this copy;
# these are the default HTTP headers added to every request.
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/47.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',

# English month names, used as the base entry of the per-language month map.
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# NOTE(review): the opener of the month-name mapping (and the 'fr': [ key)
# appears elided before these entries.
    'en': ENGLISH_MONTH_NAMES,
    'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
    'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],

# NOTE(review): opener of the KNOWN_EXTENSIONS tuple appears elided; these are
# media file extensions recognised by determine_ext().
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))

# NOTE(review): the `DATE_FORMATS = (` opener appears elided; strptime
# patterns tried by the unified date parsers below.
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%b %d %Y at %H:%M:%S',

# Day-first (European) and month-first (US) variants extend the common set.
# NOTE(review): the extend([...]) argument lists appear elided in this copy.
DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([

# Matches the argument list of a P.A.C.K.E.R.-packed JavaScript blob.
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        # Verify the reported codec actually works; some platforms lie.
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        path_basename = lambda f: os.path.basename(fn).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
        # NOTE(review): the `else:` introducing the Python-3/win32 defaults
        # appears elided before the next two lines.
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    # NOTE(review): the `args = {` opener (NamedTemporaryFile kwargs) appears
    # elided before these entries; the temp file is created next to fn so the
    # final os.rename stays on one filesystem (atomic).
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
    # NOTE(review): the mode/encoding kwargs and the try/except around the
    # write appear elided in this copy.
    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
    if sys.platform == 'win32':
        # Need to remove existing file on Windows, else os.rename raises
        # WindowsError or FileExistsError.
        os.rename(tf.name, fn)
# Python >= 2.7 supports the [@attr='val'] predicate natively; the fallback
# below scans all candidates by hand.
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] """
        # key is interpolated into the expression, so restrict its alphabet.
        assert re.match(r'^[a-zA-Z_-]+$', key)
        expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
        return node.find(expr)
    # NOTE(review): the `else:` separating the two implementations appears
    # elided before the next definition.
    def find_xpath_attr(node, xpath, key, val=None):
        # Manual scan for old Pythons without predicate support.
        for f in node.findall(compat_xpath(xpath)):
            if key not in f.attrib:
                # NOTE(review): `continue` body elided here.
            if val is None or f.attrib.get(key) == val:
                # NOTE(review): `return f` and the trailing `return None`
                # appear elided in this copy.
# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter

def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps of an XPath into '{uri}tag' form.

    path: an XPath whose steps may be namespace-prefixed ('media:song').
    ns_map: mapping of prefix -> namespace URI; a missing prefix raises
    KeyError. Steps without a prefix are kept unchanged.
    """
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            # No namespace prefix on this step.
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    # Find a child element; on failure return `default` if given, raise if
    # `fatal`, else return None (that tail appears partly elided below).
    def _find_xpath(xpath):
        return node.find(compat_xpath(xpath))

    # NOTE(review): the branch handling a list/tuple of candidate xpaths
    # appears elided around this isinstance check.
    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)

    # NOTE(review): `if n is None:` guard and `return default` / `elif fatal:`
    # lines appear elided around the following.
    if default is not NO_DEFAULT:
    name = xpath if name is None else name
    raise ExtractorError('Could not find XML element %s' % name)
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    # Like xpath_element() but returns the element's text content.
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        # NOTE(review): `return n` body elided; `if n.text is None:` guard
        # also appears elided before the next line.
    if default is not NO_DEFAULT:
        # NOTE(review): `return default` / `elif fatal:` lines elided.
    name = xpath if name is None else name
    raise ExtractorError('Could not find XML element\'s text %s' % name)
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    # Fetch attribute `key` of the element matching xpath; same
    # default/fatal semantics as xpath_element().
    n = find_xpath_attr(node, xpath, key)
    # NOTE(review): the `if n is None:` guard and the success-path
    # `return n.attrib[key]` appear elided around the following lines.
    if default is not NO_DEFAULT:
        # NOTE(review): `return default` / `elif fatal:` lines elided.
    name = '%s[@%s]' % (xpath, key) if name is None else name
    raise ExtractorError('Could not find XML attribute %s' % name)
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # Thin convenience wrapper over the generic attribute search.
    # (The parameter name `id` shadows the builtin but is part of the
    # public keyword interface, so it is kept.)
    return get_element_by_attribute('id', id, html)
def get_element_by_class(class_name, html):
    """Return the content of the first tag carrying the given CSS class."""
    # Match the class anywhere inside a space-separated class attribute;
    # \b anchors keep 'foo' from matching 'foobar'.
    pattern = r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name)
    return get_element_by_attribute('class', pattern, html, escape_value=False)
def get_element_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of the tag with the specified attribute in the passed HTML document"""

    # value may already be a regex when escape_value is False (see
    # get_element_by_class).
    value = re.escape(value) if escape_value else value

    # NOTE(review): several lines of this verbose regex (opening tag, the
    # attribute=value alternation and the (?P<content>...) capture) appear
    # elided in this copy.
    m = re.search(r'''(?xs)
        (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
        (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
        ''' % (re.escape(attribute), value), html)

    # NOTE(review): `if not m: return None` guard appears elided here.
    res = m.group('content')

    # Strip symmetric surrounding quotes from the captured value.
    if res.startswith('"') or res.startswith("'"):
        # NOTE(review): `res = res[1:-1]` body elided.
    return unescapeHTML(res)
class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""
    # NOTE(review): the `def __init__(self):` line and `self.attrs = {}`
    # initialiser appear elided before the base-class call.
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        # Record the attributes of the first start tag fed to the parser.
        self.attrs = dict(attrs)
def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
    a="foo" B="bar" c="&98;az" d=boz
    empty= noval entity="&amp;"
    sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
    'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
    'empty': '', 'noval': None, 'entity': '&',
    'sq': '"', 'dq': '\''
    }.
    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    parser = HTMLAttributeParser()
    parser.feed(html_element)
    # NOTE(review): `parser.close()` and `return parser.attrs` appear elided
    # in this copy.
def clean_html(html):
    """Clean an HTML snippet into a readable plain-text string.

    None is passed through unchanged (convenience for optional
    descriptions). <br> and </p><p> become newlines, all other tags are
    stripped, HTML entities are decoded, and the result is stripped of
    surrounding whitespace.
    """
    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Newline vs <br />
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    # NOTE(review): the `try:` and the `if filename == '-':` stdout special
    # case appear elided before the next lines.
    if sys.platform == 'win32':
        # Binary mode prevents CRLF mangling when piping to stdout.
        msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
    stream = open(encodeFilename(filename), open_mode)
    return (stream, filename)
    except (IOError, OSError) as err:
        # Permission errors cannot be fixed by renaming; re-raise.
        if err.errno in (errno.EACCES,):
            # NOTE(review): `raise` body elided here.

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            # NOTE(review): `raise` body elided here.
            # NOTE(review): `else:` introducing the retry appears elided.
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp.

    Returns None when the string cannot be parsed.
    """
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        # Transliterate accented characters in restricted mode.
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            # NOTE(review): `return ''` body and the `elif char == '"':`
            # branch header appear elided before the next line.
            return '' if restricted else '\''
            # NOTE(review): `elif char == ':':` branch header elided.
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
            # NOTE(review): `return '_'` body elided here.
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            # NOTE(review): `return '_'` body elided here.
        if restricted and ord(char) > 127:
            # NOTE(review): `return '_'` and the final `return char` elided.

    # Keep timestamps like 12:34 readable: 12_34 rather than two fields.
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(map(replace_insane, s))
    # NOTE(review): the `if not is_id:` guard around the cleanup below
    # appears elided in this copy.
    while '__' in result:
        result = result.replace('__', '_')
    result = result.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        # NOTE(review): `result = result[2:]` body elided.
    if result.startswith('-'):
        result = '_' + result[len('-'):]
    result = result.lstrip('.')
    # NOTE(review): the `if not result:` fallback to '_' and the final
    # `return result` appear elided in this copy.
def sanitize_path(s):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform != 'win32':
        # NOTE(review): `return s` body elided here.
    drive_or_unc, _ = os.path.splitdrive(s)
    if sys.version_info < (2, 7) and not drive_or_unc:
        # splitunc() is the pre-2.7 way to detect UNC prefixes.
        drive_or_unc, _ = os.path.splitunc(s)
    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    # NOTE(review): `if drive_or_unc: norm_path.pop(0)` and the
    # `sanitized_path = [` opener appear elided before the next lines.
        # Replace win32-forbidden characters, and a trailing dot/space,
        # keeping '.'/'..' path components intact.
        path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    # Re-attach the drive / UNC prefix that was split off above.
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    return os.path.join(*sanitized_path)
# Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
# unwanted failures due to missing protocol
def sanitize_url(url):
    """Give protocol-relative URLs ('//host/...') an explicit http: scheme."""
    if url.startswith('//'):
        return 'http:%s' % url
    return url
def sanitized_Request(url, *args, **kwargs):
    # Build a urllib Request after normalising protocol-relative URLs;
    # extra args/kwargs are passed through to Request unchanged.
    return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs)
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable, preserving the
    order of first occurrence. Returns a list. """
    res = []
    for item in iterable:
        # O(n) membership test keeps this usable for unhashable items.
        if item not in res:
            res.append(item)
    return res
def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in compat_html_entities_html5:
        return compat_html_entities_html5[entity_with_semicolon]

    # Numeric character reference: decimal (#160) or hex (#xA0).
    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    # NOTE(review): `if mobj is not None:` guard appears elided here.
    numstr = mobj.group(1)
    if numstr.startswith('x'):
        # NOTE(review): `base = 16` assignment elided; '0x...' form for int().
        numstr = '0%s' % numstr
        # NOTE(review): `else: base = 10` branch elided.
    # Handle out-of-range codepoints defensively
    # See https://github.com/rg3/youtube-dl/issues/7518
    # NOTE(review): the surrounding try/except appears elided here.
    return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity
    # NOTE(review): the `def unescapeHTML(s):` header and its
    # `if s is None: return None` guard appear elided before these lines;
    # this is the body of unescapeHTML.
    assert type(s) == compat_str

    # NOTE(review): the `return re.sub(` opener appears elided before this
    # continuation line.
        r'&([^;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
def get_subprocess_encoding():
    # Pick the encoding used when talking to subprocesses.
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
        # NOTE(review): the `else:` branch header appears elided before the
        # next line (filesystem encoding on non-Windows platforms).
        encoding = sys.getfilesystemencoding()
    # NOTE(review): the `if encoding is None: encoding = 'utf-8'` fallback
    # and the final `return encoding` appear elided in this copy.
def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    @param for_subprocess Whether the result is passed to a subprocess
    (uses the subprocess encoding instead of the filesystem one).
    """
    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        # NOTE(review): `return s` body elided here.

    # Pass '' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # NOTE(review): `return s` body elided here.

    # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
    if sys.platform.startswith('java'):
        # NOTE(review): `return s` body elided here.

    return s.encode(get_subprocess_encoding(), 'ignore')
def decodeFilename(b, for_subprocess=False):
    # Inverse of encodeFilename(): turn an OS-level byte string back into text.
    if sys.version_info >= (3, 0):
        # NOTE(review): `return b` body elided here (Python 3 paths are text).
    if not isinstance(b, bytes):
        # NOTE(review): `return b` body elided here.
    return b.decode(get_subprocess_encoding(), 'ignore')
def encodeArgument(s):
    # Encode a command-line argument for a subprocess call.
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    # for_subprocess=True selects the subprocess (locale) encoding.
    return encodeFilename(s, True)
def decodeArgument(b):
    # Decode subprocess output/arguments using the subprocess encoding.
    return decodeFilename(b, True)
def decodeOption(optval):
    # Normalise a command-line option value to text.
    # NOTE(review): the `if optval is None: return optval` guard appears
    # elided before the next line.
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    # NOTE(review): the trailing `return optval` appears elided in this copy.
def formatSeconds(secs):
    """Format a duration in seconds as 'H:MM:SS', 'M:SS' or 'S'.

    The hour/minute fields only appear once the duration exceeds them.
    """
    if secs > 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs > 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
def make_HTTPS_handler(params, **kwargs):
    # Build an HTTPS handler honouring the 'nocheckcertificate' option,
    # across the several ssl APIs available in supported Pythons.
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        # NOTE(review): the try/except around the next return (falling
        # through when HTTPSHandler lacks context support) appears elided.
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        # (create_default_context present but HTTPSHandler has no context=)

    if sys.version_info < (3, 2):
        # Old Pythons: no SSLContext support in the handler at all.
        return YoutubeDLHTTPSHandler(params, **kwargs)
    # NOTE(review): the `else:` branch header appears elided before the
    # manual SSLContext construction below.
    context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
    context.verify_mode = (ssl.CERT_NONE
                           if opts_no_check_certificate
                           else ssl.CERT_REQUIRED)
    context.set_default_verify_paths()
    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
def bug_reports_message():
    # Standard blurb appended to unexpected-error messages.
    if ytdl_is_updateable():
        update_cmd = 'type  youtube-dl -U  to update'
        # NOTE(review): the `else:` branch header appears elided before the
        # next line.
        update_cmd = 'see  https://yt-dl.org/update  on how to update'
    msg = '; please report this issue on https://yt-dl.org/bug .'
    msg += ' Make sure you are using the latest version; %s.' % update_cmd
    msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
    # NOTE(review): the trailing `return msg` appears elided in this copy.
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """
        # Network-style failures are implicitly "expected": no bug-report blurb.
        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            # NOTE(review): `expected = True` body elided here.
        if video_id is not None:
            msg = video_id + ': ' + msg
        # NOTE(review): `if cause:` guard appears elided before the next line.
            msg += ' (caused by %r)' % cause
        # NOTE(review): `if not expected:` guard appears elided before the
        # next line.
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)
        # NOTE(review): assignments of self.traceback / self.expected /
        # self.cause appear elided here.
        self.exc_info = sys.exc_info()  # preserve original exception
        self.video_id = video_id

    def format_traceback(self):
        # self.traceback is presumably set from `tb` in __init__ (elided
        # above) — confirm against upstream.
        if self.traceback is None:
            # NOTE(review): `return None` body elided here.
        return ''.join(traceback.format_tb(self.traceback))
class UnsupportedError(ExtractorError):
    # Raised when no extractor claims the URL; expected=True suppresses the
    # bug-report blurb.
    def __init__(self, url):
        super(UnsupportedError, self).__init__(
            'Unsupported URL: %s' % url, expected=True)
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        # NOTE(review): the `self.msg = msg` body appears elided in this copy.
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # NOTE(review): the super().__init__ call appears elided here.
        # Both values are byte counts.
        self.downloaded = downloaded
        self.expected = expected
class XAttrMetadataError(Exception):
    """Raised when writing extended file attributes fails.

    self.reason classifies the failure: 'NO_SPACE' (disk full / quota),
    'VALUE_TOO_LONG' (attribute value exceeds the OS limit) or
    'NOT_SUPPORTED' (anything else, e.g. the filesystem lacks xattrs).
    """

    def __init__(self, code=None, msg='Unknown error'):
        super(XAttrMetadataError, self).__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        # Bug fix: the message emitted by the kernel/xattr tools is
        # 'Disk quota exceeded' — the previous 'excedded' spelling never matched.
        if (self.code in (errno.ENOSPC, errno.EDQUOT) or
                'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'
799 class XAttrUnavailableError(Exception):
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    # Factory used via functools.partial by the HTTP(S) handlers; applies
    # strict-mode and source-address workarounds before returning the
    # connection object.
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/rg3/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs[b'strict'] = True
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')
    if source_address is not None:
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        # NOTE(review): the `else:` Python-2.6 fallback branch header appears
        # elided before the custom connect below.
            def _hc_connect(self, *args, **kwargs):
                sock = compat_socket_create_connection(
                    (self.host, self.port), self.timeout, sa)
                # NOTE(review): `if is_https:` guard appears elided here.
                self.sock = ssl.wrap_socket(
                    sock, self.key_file, self.cert_file,
                    ssl_version=ssl.PROTOCOL_TLSv1)
                # NOTE(review): `else: self.sock = sock` branch elided.
            hc.connect = functools.partial(_hc_connect, hc)

    # NOTE(review): the trailing `return hc` appears elided in this copy.
def handle_youtubedl_headers(headers):
    """Strip youtube-dl's internal control headers before the real request.

    When the 'Youtubedl-no-compression' marker is present, remove any
    Accept-Encoding header (case-insensitively) together with the marker
    itself; otherwise the mapping is returned untouched.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers

    # dict(generator) rather than a dict comprehension: the file still
    # targets Python 2.6.
    filtered = dict((name, value) for name, value in headers.items()
                    if name.lower() != 'accept-encoding')
    del filtered['Youtubedl-no-compression']
    return filtered
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        # YoutubeDL params dict; _create_http_connection reads
        # 'source_address' from it.
        self._params = params

    def http_open(self, req):
        conn_class = compat_http_client.HTTPConnection

        # Per-request SOCKS proxy, signalled via an internal marker header.
        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        # NOTE(review): the `if socks_proxy:` guard appears elided here.
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
        # NOTE(review): the trailing `req)` argument appears elided here.

    # NOTE(review): the `@staticmethod def deflate(data):` wrapper and its
    # try/except appear elided around the next two lines: raw deflate
    # streams lack the zlib header, so decompression is retried without it.
            return zlib.decompress(data, -zlib.MAX_WBITS)
            return zlib.decompress(data)

    # NOTE(review): presumably a @staticmethod upstream — decorator elided.
    def addinfourl_wrapper(stream, headers, url, code):
        # Older urllibs lack the `code` constructor argument.
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        # NOTE(review): `ret.code = code` and `return ret` appear elided.

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                # NOTE(review): `req.add_header(h, v)` body appears elided.

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        # NOTE(review): the trailing `return req` appears elided here.

    def http_response(self, req, resp):
        # NOTE(review): `old_resp = resp` assignment appears elided here.
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            # NOTE(review): the `try:` opener appears elided before the next
            # line.
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk add the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    # NOTE(review): the inner try:/break/except appears
                    # elided around the next two lines.
                    gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                    uncompressed = io.BytesIO(gz.read())
                # NOTE(review): the `else:` re-raise branch header appears
                # elided before the next line.
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/rg3/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            # NOTE(review): `if location:` guard appears elided here.
            # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
            if sys.version_info >= (3, 0):
                location = location.encode('iso-8859-1').decode('utf-8')
                # NOTE(review): `else:` branch header appears elided.
                location = location.decode('utf-8')
            location_escaped = escape_url(location)
            if location != location_escaped:
                del resp.headers['Location']
                if sys.version_info < (3, 0):
                    location_escaped = location_escaped.encode('utf-8')
                resp.headers['Location'] = location_escaped
        # NOTE(review): the trailing `return resp` appears elided here.

    https_request = http_request
    https_response = http_response
def make_socks_conn_class(base_class, socks_proxy):
    # Derive a connection class that tunnels through the given socks:// URL.
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        # NOTE(review): `if not s: return s` guard appears elided here.
        return compat_urllib_parse_unquote_plus(s)

    # NOTE(review): the `proxy_args = (socks_type,` opener appears elided
    # before these arguments (host, port defaulting to 1080, credentials).
        url_components.hostname, url_components.port or 1080,
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),

    class SocksConnection(base_class):
        # NOTE(review): the `def connect(self):` header appears elided
        # before this body.
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                # NOTE(review): `else:` branch header appears elided.
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    # HTTPS counterpart of YoutubeDLHandler: injects the (optionally
    # no-verify) SSL context and supports per-request SOCKS proxies.
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        # NOTE(review): `kwargs = {}` initialiser appears elided here.
        conn_class = self._https_conn_class

        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        # NOTE(review): `if socks_proxy:` guard appears elided here.
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, True),
        # NOTE(review): the trailing `req, **kwargs)` arguments appear elided.
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    # Cookie processor with the (currently disabled) Set-Cookie
    # percent-encoding workaround for Python 2.
    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    # HTTPS requests/responses get identical treatment.
    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
def extract_timezone(date_str):
    """Split a trailing timezone designator off a date string.

    Returns (timezone, date_str) where timezone is a datetime.timedelta
    (zero when no/UTC designator is present) and date_str has the
    designator removed.
    """
    m = re.search(
        r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)
    if not m:
        timezone = datetime.timedelta()
    else:
        date_str = date_str[:-len(m.group('tz'))]
        if not m.group('sign'):
            # A bare 'Z' designator means UTC.
            timezone = datetime.timedelta()
        else:
            sign = 1 if m.group('sign') == '+' else -1
            timezone = datetime.timedelta(
                hours=sign * int(m.group('hours')),
                minutes=sign * int(m.group('minutes')))
    return timezone, date_str
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date, or None on failure.

    delimiter separates date and time parts ('T' by default); timezone, a
    datetime.timedelta, overrides autodetection of a trailing designator.
    """

    if date_str is None:
        return None

    # Drop fractional seconds; %S does not accept them.
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    try:
        date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
        dt = datetime.datetime.strptime(date_str, date_format) - timezone
        return calendar.timegm(dt.timetuple())
    except ValueError:
        # Unparsable: mirror the None-input behaviour.
        return None
def date_formats(day_first=True):
    """Return the strptime pattern list for day-first (European) or
    month-first (US) date parsing."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""

    if date_str is None:
        # NOTE(review): `return None` body and `upload_date = None`
        # initialiser appear elided here.

    # Replace commas
    date_str = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
    _, date_str = extract_timezone(date_str)

    for expression in date_formats(day_first):
        # NOTE(review): the try:/except ValueError: pass wrapper appears
        # elided around the next line.
        upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
    if upload_date is None:
        # Fall back to RFC 2822 parsing.
        timetuple = email.utils.parsedate_tz(date_str)
        # NOTE(review): `if timetuple:` guard and its try/except appear
        # elided around the next line.
        upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    if upload_date is not None:
        return compat_str(upload_date)
def unified_timestamp(date_str, day_first=True):
    # Parse many human date formats into a UNIX timestamp (or None).
    if date_str is None:
        # NOTE(review): `return None` body elided here.

    date_str = date_str.replace(',', ' ')

    # 12-hour clock: add 12 hours later if a PM marker is present.
    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    for expression in date_formats(day_first):
        # NOTE(review): the try:/except ValueError: pass wrapper appears
        # elided around the next two lines.
        dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
        return calendar.timegm(dt.timetuple())
    # RFC 2822 fallback.
    timetuple = email.utils.parsedate_tz(date_str)
    # NOTE(review): `if timetuple:` guard appears elided before the return.
        return calendar.timegm(timetuple) + pm_delta * 3600
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from a URL.

    The text after the last '.' of the path (query string ignored) is
    returned when it looks like an extension; known extensions are also
    recognised with a trailing slash. Otherwise default_ext is returned.
    """
    if url is None:
        return default_ext
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    elif guess.rstrip('/') in KNOWN_EXTENSIONS:
        return guess.rstrip('/')
    else:
        return default_ext
def subtitles_filename(filename, sub_lang, sub_format):
    """Derive the subtitle file name: base name + language + format,
    e.g. 'video.mp4' -> 'video.en.vtt'."""
    base = filename.rsplit('.', 1)[0]
    return '%s.%s.%s' % (base, sub_lang, sub_format)
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        # NOTE(review): `return today` body elided here.
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    # Relative offsets such as 'now-2weeks'.
    match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        # NOTE(review): `if sign == '-': time = -time` appears elided here.
        unit = match.group('unit')
        # A bad approximation?
        # NOTE(review): the month->days*30 conversion branch appears elided
        # before the next line.
        elif unit == 'year':
            # NOTE(review): the year->days*365 conversion body and the
            # `unit += 's'` pluralisation appear elided here.
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, '%Y%m%d').date()
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format.

    Strings not matching that format are returned unchanged.
    """
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is not None:
        return '-'.join(match.groups())
    else:
        return date_str
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        if start is not None:
            self.start = date_from_str(start)
            # NOTE(review): `else:` branch header appears elided before the
            # open-ended minimum default.
            self.start = datetime.datetime.min.date()
        # NOTE(review): `if end is not None:` guard appears elided here.
            self.end = date_from_str(end)
            # NOTE(review): `else:` branch header appears elided.
            self.end = datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    # NOTE(review): the `@classmethod def day(cls, day):` header appears
    # elided before this docstring.
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            # Accept strings in any date_from_str()-supported format.
            date = date_from_str(date)
        return self.start <= date <= self.end

    # NOTE(review): the `def __str__(self):` header appears elided before
    # the next line.
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
def platform_name():
    """ Returns the platform name as a compat_str """
    res = platform.platform()
    if isinstance(res, bytes):
        # Python 2 may return a byte string; normalise to text.
        res = res.decode(preferredencoding())

    assert isinstance(res, compat_str)
    # Bug fix: the trailing return was missing, so the function yielded None.
    return res
1252 def _windows_write_string(s, out):
1253 """ Returns True if the string was written using special methods,
1254 False if it has yet to be written out."""
1255 # Adapted from http://stackoverflow.com/a/3259271/35070
1258 import ctypes.wintypes
1266 fileno = out.fileno()
1267 except AttributeError:
1268 # If the output stream doesn't have a fileno, it's virtual
1270 except io.UnsupportedOperation:
1271 # Some strange Windows pseudo files?
1273 if fileno not in WIN_OUTPUT_IDS:
1276 GetStdHandle = ctypes.WINFUNCTYPE(
1277 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
1278 (b'GetStdHandle', ctypes.windll.kernel32))
1279 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
1281 WriteConsoleW = ctypes.WINFUNCTYPE(
1282 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
1283 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
1284 ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
1285 written = ctypes.wintypes.DWORD(0)
1287 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
1288 FILE_TYPE_CHAR = 0x0002
1289 FILE_TYPE_REMOTE = 0x8000
1290 GetConsoleMode = ctypes.WINFUNCTYPE(
1291 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
1292 ctypes.POINTER(ctypes.wintypes.DWORD))(
1293 (b'GetConsoleMode', ctypes.windll.kernel32))
1294 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
1296 def not_a_console(handle):
1297 if handle == INVALID_HANDLE_VALUE or handle is None:
1299 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
1300 GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
1302 if not_a_console(h):
1305 def next_nonbmp_pos(s):
1307 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
1308 except StopIteration:
1312 count = min(next_nonbmp_pos(s), 1024)
1314 ret = WriteConsoleW(
1315 h, s, count if count else 2, ctypes.byref(written), None)
1317 raise OSError('Failed to write string')
1318 if not count: # We just wrote a non-BMP character
1319 assert written.value == 2
1322 assert written.value > 0
1323 s = s[written.value:]
1327 def write_string(s, out=None, encoding=None):
1330 assert type(s) == compat_str
1332 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
1333 if _windows_write_string(s, out):
1336 if ('b' in getattr(out, 'mode', '') or
1337 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
1338 byt = s.encode(encoding or preferredencoding(), 'ignore')
1340 elif hasattr(out, 'buffer'):
1341 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1342 byt = s.encode(enc, 'ignore')
1343 out.buffer.write(byt)
1349 def bytes_to_intlist(bs):
1352 if isinstance(bs[0], int): # Python 3
1355 return [ord(c) for c in bs]
1358 def intlist_to_bytes(xs):
1361 return compat_struct_pack('%dB' % len(xs), *xs)
1364 # Cross-platform file locking
1365 if sys.platform == 'win32':
1366 import ctypes.wintypes
1369 class OVERLAPPED(ctypes.Structure):
1371 ('Internal', ctypes.wintypes.LPVOID),
1372 ('InternalHigh', ctypes.wintypes.LPVOID),
1373 ('Offset', ctypes.wintypes.DWORD),
1374 ('OffsetHigh', ctypes.wintypes.DWORD),
1375 ('hEvent', ctypes.wintypes.HANDLE),
1378 kernel32 = ctypes.windll.kernel32
1379 LockFileEx = kernel32.LockFileEx
1380 LockFileEx.argtypes = [
1381 ctypes.wintypes.HANDLE, # hFile
1382 ctypes.wintypes.DWORD, # dwFlags
1383 ctypes.wintypes.DWORD, # dwReserved
1384 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1385 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1386 ctypes.POINTER(OVERLAPPED) # Overlapped
1388 LockFileEx.restype = ctypes.wintypes.BOOL
1389 UnlockFileEx = kernel32.UnlockFileEx
1390 UnlockFileEx.argtypes = [
1391 ctypes.wintypes.HANDLE, # hFile
1392 ctypes.wintypes.DWORD, # dwReserved
1393 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1394 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1395 ctypes.POINTER(OVERLAPPED) # Overlapped
1397 UnlockFileEx.restype = ctypes.wintypes.BOOL
1398 whole_low = 0xffffffff
1399 whole_high = 0x7fffffff
1401 def _lock_file(f, exclusive):
1402 overlapped = OVERLAPPED()
1403 overlapped.Offset = 0
1404 overlapped.OffsetHigh = 0
1405 overlapped.hEvent = 0
1406 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1407 handle = msvcrt.get_osfhandle(f.fileno())
1408 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
1409 whole_low, whole_high, f._lock_file_overlapped_p):
1410 raise OSError('Locking file failed: %r' % ctypes.FormatError())
1412 def _unlock_file(f):
1413 assert f._lock_file_overlapped_p
1414 handle = msvcrt.get_osfhandle(f.fileno())
1415 if not UnlockFileEx(handle, 0,
1416 whole_low, whole_high, f._lock_file_overlapped_p):
1417 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1420 # Some platforms, such as Jython, are missing fcntl
1424 def _lock_file(f, exclusive):
1425 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1427 def _unlock_file(f):
1428 fcntl.flock(f, fcntl.LOCK_UN)
1430 UNSUPPORTED_MSG = 'file locking is not supported on this platform'
1432 def _lock_file(f, exclusive):
1433 raise IOError(UNSUPPORTED_MSG)
1435 def _unlock_file(f):
1436 raise IOError(UNSUPPORTED_MSG)
1439 class locked_file(object):
1440 def __init__(self, filename, mode, encoding=None):
1441 assert mode in ['r', 'a', 'w']
1442 self.f = io.open(filename, mode, encoding=encoding)
1445 def __enter__(self):
1446 exclusive = self.mode != 'r'
1448 _lock_file(self.f, exclusive)
1454 def __exit__(self, etype, value, traceback):
1456 _unlock_file(self.f)
1463 def write(self, *args):
1464 return self.f.write(*args)
1466 def read(self, *args):
1467 return self.f.read(*args)
def get_filesystem_encoding():
    """Return the filesystem encoding, falling back to 'utf-8' when unknown."""
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        return 'utf-8'
    return encoding
1475 def shell_quote(args):
1477 encoding = get_filesystem_encoding()
1479 if isinstance(a, bytes):
1480 # We may get a filename encoded with 'encodeFilename'
1481 a = a.decode(encoding)
1482 quoted_args.append(pipes.quote(a))
1483 return ' '.join(quoted_args)
1486 def smuggle_url(url, data):
1487 """ Pass additional data in a URL for internal use. """
1489 url, idata = unsmuggle_url(url, {})
1491 sdata = compat_urllib_parse_urlencode(
1492 {'__youtubedl_smuggle': json.dumps(data)})
1493 return url + '#' + sdata
1496 def unsmuggle_url(smug_url, default=None):
1497 if '#__youtubedl_smuggle' not in smug_url:
1498 return smug_url, default
1499 url, _, sdata = smug_url.rpartition('#')
1500 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
1501 data = json.loads(jsond)
1505 def format_bytes(bytes):
1508 if type(bytes) is str:
1509 bytes = float(bytes)
1513 exponent = int(math.log(bytes, 1024.0))
1514 suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
1515 converted = float(bytes) / float(1024 ** exponent)
1516 return '%.2f%s' % (converted, suffix)
1519 def lookup_unit_table(unit_table, s):
1520 units_re = '|'.join(re.escape(u) for u in unit_table)
1522 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
1525 num_str = m.group('num').replace(',', '.')
1526 mult = unit_table[m.group('unit')]
1527 return int(float(num_str) * mult)
1530 def parse_filesize(s):
1534 # The lower-case forms are of course incorrect and unofficial,
1535 # but we support those too
1552 'megabytes': 1000 ** 2,
1553 'mebibytes': 1024 ** 2,
1559 'gigabytes': 1000 ** 3,
1560 'gibibytes': 1024 ** 3,
1566 'terabytes': 1000 ** 4,
1567 'tebibytes': 1024 ** 4,
1573 'petabytes': 1000 ** 5,
1574 'pebibytes': 1024 ** 5,
1580 'exabytes': 1000 ** 6,
1581 'exbibytes': 1024 ** 6,
1587 'zettabytes': 1000 ** 7,
1588 'zebibytes': 1024 ** 7,
1594 'yottabytes': 1000 ** 8,
1595 'yobibytes': 1024 ** 8,
1598 return lookup_unit_table(_UNIT_TABLE, s)
1607 if re.match(r'^[\d,.]+$', s):
1608 return str_to_int(s)
1619 return lookup_unit_table(_UNIT_TABLE, s)
1622 def month_by_name(name, lang='en'):
1623 """ Return the number of a month by (locale-independently) English name """
1625 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
1628 return month_names.index(name) + 1
1633 def month_by_abbreviation(abbrev):
1634 """ Return the number of a month by (locale-independently) English
1638 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
1643 def fix_xml_ampersands(xml_str):
1644 """Replace all the '&' by '&' in XML"""
1646 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1651 def setproctitle(title):
1652 assert isinstance(title, compat_str)
1654 # ctypes in Jython is not complete
1655 # http://bugs.jython.org/issue2148
1656 if sys.platform.startswith('java'):
1660 libc = ctypes.cdll.LoadLibrary('libc.so.6')
1663 title_bytes = title.encode('utf-8')
1664 buf = ctypes.create_string_buffer(len(title_bytes))
1665 buf.value = title_bytes
1667 libc.prctl(15, buf, 0, 0, 0)
1668 except AttributeError:
1669 return # Strange libc, just skip this
def remove_start(s, start):
    """Drop the prefix *start* from *s* when present; None passes through."""
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
def remove_end(s, end):
    """Drop the suffix *end* from *s* when present; None passes through."""
    if s is None or not s.endswith(end):
        return s
    return s[:-len(end)]
1680 def remove_quotes(s):
1681 if s is None or len(s) < 2:
1683 for quote in ('"', "'", ):
1684 if s[0] == quote and s[-1] == quote:
def url_basename(url):
    """Return the last path segment of *url* ('' when the path is empty)."""
    path = compat_urlparse.urlparse(url).path
    return path.strip('/').rpartition('/')[2]
1694 class HEADRequest(compat_urllib_request.Request):
1695 def get_method(self):
1699 class PUTRequest(compat_urllib_request.Request):
1700 def get_method(self):
1704 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
1707 v = getattr(v, get_attr, None)
1713 return int(v) * invscale // scale
def str_or_none(v, default=None):
    """Coerce *v* to compat_str, returning *default* when *v* is None."""
    if v is None:
        return default
    return compat_str(v)
1722 def str_to_int(int_str):
1723 """ A more relaxed version of int_or_none """
1726 int_str = re.sub(r'[,\.\+]', '', int_str)
1730 def float_or_none(v, scale=1, invscale=1, default=None):
1734 return float(v) * invscale / scale
def strip_or_none(v):
    """Whitespace-strip *v*, letting None pass through untouched."""
    if v is None:
        return None
    return v.strip()
1743 def parse_duration(s):
1744 if not isinstance(s, compat_basestring):
1749 days, hours, mins, secs, ms = [None] * 5
1750 m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?$', s)
1752 days, hours, mins, secs, ms = m.groups()
1757 (?P<days>[0-9]+)\s*d(?:ays?)?\s*
1760 (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
1763 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
1766 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
1769 days, hours, mins, secs, ms = m.groups()
1771 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)$', s)
1773 hours, mins = m.groups()
1779 duration += float(secs)
1781 duration += float(mins) * 60
1783 duration += float(hours) * 60 * 60
1785 duration += float(days) * 24 * 60 * 60
1787 duration += float(ms)
1791 def prepend_extension(filename, ext, expected_real_ext=None):
1792 name, real_ext = os.path.splitext(filename)
1794 '{0}.{1}{2}'.format(name, ext, real_ext)
1795 if not expected_real_ext or real_ext[1:] == expected_real_ext
1796 else '{0}.{1}'.format(filename, ext))
1799 def replace_extension(filename, ext, expected_real_ext=None):
1800 name, real_ext = os.path.splitext(filename)
1801 return '{0}.{1}'.format(
1802 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
1806 def check_executable(exe, args=[]):
1807 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1808 args can be a list of arguments for a short output (like -version) """
1810 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1816 def get_exe_version(exe, args=['--version'],
1817 version_re=None, unrecognized='present'):
1818 """ Returns the version of the specified executable,
1819 or False if the executable is not present """
1821 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
1822 # SIGTTOU if youtube-dl is run in the background.
1823 # See https://github.com/rg3/youtube-dl/issues/955#issuecomment-209789656
1824 out, _ = subprocess.Popen(
1825 [encodeArgument(exe)] + args,
1826 stdin=subprocess.PIPE,
1827 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
1830 if isinstance(out, bytes): # Python 2.x
1831 out = out.decode('ascii', 'ignore')
1832 return detect_exe_version(out, version_re, unrecognized)
1835 def detect_exe_version(output, version_re=None, unrecognized='present'):
1836 assert isinstance(output, compat_str)
1837 if version_re is None:
1838 version_re = r'version\s+([-0-9._a-zA-Z]+)'
1839 m = re.search(version_re, output)
1846 class PagedList(object):
1848 # This is only useful for tests
1849 return len(self.getslice())
1852 class OnDemandPagedList(PagedList):
1853 def __init__(self, pagefunc, pagesize, use_cache=False):
1854 self._pagefunc = pagefunc
1855 self._pagesize = pagesize
1856 self._use_cache = use_cache
1860 def getslice(self, start=0, end=None):
1862 for pagenum in itertools.count(start // self._pagesize):
1863 firstid = pagenum * self._pagesize
1864 nextfirstid = pagenum * self._pagesize + self._pagesize
1865 if start >= nextfirstid:
1870 page_results = self._cache.get(pagenum)
1871 if page_results is None:
1872 page_results = list(self._pagefunc(pagenum))
1874 self._cache[pagenum] = page_results
1877 start % self._pagesize
1878 if firstid <= start < nextfirstid
1882 ((end - 1) % self._pagesize) + 1
1883 if (end is not None and firstid <= end <= nextfirstid)
1886 if startv != 0 or endv is not None:
1887 page_results = page_results[startv:endv]
1888 res.extend(page_results)
1890 # A little optimization - if current page is not "full", ie. does
1891 # not contain page_size videos then we can assume that this page
1892 # is the last one - there are no more ids on further pages -
1893 # i.e. no need to query again.
1894 if len(page_results) + startv < self._pagesize:
1897 # If we got the whole page, but the next page is not interesting,
1898 # break out early as well
1899 if end == nextfirstid:
1904 class InAdvancePagedList(PagedList):
1905 def __init__(self, pagefunc, pagecount, pagesize):
1906 self._pagefunc = pagefunc
1907 self._pagecount = pagecount
1908 self._pagesize = pagesize
1910 def getslice(self, start=0, end=None):
1912 start_page = start // self._pagesize
1914 self._pagecount if end is None else (end // self._pagesize + 1))
1915 skip_elems = start - start_page * self._pagesize
1916 only_more = None if end is None else end - start
1917 for pagenum in range(start_page, end_page):
1918 page = list(self._pagefunc(pagenum))
1920 page = page[skip_elems:]
1922 if only_more is not None:
1923 if len(page) < only_more:
1924 only_more -= len(page)
1926 page = page[:only_more]
1933 def uppercase_escape(s):
1934 unicode_escape = codecs.getdecoder('unicode_escape')
1936 r'\\U[0-9a-fA-F]{8}',
1937 lambda m: unicode_escape(m.group(0))[0],
1941 def lowercase_escape(s):
1942 unicode_escape = codecs.getdecoder('unicode_escape')
1944 r'\\u[0-9a-fA-F]{4}',
1945 lambda m: unicode_escape(m.group(0))[0],
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # Python 2's quote() cannot handle unicode input, so pre-encode as UTF-8.
    if isinstance(s, compat_str) and sys.version_info < (3, 0):
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
1956 def escape_url(url):
1957 """Escape URL as suggested by RFC 3986"""
1958 url_parsed = compat_urllib_parse_urlparse(url)
1959 return url_parsed._replace(
1960 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
1961 path=escape_rfc3986(url_parsed.path),
1962 params=escape_rfc3986(url_parsed.params),
1963 query=escape_rfc3986(url_parsed.query),
1964 fragment=escape_rfc3986(url_parsed.fragment)
1968 def read_batch_urls(batch_fd):
1970 if not isinstance(url, compat_str):
1971 url = url.decode('utf-8', 'replace')
1972 BOM_UTF8 = '\xef\xbb\xbf'
1973 if url.startswith(BOM_UTF8):
1974 url = url[len(BOM_UTF8):]
1976 if url.startswith(('#', ';', ']')):
1980 with contextlib.closing(batch_fd) as fd:
1981 return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes."""
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
1988 def update_url_query(url, query):
1991 parsed_url = compat_urlparse.urlparse(url)
1992 qs = compat_parse_qs(parsed_url.query)
1994 return compat_urlparse.urlunparse(parsed_url._replace(
1995 query=compat_urllib_parse_urlencode(qs, True)))
1998 def update_Request(req, url=None, data=None, headers={}, query={}):
1999 req_headers = req.headers.copy()
2000 req_headers.update(headers)
2001 req_data = data or req.data
2002 req_url = update_url_query(url or req.get_full_url(), query)
2003 req_get_method = req.get_method()
2004 if req_get_method == 'HEAD':
2005 req_type = HEADRequest
2006 elif req_get_method == 'PUT':
2007 req_type = PUTRequest
2009 req_type = compat_urllib_request.Request
2011 req_url, data=req_data, headers=req_headers,
2012 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
2013 if hasattr(req, 'timeout'):
2014 new_req.timeout = req.timeout
2018 def dict_get(d, key_or_keys, default=None, skip_false_values=True):
2019 if isinstance(key_or_keys, (list, tuple)):
2020 for key in key_or_keys:
2021 if key not in d or d[key] is None or skip_false_values and not d[key]:
2025 return d.get(key_or_keys, default)
2028 def try_get(src, getter, expected_type=None):
2031 except (AttributeError, KeyError, TypeError, IndexError):
2034 if expected_type is None or isinstance(v, expected_type):
2038 def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
2039 return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
2051 TV_PARENTAL_GUIDELINES = {
2061 def parse_age_limit(s):
2063 return s if 0 <= s <= 21 else None
2064 if not isinstance(s, compat_basestring):
2066 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
2068 return int(m.group('age'))
2070 return US_RATINGS[s]
2071 return TV_PARENTAL_GUIDELINES.get(s)
2074 def strip_jsonp(code):
2076 r'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
2079 def js_to_json(code):
2082 if v in ('true', 'false', 'null'):
2084 elif v.startswith('/*') or v == ',':
2087 if v[0] in ("'", '"'):
2088 v = re.sub(r'(?s)\\.|"', lambda m: {
2093 }.get(m.group(0), m.group(0)), v[1:-1])
2096 (r'^(0[xX][0-9a-fA-F]+)\s*:?$', 16),
2097 (r'^(0+[0-7]+)\s*:?$', 8),
2100 for regex, base in INTEGER_TABLE:
2101 im = re.match(regex, v)
2103 i = int(im.group(1), base)
2104 return '"%d":' % i if v.endswith(':') else '%d' % i
2108 return re.sub(r'''(?sx)
2109 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
2110 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
2111 /\*.*?\*/|,(?=\s*[\]}])|
2112 [a-zA-Z_][.a-zA-Z_0-9]*|
2113 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:\s*:)?|
2118 def qualities(quality_ids):
2119 """ Get a numeric quality value out of a list of possible values """
2122 return quality_ids.index(qid)
2128 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
2131 def limit_length(s, length):
2132 """ Add ellipses to overly long strings """
2137 return s[:length - len(ELLIPSES)] + ELLIPSES
def version_tuple(v):
    """Split a version string on '.' and '-' into a tuple of ints."""
    parts = re.split(r'[-.]', v)
    return tuple(map(int, parts))
2145 def is_outdated_version(version, limit, assume_new=True):
2147 return not assume_new
2149 return version_tuple(version) < version_tuple(limit)
2151 return not assume_new
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # Updatable when running from a zip bundle or a frozen (py2exe-style) build.
    if hasattr(sys, 'frozen'):
        return True
    return isinstance(globals().get('__loader__'), zipimporter)
def args_to_str(args):
    """Build a short, shell-quoted one-line representation of a command."""
    return ' '.join(map(compat_shlex_quote, args))
2166 def error_to_compat_str(err):
2168 # On python 2 error byte string must be decoded with proper
2169 # encoding rather than ascii
2170 if sys.version_info[0] < 3:
2171 err_str = err_str.decode(preferredencoding())
2175 def mimetype2ext(mt):
2181 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
2182 # it's the most popular one
2183 'audio/mpeg': 'mp3',
2188 _, _, res = mt.rpartition('/')
2189 res = res.split(';')[0].strip().lower()
2193 'smptett+xml': 'tt',
2199 'x-mp4-fragmented': 'mp4',
2202 'x-mpegurl': 'm3u8',
2203 'vnd.apple.mpegurl': 'm3u8',
2208 'vnd.ms-sstr+xml': 'ism',
2213 def parse_codecs(codecs_str):
2214 # http://tools.ietf.org/html/rfc6381
2217 splited_codecs = list(filter(None, map(
2218 lambda str: str.strip(), codecs_str.strip().strip(',').split(','))))
2219 vcodec, acodec = None, None
2220 for full_codec in splited_codecs:
2221 codec = full_codec.split('.')[0]
2222 if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v'):
2225 elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3'):
2229 write_string('WARNING: Unknown codec %s' % full_codec, sys.stderr)
2230 if not vcodec and not acodec:
2231 if len(splited_codecs) == 2:
2236 elif len(splited_codecs) == 1:
2243 'vcodec': vcodec or 'none',
2244 'acodec': acodec or 'none',
2249 def urlhandle_detect_ext(url_handle):
2250 getheader = url_handle.headers.get
2252 cd = getheader('Content-Disposition')
2254 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
2256 e = determine_ext(m.group('filename'), default_ext=None)
2260 return mimetype2ext(getheader('Content-Type'))
def encode_data_uri(data, mime_type):
    """Pack *data* (bytes) into a base64-encoded data: URI of *mime_type*."""
    encoded = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, encoded)
2267 def age_restricted(content_limit, age_limit):
2268 """ Returns True iff the content should be blocked """
2270 if age_limit is None: # No limit set
2272 if content_limit is None:
2273 return False # Content available for everyone
2274 return age_limit < content_limit
2277 def is_html(first_bytes):
2278 """ Detect whether a file contains HTML by examining its first bytes. """
2281 (b'\xef\xbb\xbf', 'utf-8'),
2282 (b'\x00\x00\xfe\xff', 'utf-32-be'),
2283 (b'\xff\xfe\x00\x00', 'utf-32-le'),
2284 (b'\xff\xfe', 'utf-16-le'),
2285 (b'\xfe\xff', 'utf-16-be'),
2287 for bom, enc in BOMS:
2288 if first_bytes.startswith(bom):
2289 s = first_bytes[len(bom):].decode(enc, 'replace')
2292 s = first_bytes.decode('utf-8', 'replace')
2294 return re.match(r'^\s*<', s)
2297 def determine_protocol(info_dict):
2298 protocol = info_dict.get('protocol')
2299 if protocol is not None:
2302 url = info_dict['url']
2303 if url.startswith('rtmp'):
2305 elif url.startswith('mms'):
2307 elif url.startswith('rtsp'):
2310 ext = determine_ext(url)
2316 return compat_urllib_parse_urlparse(url).scheme
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    rows = [header_row] + data
    # Widest cell per column decides that column's field width.
    widths = []
    for column in zip(*rows):
        widths.append(max(len(compat_str(cell)) for cell in column))
    # Left-pad every column but the last; the last column is unpadded.
    pieces = ['%-' + compat_str(w + 1) + 's' for w in widths[:-1]]
    line_format = ' '.join(pieces) + '%s'
    return '\n'.join(line_format % tuple(row) for row in rows)
2327 def _match_one(filter_part, dct):
2328 COMPARISON_OPERATORS = {
2336 operator_rex = re.compile(r'''(?x)\s*
2338 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
2340 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
2341 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
2344 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
2345 m = operator_rex.search(filter_part)
2347 op = COMPARISON_OPERATORS[m.group('op')]
2348 actual_value = dct.get(m.group('key'))
2349 if (m.group('strval') is not None or
2350 # If the original field is a string and matching comparisonvalue is
2351 # a number we should respect the origin of the original field
2352 # and process comparison value as a string (see
2353 # https://github.com/rg3/youtube-dl/issues/11082).
2354 actual_value is not None and m.group('intval') is not None and
2355 isinstance(actual_value, compat_str)):
2356 if m.group('op') not in ('=', '!='):
2358 'Operator %s does not support string values!' % m.group('op'))
2359 comparison_value = m.group('strval') or m.group('intval')
2362 comparison_value = int(m.group('intval'))
2364 comparison_value = parse_filesize(m.group('intval'))
2365 if comparison_value is None:
2366 comparison_value = parse_filesize(m.group('intval') + 'B')
2367 if comparison_value is None:
2369 'Invalid integer value %r in filter part %r' % (
2370 m.group('intval'), filter_part))
2371 if actual_value is None:
2372 return m.group('none_inclusive')
2373 return op(actual_value, comparison_value)
2376 '': lambda v: v is not None,
2377 '!': lambda v: v is None,
2379 operator_rex = re.compile(r'''(?x)\s*
2380 (?P<op>%s)\s*(?P<key>[a-z_]+)
2382 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
2383 m = operator_rex.search(filter_part)
2385 op = UNARY_OPERATORS[m.group('op')]
2386 actual_value = dct.get(m.group('key'))
2387 return op(actual_value)
2389 raise ValueError('Invalid filter part %r' % filter_part)
2392 def match_str(filter_str, dct):
2393 """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
2396 _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
2399 def match_filter_func(filter_str):
2400 def _match_func(info_dict):
2401 if match_str(filter_str, info_dict):
2404 video_title = info_dict.get('title', info_dict.get('id', 'video'))
2405 return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
2409 def parse_dfxp_time_expr(time_expr):
2413 mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
2415 return float(mobj.group('time_offset'))
2417 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
2419 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
def srt_subtitles_timecode(seconds):
    """Format a duration in seconds as an SRT 'HH:MM:SS,mmm' timecode."""
    hours = int(seconds / 3600)
    minutes = int((seconds % 3600) / 60)
    secs = int(seconds % 60)
    millis = int((seconds % 1) * 1000)
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
2426 def dfxp2srt(dfxp_data):
2427 _x = functools.partial(xpath_with_ns, ns_map={
2428 'ttml': 'http://www.w3.org/ns/ttml',
2429 'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
2430 'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',
2433 class TTMLPElementParser(object):
2436 def start(self, tag, attrib):
2437 if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
2443 def data(self, data):
2447 return self.out.strip()
2449 def parse_node(node):
2450 target = TTMLPElementParser()
2451 parser = xml.etree.ElementTree.XMLParser(target=target)
2452 parser.feed(xml.etree.ElementTree.tostring(node))
2453 return parser.close()
2455 dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
2457 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall(_x('.//ttaf1_0604:p')) or dfxp.findall('.//p')
2460 raise ValueError('Invalid dfxp/TTML subtitle')
2462 for para, index in zip(paras, itertools.count(1)):
2463 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
2464 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
2465 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
2466 if begin_time is None:
2471 end_time = begin_time + dur
2472 out.append('%d\n%s --> %s\n%s\n\n' % (
2474 srt_subtitles_timecode(begin_time),
2475 srt_subtitles_timecode(end_time),
2481 def cli_option(params, command_option, param):
2482 param = params.get(param)
2484 param = compat_str(param)
2485 return [command_option, param] if param is not None else []
2488 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
2489 param = params.get(param)
2490 assert isinstance(param, bool)
2492 return [command_option + separator + (true_value if param else false_value)]
2493 return [command_option, true_value if param else false_value]
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return [command_option] when params[param] equals expected_value, else []."""
    value = params.get(param)
    if value == expected_value:
        return [command_option]
    return []
2501 def cli_configuration_args(params, param, default=[]):
2502 ex_args = params.get(param)
2505 assert isinstance(ex_args, list)
2509 class ISO639Utils(object):
2510 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
def short2long(cls, code):
    """Convert language code from ISO 639-1 to ISO 639-2/T"""
    key = code[:2]
    return cls._lang_map.get(key)
2704 def long2short(cls, code):
2705 """Convert language code from ISO 639-2/T to ISO 639-1"""
2706 for short_name, long_name in cls._lang_map.items():
2707 if long_name == code:
2711 class ISO3166Utils(object):
2712 # From http://data.okfn.org/data/core/country-list
2714 'AF': 'Afghanistan',
2715 'AX': 'Åland Islands',
2718 'AS': 'American Samoa',
2723 'AG': 'Antigua and Barbuda',
2740 'BO': 'Bolivia, Plurinational State of',
2741 'BQ': 'Bonaire, Sint Eustatius and Saba',
2742 'BA': 'Bosnia and Herzegovina',
2744 'BV': 'Bouvet Island',
2746 'IO': 'British Indian Ocean Territory',
2747 'BN': 'Brunei Darussalam',
2749 'BF': 'Burkina Faso',
2755 'KY': 'Cayman Islands',
2756 'CF': 'Central African Republic',
2760 'CX': 'Christmas Island',
2761 'CC': 'Cocos (Keeling) Islands',
2765 'CD': 'Congo, the Democratic Republic of the',
2766 'CK': 'Cook Islands',
2768 'CI': 'Côte d\'Ivoire',
2773 'CZ': 'Czech Republic',
2777 'DO': 'Dominican Republic',
2780 'SV': 'El Salvador',
2781 'GQ': 'Equatorial Guinea',
2785 'FK': 'Falkland Islands (Malvinas)',
2786 'FO': 'Faroe Islands',
2790 'GF': 'French Guiana',
2791 'PF': 'French Polynesia',
2792 'TF': 'French Southern Territories',
2807 'GW': 'Guinea-Bissau',
2810 'HM': 'Heard Island and McDonald Islands',
2811 'VA': 'Holy See (Vatican City State)',
2818 'IR': 'Iran, Islamic Republic of',
2821 'IM': 'Isle of Man',
2831 'KP': 'Korea, Democratic People\'s Republic of',
2832 'KR': 'Korea, Republic of',
2835 'LA': 'Lao People\'s Democratic Republic',
2841 'LI': 'Liechtenstein',
2845 'MK': 'Macedonia, the Former Yugoslav Republic of',
2852 'MH': 'Marshall Islands',
2858 'FM': 'Micronesia, Federated States of',
2859 'MD': 'Moldova, Republic of',
2870 'NL': 'Netherlands',
2871 'NC': 'New Caledonia',
2872 'NZ': 'New Zealand',
2877 'NF': 'Norfolk Island',
2878 'MP': 'Northern Mariana Islands',
2883 'PS': 'Palestine, State of',
2885 'PG': 'Papua New Guinea',
2888 'PH': 'Philippines',
2892 'PR': 'Puerto Rico',
2896 'RU': 'Russian Federation',
2898 'BL': 'Saint Barthélemy',
2899 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
2900 'KN': 'Saint Kitts and Nevis',
2901 'LC': 'Saint Lucia',
2902 'MF': 'Saint Martin (French part)',
2903 'PM': 'Saint Pierre and Miquelon',
2904 'VC': 'Saint Vincent and the Grenadines',
2907 'ST': 'Sao Tome and Principe',
2908 'SA': 'Saudi Arabia',
2912 'SL': 'Sierra Leone',
2914 'SX': 'Sint Maarten (Dutch part)',
2917 'SB': 'Solomon Islands',
2919 'ZA': 'South Africa',
2920 'GS': 'South Georgia and the South Sandwich Islands',
2921 'SS': 'South Sudan',
2926 'SJ': 'Svalbard and Jan Mayen',
2929 'CH': 'Switzerland',
2930 'SY': 'Syrian Arab Republic',
2931 'TW': 'Taiwan, Province of China',
2933 'TZ': 'Tanzania, United Republic of',
2935 'TL': 'Timor-Leste',
2939 'TT': 'Trinidad and Tobago',
2942 'TM': 'Turkmenistan',
2943 'TC': 'Turks and Caicos Islands',
2947 'AE': 'United Arab Emirates',
2948 'GB': 'United Kingdom',
2949 'US': 'United States',
2950 'UM': 'United States Minor Outlying Islands',
2954 'VE': 'Venezuela, Bolivarian Republic of',
2956 'VG': 'Virgin Islands, British',
2957 'VI': 'Virgin Islands, U.S.',
2958 'WF': 'Wallis and Futuna',
2959 'EH': 'Western Sahara',
def short2full(cls, code):
    """Convert an ISO 3166-2 country code to the corresponding full name"""
    key = code.upper()
    return cls._country_map.get(key)
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler that honours a per-request 'Ytdl-request-proxy' header.

    The header value overrides the handler-level proxy mapping; the special
    value '__noproxy__' disables proxying for that request, and socks* URLs
    are forwarded via the 'Ytdl-socks-proxy' header instead of being opened
    here (the http/https handlers wrap the socket with SOCKS themselves).
    """

    def __init__(self, proxies=None):
        # Install default http/https openers so proxy_open is always
        # consulted, even when no proxy mapping was supplied.
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            # Bug fix: the per-request proxy must actually replace the
            # default one before the header is stripped.
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # youtube-dl's http/https handlers do wrapping the socket with socks
            # Bug fix: must stop here so the parent handler does not try to
            # open the socks URL as a plain HTTP proxy.
            return None
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
def ohdave_rsa_encrypt(data, exponent, modulus):
    """
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    """
    # The bytes are interpreted little-endian, hence the reversal before
    # building the big integer.
    plaintext_int = int(binascii.hexlify(data[::-1]), 16)
    ciphertext_int = pow(plaintext_int, exponent, modulus)
    return format(ciphertext_int, 'x')
def encode_base_n(num, n, table=None):
    """Encode non-negative integer *num* as a string in base *n*.

    table: optional digit alphabet (index i -> digit for value i); defaults
    to 0-9a-zA-Z truncated to n symbols.
    Raises ValueError when n exceeds the table length.
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    # 0 has no digits in the loop below, so handle it explicitly
    if num == 0:
        return table[0]

    ret = ''
    while num:
        ret = table[num % n] + ret
        num = num // n
    return ret
def decode_packed_codes(code):
    """Unpack JavaScript obfuscated with Dean Edwards' p.a.c.k.e.r.

    PACKED_CODES_RE captures (payload, base, symbol count, '|'-joined
    symbol list); each base-n word in the payload is replaced by its symbol.
    """
    mobj = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = mobj.groups()
    base = int(base)
    count = int(count)
    symbols = symbols.split('|')
    symbol_table = {}

    while count:
        count -= 1
        base_n_count = encode_base_n(count, base)
        # An empty symbol means the identifier encodes itself
        symbol_table[base_n_count] = symbols[count] or base_n_count

    return re.sub(
        r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
        obfuscated_code)
def parse_m3u8_attributes(attrib):
    """Parse an M3U8/HLS attribute list ('KEY=val,KEY2="quoted,val"')
    into a dict, stripping surrounding quotes from quoted values."""
    info = {}
    for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
        if val.startswith('"'):
            val = val[1:-1]  # drop the surrounding double quotes
        info[key] = val
    return info
def urshift(val, n):
    """Unsigned (logical) 32-bit right shift of *val* by *n* bits."""
    # Map a negative 32-bit value onto its unsigned representation first,
    # then an ordinary shift is the logical shift.
    if val < 0:
        val += 0x100000000
    return val >> n
# Based on png2str() written by @gdkchan and improved by @yokrysty
# Originally posted at https://github.com/rg3/youtube-dl/issues/9706
def decode_png(png_data):
    """Decode a (filtered, non-interlaced RGB) PNG into raw pixel bytes.

    Returns (width, height, pixels) where pixels is a list of rows, each a
    list of byte values (3 per pixel).
    Raises IOError for a bad signature or missing IDAT data.
    """
    # Reference: https://www.w3.org/TR/PNG/
    header = png_data[8:]

    if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
        raise IOError('Not a valid PNG file.')

    int_map = {1: '>B', 2: '>H', 4: '>I'}
    unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]

    chunks = []

    # Split the stream into chunks: 4-byte length, 4-byte type, data, CRC
    while header:
        length = unpack_integer(header[:4])
        header = header[4:]

        chunk_type = header[:4]
        header = header[4:]

        chunk_data = header[:length]
        header = header[length:]

        header = header[4:]  # Skip CRC

        chunks.append({
            'type': chunk_type,
            'length': length,
            'data': chunk_data,
        })

    ihdr = chunks[0]['data']

    width = unpack_integer(ihdr[:4])
    height = unpack_integer(ihdr[4:8])

    # Concatenate all image-data chunks before inflating
    idat = b''

    for chunk in chunks:
        if chunk['type'] == b'IDAT':
            idat += chunk['data']

    if not idat:
        raise IOError('Unable to read PNG data.')

    decompressed_data = bytearray(zlib.decompress(idat))

    stride = width * 3  # assumes 3 bytes/pixel (RGB, 8-bit) — no alpha
    pixels = []

    def _get_pixel(idx):
        # Fetch an already-reconstructed byte by flat index
        x = idx % stride
        y = idx // stride
        return pixels[y][x]

    for y in range(height):
        # Each scanline is prefixed with one filter-type byte
        basePos = y * (1 + stride)
        filter_type = decompressed_data[basePos]

        current_row = []

        pixels.append(current_row)

        for x in range(stride):
            color = decompressed_data[1 + basePos + x]
            basex = y * stride + x
            left = 0
            up = 0

            if x > 2:
                left = _get_pixel(basex - 3)
            if y > 0:
                up = _get_pixel(basex - stride)

            # Undo the per-scanline filter (PNG spec section 9)
            if filter_type == 1:  # Sub
                color = (color + left) & 0xff
            elif filter_type == 2:  # Up
                color = (color + up) & 0xff
            elif filter_type == 3:  # Average
                color = (color + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                a = left
                b = up
                c = 0

                if x > 2 and y > 0:
                    c = _get_pixel(basex - stride - 3)

                p = a + b - c

                pa = abs(p - a)
                pb = abs(p - b)
                pc = abs(p - c)

                if pa <= pb and pa <= pc:
                    color = (color + a) & 0xff
                elif pb <= pc:
                    color = (color + b) & 0xff
                else:
                    color = (color + c) & 0xff

            current_row.append(color)

    return width, height, pixels
def write_xattr(path, key, value):
    """Set extended attribute *key* (bytes value) on the file at *path*.

    Tries, in order: the pyxattr/xattr Python modules, NTFS Alternate Data
    Streams on Windows, then the setfattr/xattr command-line tools.
    Raises XAttrUnavailableError when no tool is usable and
    XAttrMetadataError when the write itself fails.
    """
    # This mess below finds the best xattr tool for the job
    try:
        # try the pyxattr module...
        import xattr

        if hasattr(xattr, 'set'):  # pyxattr
            # Unicode arguments are not supported in python-pyxattr until
            # version 0.5.0
            # See https://github.com/rg3/youtube-dl/issues/5498
            pyxattr_required_version = '0.5.0'
            if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
                # TODO: fallback to CLI tools
                raise XAttrUnavailableError(
                    'python-pyxattr is detected but is too old. '
                    'youtube-dl requires %s or above while your version is %s. '
                    'Falling back to other xattr implementations' % (
                        pyxattr_required_version, xattr.__version__))

            setxattr = xattr.set
        else:  # xattr module
            setxattr = xattr.setxattr

        try:
            setxattr(path, key, value)
        except EnvironmentError as e:
            raise XAttrMetadataError(e.errno, e.strerror)

    except ImportError:
        if compat_os_name == 'nt':
            # Write xattrs to NTFS Alternate Data Streams:
            # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
            assert ':' not in key
            assert os.path.exists(path)

            ads_fn = path + ':' + key
            try:
                with open(ads_fn, 'wb') as f:
                    f.write(value)
            except EnvironmentError as e:
                raise XAttrMetadataError(e.errno, e.strerror)
        else:
            user_has_setfattr = check_executable('setfattr', ['--version'])
            user_has_xattr = check_executable('xattr', ['-h'])

            if user_has_setfattr or user_has_xattr:
                # CLI tools take the value as a text argument
                value = value.decode('utf-8')
                if user_has_setfattr:
                    executable = 'setfattr'
                    opts = ['-n', key, '-v', value]
                elif user_has_xattr:
                    executable = 'xattr'
                    opts = ['-w', key, value]

                cmd = ([encodeFilename(executable, True)] +
                       [encodeArgument(o) for o in opts] +
                       [encodeFilename(path, True)])

                try:
                    p = subprocess.Popen(
                        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
                except EnvironmentError as e:
                    raise XAttrMetadataError(e.errno, e.strerror)
                stdout, stderr = p.communicate()
                stderr = stderr.decode('utf-8', 'replace')
                if p.returncode != 0:
                    raise XAttrMetadataError(p.returncode, stderr)
            else:
                # On Unix, and can't find pyxattr, setfattr, or xattr.
                if sys.platform.startswith('linux'):
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'pyxattr' or 'xattr' "
                        "modules, or the GNU 'attr' package "
                        "(which contains the 'setfattr' tool).")
                else:
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'xattr' module, "
                        "or the 'xattr' binary.")