4 from __future__ import unicode_literals
33 import xml.etree.ElementTree
40 compat_etree_fromstring,
42 compat_html_entities_html5,
48 compat_socket_create_connection,
54 compat_urllib_parse_urlencode,
55 compat_urllib_parse_urlparse,
56 compat_urllib_parse_unquote_plus,
57 compat_urllib_request,
def register_socks_protocols():
    """Make urlparse treat SOCKS schemes as netloc-carrying protocols."""
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904:
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly.
    known = compat_urlparse.uses_netloc
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in known:
            known.append(scheme)
# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))

# NOTE(review): the following entries belong to the std_headers dict
# (its opening line is not visible in this view) — default HTTP headers
# sent with every request.
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/47.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',

    # Alternate UA string (presumably part of a USER_AGENTS mapping — confirm)
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',

# English month names, used by date parsing helpers below
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# NOTE(review): entries of a MONTH_NAMES dict keyed by language code
    'en': ENGLISH_MONTH_NAMES,
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],

# NOTE(review): entries of the KNOWN_EXTENSIONS tuple (media file extensions)
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))

# NOTE(review): entries of the DATE_FORMATS tuple (strptime patterns)
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%b %d %Y at %H:%M:%S',

# Variants of DATE_FORMATS biased toward day-first / month-first ordering
DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([

# Matches the argument list of packer-obfuscated ("p.a.c.k.e.r") JS code
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        # Verify the reported encoding is actually usable; some platforms
        # report an encoding Python cannot encode to.
        'TEST'.encode(pref)
    except Exception:
        # Fall back to a safe universal default.
        pref = 'UTF-8'

    return pref
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        path_basename = lambda f: os.path.basename(fn).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
        # NOTE(review): on Python 3 (or win32) the plain helpers are used instead
        path_basename = os.path.basename
        path_dirname = os.path.dirname
        # NOTE(review): entries of the NamedTemporaryFile kwargs dict (`args`)
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
    # Write to a temp file in the same directory, then rename over fn for atomicity
    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
    if sys.platform == 'win32':
        # Need to remove existing file on Windows, else os.rename raises
        # WindowsError or FileExistsError.
        os.rename(tf.name, fn)
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] (first matching element or None). """
        # Only plain attribute names are safe to splice into the expression.
        assert re.match(r'^[a-zA-Z_-]+$', key)
        expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
        return node.find(expr)
else:
    # Python 2.6's ElementTree does not support [@attr='val'] predicates,
    # so filter the findall() results manually.
    def find_xpath_attr(node, xpath, key, val=None):
        for f in node.findall(compat_xpath(xpath)):
            if key not in f.attrib:
                continue
            if val is None or f.attrib.get(key) == val:
                return f
        return None
# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' components of an xpath using ns_map.

    'media:title' with {'media': URI} becomes '{URI}title'; components
    without a prefix are kept as-is.
    """
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Find an XML element by xpath; honor `default`/`fatal` on a miss."""
    def _find_xpath(xpath):
        return node.find(compat_xpath(xpath))

    # A single xpath string is looked up directly; presumably an iterable of
    # candidate xpaths is tried in order otherwise (branch not shown here).
    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)

    # Miss handling: return the caller-supplied default if one was given …
    if default is not NO_DEFAULT:
        # … otherwise, with fatal set, report the missing element by name.
        name = xpath if name is None else name
        raise ExtractorError('Could not find XML element %s' % name)
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Like xpath_element(), but return the matched element's text."""
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    # When the lookup missed (or yielded the passthrough default) …
    if n is None or n == default:
        # … return the default if one was given, else (fatal) raise below.
        if default is not NO_DEFAULT:
        name = xpath if name is None else name
        raise ExtractorError('Could not find XML element\'s text %s' % name)
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    """Find an element by xpath and return its `key` attribute value."""
    n = find_xpath_attr(node, xpath, key)
    # On a miss: default wins over fatal; fatal raises with a readable name.
    if default is not NO_DEFAULT:
        name = '%s[@%s]' % (xpath, key) if name is None else name
        raise ExtractorError('Could not find XML attribute %s' % name)
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # Thin wrapper over the generic attribute matcher.
    return get_element_by_attribute('id', id, html)
def get_element_by_class(class_name, html):
    """Return the content of the first tag whose class attribute contains class_name."""
    # Build a pattern matching class_name as one whole token inside the
    # (possibly multi-valued) class attribute; pass it through unescaped.
    class_pattern = r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name)
    return get_element_by_attribute('class', class_pattern, html, escape_value=False)
def get_element_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of the tag with the specified attribute in the passed HTML document"""

    # `value` may be a pre-built regex fragment when escape_value is False
    value = re.escape(value) if escape_value else value

    m = re.search(r'''(?xs)
        (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
        (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
        ''' % (re.escape(attribute), value), html)

    res = m.group('content')

    # Strip surrounding quotes left by the capture, if any
    if res.startswith('"') or res.startswith("'"):

    return unescapeHTML(res)
class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""
    # NOTE(review): the __init__ header is not visible in this view; it
    # presumably initializes self.attrs before delegating to the base class.
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        # Record the start tag's attributes as a plain dict.
        self.attrs = dict(attrs)
def extract_attributes(html_element):
    """Given a string for an HTML element such as
    a="foo" B="bar" c="&98;az" d=boz
    empty= noval entity="&amp;"

    Decode and return a dictionary of attributes.
    'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
    'empty': '', 'noval': None, 'entity': '&',
    'sq': '"', 'dq': '\''

    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    # Feed the single element into the trivial parser and read back .attrs
    parser = HTMLAttributeParser()
    parser.feed(html_element)
def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Newline vs <br />
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()

    It returns the tuple (stream, definitive_file_name).
    """
    # '-' means stdout; on Windows the fd must be switched to binary mode
    if sys.platform == 'win32':
        msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # Permission errors are not recoverable by renaming — re-raise
        if err.errno in (errno.EACCES,):

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:

        # An exception here should be caught in the caller
        stream = open(encodeFilename(alt_filename), open_mode)
        return (stream, alt_filename)
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp.

    Returns None when the string cannot be parsed.
    """
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        # Transliterate accented characters in restricted mode
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        # Drop '?', control characters and DEL outright
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return '' if restricted else '\''
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
        # Restricted mode also replaces punctuation and whitespace
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
        # Restricted mode keeps ASCII only
        if restricted and ord(char) > 127:

    # Turn timestamps like 12:34:56 into 12_34_56 before per-char mapping
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(map(replace_insane, s))
    # Collapse runs of underscores and trim leading/trailing ones
    while '__' in result:
        result = result.replace('__', '_')
    result = result.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
    if result.startswith('-'):
        result = '_' + result[len('-'):]
    result = result.lstrip('.')
def sanitize_path(s):
    """Sanitizes and normalizes path on Windows"""
    # On other platforms the path is returned unchanged (line not shown)
    if sys.platform != 'win32':
    drive_or_unc, _ = os.path.splitdrive(s)
    if sys.version_info < (2, 7) and not drive_or_unc:
        # splitunc() is the pre-2.7 way to detect UNC prefixes
        drive_or_unc, _ = os.path.splitunc(s)
    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    # Replace win32-forbidden characters (and trailing dots/spaces) with '#'
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    sanitized_path.insert(0, drive_or_unc + os.path.sep)
    return os.path.join(*sanitized_path)
# Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
# unwanted failures due to missing protocol
def sanitize_url(url):
    if url.startswith('//'):
        return 'http:%s' % url
    return url
def sanitized_Request(url, *args, **kwargs):
    """Build a urllib Request after normalizing scheme-less URLs."""
    safe_url = sanitize_url(url)
    return compat_urllib_request.Request(safe_url, *args, **kwargs)
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable, preserving order. """
    res = []
    for el in iterable:
        if el not in res:
            res.append(el)
    return res
def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in compat_html_entities_html5:
        return compat_html_entities_html5[entity_with_semicolon]

    # Numeric character references: &#NNN; or &#xHHH;
    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            # Rewrite 'x1f' to Python's hex-literal form '0x1f'
            numstr = '0%s' % numstr
        # See https://github.com/rg3/youtube-dl/issues/7518
        return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity
    # NOTE(review): body of unescapeHTML (def header not visible in this view).
    assert type(s) == compat_str

    # Replace each '&name;' / '&#N;' entity via _htmlentity_transform
        r'&([^;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
def get_subprocess_encoding():
    """Return the byte encoding used for subprocess arguments."""
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
        # Elsewhere the filesystem encoding is used
        encoding = sys.getfilesystemencoding()
def encodeFilename(s, for_subprocess=False):
    """Encode a unicode filename to bytes where the platform requires it.

    @param s The name of the file
    """
    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):

    # Pass '' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:

    # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
    if sys.platform.startswith('java'):

    return s.encode(get_subprocess_encoding(), 'ignore')
def decodeFilename(b, for_subprocess=False):
    """Inverse of encodeFilename: bytes back to a unicode filename."""
    # Python 3 filenames are already str
    if sys.version_info >= (3, 0):

    if not isinstance(b, bytes):

    return b.decode(get_subprocess_encoding(), 'ignore')
def encodeArgument(s):
    """Encode a command-line argument for passing to a subprocess."""
    if isinstance(s, compat_str):
        return encodeFilename(s, True)
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    return encodeFilename(s.decode('ascii'), True)
def decodeArgument(b):
    """Decode a subprocess argument back to text."""
    return decodeFilename(b, True)
def decodeOption(optval):
    """Decode a command-line option value to a unicode string."""
    # Bytes input is decoded with the preferred locale encoding
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
def formatSeconds(secs):
    """Format a duration in seconds as H:MM:SS, M:SS or plain seconds."""
    if secs > 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs > 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
def make_HTTPS_handler(params, **kwargs):
    """Build an HTTPS handler honoring the 'nocheckcertificate' option."""
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            # Disable both hostname and certificate verification
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)

    # (create_default_context present but HTTPSHandler has no context=)

    if sys.version_info < (3, 2):
        return YoutubeDLHTTPSHandler(params, **kwargs)
    # Python 3.2/3.3: build an SSLContext by hand
    context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
    context.verify_mode = (ssl.CERT_NONE
                           if opts_no_check_certificate
                           else ssl.CERT_REQUIRED)
    context.set_default_verify_paths()
    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
def bug_reports_message():
    """Build the standard 'please report this issue' message suffix."""
    # Tailor the update hint to whether this build can self-update
    if ytdl_is_updateable():
        update_cmd = 'type youtube-dl -U to update'
        update_cmd = 'see https://yt-dl.org/update on how to update'
    msg = '; please report this issue on https://yt-dl.org/bug .'
    msg += ' Make sure you are using the latest version; %s.' % update_cmd
    msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """
        # Network-level failures are treated as expected (not a bug)
        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
        if video_id is not None:
            msg = video_id + ': ' + msg
            # When a causing exception was given, surface it in the message
            msg += ' (caused by %r)' % cause
            # Unexpected errors get the bug-report boilerplate appended
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)

        self.exc_info = sys.exc_info()  # preserve original exception
        self.video_id = video_id

    def format_traceback(self):
        # Render the stored traceback, when one was captured
        if self.traceback is None:
        return ''.join(traceback.format_tb(self.traceback))
class UnsupportedError(ExtractorError):
    """Raised when no extractor can handle the given URL."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        # Original (exc_type, exc_value, traceback) triple, or None
        self.exc_info = exc_info
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Byte counts: what was actually received vs. the announced length
        self.downloaded = downloaded
        self.expected = expected
class XAttrMetadataError(Exception):
    """Raised when writing extended file attributes fails; classifies the cause."""

    def __init__(self, code=None, msg='Unknown error'):
        super(XAttrMetadataError, self).__init__(msg)

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT) or
                'No space left' in self.msg or 'Disk quota excedded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
            # Anything else is treated as lack of xattr support
            self.reason = 'NOT_SUPPORTED'
# Raised when the xattr facility itself is unavailable on this system
class XAttrUnavailableError(Exception):
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Create an HTTP(S) connection, applying source-address binding."""
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/rg3/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs[b'strict'] = True
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')
    if source_address is not None:
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
            # Pre-2.7 fallback: monkey-patch connect() to bind manually
            def _hc_connect(self, *args, **kwargs):
                sock = compat_socket_create_connection(
                    (self.host, self.port), self.timeout, sa)
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
            hc.connect = functools.partial(_hc_connect, hc)
def handle_youtubedl_headers(headers):
    """Strip the internal 'Youtubedl-no-compression' marker header.

    When the marker is present, 'Accept-Encoding' (any case) is dropped as
    well so the request is made without compression; otherwise the headers
    are returned untouched.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers
    filtered = dict((k, v) for k, v in headers.items() if k.lower() != 'accept-encoding')
    del filtered['Youtubedl-no-compression']
    return filtered
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = compat_http_client.HTTPConnection

        # Honor the internal Ytdl-socks-proxy pseudo-header by swapping in a
        # SOCKS-aware connection class, then drop the marker header.
        socks_proxy = req.headers.get('Ytdl-socks-proxy')
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),

        # Raw-deflate first, then zlib-wrapped deflate as fallback
            return zlib.decompress(data, -zlib.MAX_WBITS)
            return zlib.decompress(data)

    def addinfourl_wrapper(stream, headers, url, code):
        # Newer urllib addinfourl takes the code directly
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

    def http_response(self, req, resp):
        # gzip-compressed responses are decompressed transparently
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk add the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate-compressed responses likewise
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/rg3/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                    location = location.decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    if sys.version_info < (3, 0):
                        location_escaped = location_escaped.encode('utf-8')
                    resp.headers['Location'] = location_escaped

    https_request = http_request
    https_response = http_response
def make_socks_conn_class(base_class, socks_proxy):
    """Derive a connection class that tunnels through the given SOCKS proxy."""
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    # Map URL scheme to the SOCKS protocol version
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        # Empty/None credentials are passed through unchanged
        return compat_urllib_parse_unquote_plus(s)

    # (host, port, type, ..., username, password) tuple for sockssocket.setproxy
        url_components.hostname, url_components.port or 1080,
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),

    class SocksConnection(base_class):
            # Replace the plain socket with a SOCKS-wrapped one
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            # For HTTPS, additionally wrap the tunneled socket in TLS
            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler supporting custom connection classes and SOCKS proxies."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        conn_class = self._https_conn_class

        # Forward the SSL context / hostname-check settings when available
        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        # Same Ytdl-socks-proxy handling as in YoutubeDLHandler.http_open
        socks_proxy = req.headers.get('Ytdl-socks-proxy')
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, True),
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor; kept as a hook for Set-Cookie workarounds."""

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
def extract_timezone(date_str):
    """Split a trailing timezone designator off a date string.

    Returns (timezone_delta, remaining_date_str); the delta is zero when no
    (or a 'Z'/unsigned) designator is found.
    """
    m = re.search(
        r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)
    if not m:
        timezone = datetime.timedelta()
    else:
        # Strip the designator from the string before parsing the rest
        date_str = date_str[:-len(m.group('tz'))]
        if not m.group('sign'):
            timezone = datetime.timedelta()
        else:
            sign = 1 if m.group('sign') == '+' else -1
            timezone = datetime.timedelta(
                hours=sign * int(m.group('hours')),
                minutes=sign * int(m.group('minutes')))
    return timezone, date_str
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date, or None on failure. """

    if date_str is None:
        return None

    # Drop fractional seconds; strptime's %f handling varies across inputs
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    try:
        date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
        dt = datetime.datetime.strptime(date_str, date_format) - timezone
        return calendar.timegm(dt.timetuple())
    except ValueError:
        pass
def date_formats(day_first=True):
    """Pick the strptime format list matching the day/month ordering."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""

    if date_str is None:

    # Replace commas
    date_str = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
    _, date_str = extract_timezone(date_str)

    # Try every known format, first match wins
    for expression in date_formats(day_first):
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
    if upload_date is None:
        # Last resort: RFC 2822 style dates
        timetuple = email.utils.parsedate_tz(date_str)
            upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    if upload_date is not None:
        return compat_str(upload_date)
def unified_timestamp(date_str, day_first=True):
    """Return a UNIX timestamp parsed from a free-form date string."""
    if date_str is None:

    date_str = date_str.replace(',', ' ')

    # A trailing 'PM' means a 12-hour offset must be added after parsing
    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    for expression in date_formats(day_first):
            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())
    # Fallback: RFC 2822 style dates
    timetuple = email.utils.parsedate_tz(date_str)
        return calendar.timegm(timetuple) + pm_delta * 3600
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from a URL, falling back to default_ext."""
    if url is None:
        return default_ext
    # Drop the query string, then take whatever follows the last dot
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    elif guess.rstrip('/') in KNOWN_EXTENSIONS:
        return guess.rstrip('/')
    else:
        return default_ext
def subtitles_filename(filename, sub_lang, sub_format):
    """Build the subtitle file name: <base>.<lang>.<format>."""
    base = filename.rsplit('.', 1)[0]
    return '.'.join((base, sub_lang, sub_format))
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # A bad approximation? timedelta has no month/year units, so
        # approximate them as 30/365 days.
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, '%Y%m%d').date()
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format;
    other strings are returned unchanged."""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is not None:
        return '-'.join(match.groups())
    else:
        return date_str
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        # Missing bounds default to the representable min/max dates
        if start is not None:
            self.start = date_from_str(start)
            self.start = datetime.datetime.min.date()
            self.end = date_from_str(end)
            self.end = datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        # Both endpoints are inclusive
        return self.start <= date <= self.end

        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
def platform_name():
    """ Returns the platform name as a compat_str """
    res = platform.platform()
    # Some Python 2 setups return bytes here; normalize to text
    if isinstance(res, bytes):
        res = res.decode(preferredencoding())

    assert isinstance(res, compat_str)
    return res
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # Map C-level file descriptors to the Windows standard-handle IDs
    WIN_OUTPUT_IDS = {
        1: -11,  # STD_OUTPUT_HANDLE
        2: -12,  # STD_ERROR_HANDLE
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b'GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b'GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # True when the handle is invalid or does not refer to a real console
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character outside the Basic Multilingual Plane
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True
def write_string(s, out=None, encoding=None):
    """Write the text string s to out (default sys.stderr), coping with
    byte streams, Windows consoles and missing encodings."""
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):
            return

    if ('b' in getattr(out, 'mode', '') or
            sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        byt = s.encode(encoding or preferredencoding(), 'ignore')
        out.write(byt)
    elif hasattr(out, 'buffer'):
        # Text stream wrapping a binary buffer: write encoded bytes directly
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        byt = s.encode(enc, 'ignore')
        out.buffer.write(byt)
    else:
        out.write(s)
    out.flush()
def bytes_to_intlist(bs):
    """Convert a byte string into a list of integer byte values."""
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3
        return list(bs)
    else:
        return [ord(c) for c in bs]
def intlist_to_bytes(xs):
    """Pack a list of integer byte values back into a byte string."""
    if not xs:
        return b''
    return compat_struct_pack('%dB' % len(xs), *xs)
1375 # Cross-platform file locking
1376 if sys.platform == 'win32':
1377 import ctypes.wintypes
1380 class OVERLAPPED(ctypes.Structure):
1382 ('Internal', ctypes.wintypes.LPVOID),
1383 ('InternalHigh', ctypes.wintypes.LPVOID),
1384 ('Offset', ctypes.wintypes.DWORD),
1385 ('OffsetHigh', ctypes.wintypes.DWORD),
1386 ('hEvent', ctypes.wintypes.HANDLE),
1389 kernel32 = ctypes.windll.kernel32
1390 LockFileEx = kernel32.LockFileEx
1391 LockFileEx.argtypes = [
1392 ctypes.wintypes.HANDLE, # hFile
1393 ctypes.wintypes.DWORD, # dwFlags
1394 ctypes.wintypes.DWORD, # dwReserved
1395 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1396 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1397 ctypes.POINTER(OVERLAPPED) # Overlapped
1399 LockFileEx.restype = ctypes.wintypes.BOOL
1400 UnlockFileEx = kernel32.UnlockFileEx
1401 UnlockFileEx.argtypes = [
1402 ctypes.wintypes.HANDLE, # hFile
1403 ctypes.wintypes.DWORD, # dwReserved
1404 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1405 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1406 ctypes.POINTER(OVERLAPPED) # Overlapped
1408 UnlockFileEx.restype = ctypes.wintypes.BOOL
1409 whole_low = 0xffffffff
1410 whole_high = 0x7fffffff
1412 def _lock_file(f, exclusive):
1413 overlapped = OVERLAPPED()
1414 overlapped.Offset = 0
1415 overlapped.OffsetHigh = 0
1416 overlapped.hEvent = 0
1417 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1418 handle = msvcrt.get_osfhandle(f.fileno())
1419 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
1420 whole_low, whole_high, f._lock_file_overlapped_p):
1421 raise OSError('Locking file failed: %r' % ctypes.FormatError())
1423 def _unlock_file(f):
1424 assert f._lock_file_overlapped_p
1425 handle = msvcrt.get_osfhandle(f.fileno())
1426 if not UnlockFileEx(handle, 0,
1427 whole_low, whole_high, f._lock_file_overlapped_p):
1428 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1431 # Some platforms, such as Jython, is missing fcntl
1435 def _lock_file(f, exclusive):
1436 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1438 def _unlock_file(f):
1439 fcntl.flock(f, fcntl.LOCK_UN)
1441 UNSUPPORTED_MSG = 'file locking is not supported on this platform'
1443 def _lock_file(f, exclusive):
1444 raise IOError(UNSUPPORTED_MSG)
1446 def _unlock_file(f):
1447 raise IOError(UNSUPPORTED_MSG)
class locked_file(object):
    """Context manager wrapping io.open() with an advisory file lock
    (shared for reads, exclusive for writes/appends)."""

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except IOError:
            # Could not take the lock: do not leak the file handle
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
def get_filesystem_encoding():
    """Return the filesystem encoding, defaulting to utf-8 when unknown."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
def shell_quote(args):
    """Quote a list of arguments for safe display as a shell command line."""
    quoted_args = []
    encoding = get_filesystem_encoding()
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(pipes.quote(a))
    return ' '.join(quoted_args)
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """

    # Merge any data already smuggled into the URL so it is not lost
    url, idata = unsmuggle_url(url, {})
    data.update(idata)
    sdata = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return url + '#' + sdata
def unsmuggle_url(smug_url, default=None):
    """Extract data previously attached with smuggle_url().

    Returns (url, data); data is `default` when nothing was smuggled."""
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    data = json.loads(jsond)
    return url, data
def format_bytes(bytes):
    """Format a byte count as a human-readable string (e.g. '1.00KiB')."""
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    if bytes == 0.0:
        # Avoid math.log(0) (domain error)
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
    suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
def lookup_unit_table(unit_table, s):
    """Parse '<number><unit>' using unit_table ({unit: multiplier}).

    Returns the value in base units as an int, or None when s does not match."""
    units_re = '|'.join(re.escape(u) for u in unit_table)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
    if not m:
        return None
    # Accept ',' as decimal separator as well
    num_str = m.group('num').replace(',', '.')
    mult = unit_table[m.group('unit')]
    return int(float(num_str) * mult)
def parse_filesize(s):
    """Parse a human-readable file size ('5 MiB', '1.2GB', ...) into bytes."""
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
        'bytes': 1,
        'KiB': 1024,
        'KB': 1000,
        'kB': 1024,
        'Kb': 1000,
        'kb': 1000,
        'kilobytes': 1000,
        'kibibytes': 1024,
        'MiB': 1024 ** 2,
        'MB': 1000 ** 2,
        'mB': 1024 ** 2,
        'Mb': 1000 ** 2,
        'mb': 1000 ** 2,
        'megabytes': 1000 ** 2,
        'mebibytes': 1024 ** 2,
        'GiB': 1024 ** 3,
        'GB': 1000 ** 3,
        'gB': 1024 ** 3,
        'Gb': 1000 ** 3,
        'gb': 1000 ** 3,
        'gigabytes': 1000 ** 3,
        'gibibytes': 1024 ** 3,
        'TiB': 1024 ** 4,
        'TB': 1000 ** 4,
        'tB': 1024 ** 4,
        'Tb': 1000 ** 4,
        'tb': 1000 ** 4,
        'terabytes': 1000 ** 4,
        'tebibytes': 1024 ** 4,
        'PiB': 1024 ** 5,
        'PB': 1000 ** 5,
        'pB': 1024 ** 5,
        'Pb': 1000 ** 5,
        'pb': 1000 ** 5,
        'petabytes': 1000 ** 5,
        'pebibytes': 1024 ** 5,
        'EiB': 1024 ** 6,
        'EB': 1000 ** 6,
        'eB': 1024 ** 6,
        'Eb': 1000 ** 6,
        'eb': 1000 ** 6,
        'exabytes': 1000 ** 6,
        'exbibytes': 1024 ** 6,
        'ZiB': 1024 ** 7,
        'ZB': 1000 ** 7,
        'zB': 1024 ** 7,
        'Zb': 1000 ** 7,
        'zb': 1000 ** 7,
        'zettabytes': 1000 ** 7,
        'zebibytes': 1024 ** 7,
        'YiB': 1024 ** 8,
        'YB': 1000 ** 8,
        'yB': 1024 ** 8,
        'Yb': 1000 ** 8,
        'yb': 1000 ** 8,
        'yottabytes': 1000 ** 8,
        'yobibytes': 1024 ** 8,
    }

    return lookup_unit_table(_UNIT_TABLE, s)
1618 if re.match(r'^[\d,.]+$', s):
1619 return str_to_int(s)
1630 return lookup_unit_table(_UNIT_TABLE, s)
def month_by_name(name, lang='en'):
    """ Return the number of a month by (locale-independently) English name """

    month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])

    try:
        return month_names.index(name) + 1
    except ValueError:
        # Unknown month name: signal with None instead of raising
        return None
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations """

    try:
        return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
    except ValueError:
        # Unknown abbreviation: signal with None instead of raising
        return None
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML, leaving existing entities alone"""
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
        '&amp;',
        xml_str)
def setproctitle(title):
    """Best-effort: set the process title via prctl(PR_SET_NAME) on Linux."""
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        # Not a glibc system: silently skip
        return
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
    try:
        # 15 == PR_SET_NAME
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
def remove_start(s, start):
    """Strip `start` from the beginning of `s` when present; None passes through."""
    if s is not None and s.startswith(start):
        return s[len(start):]
    return s
def remove_end(s, end):
    """Strip `end` from the end of `s` when present; None passes through."""
    if s is not None and s.endswith(end):
        return s[:-len(end)]
    return s
def remove_quotes(s):
    """Strip one matching pair of surrounding single or double quotes."""
    if s is None or len(s) < 2:
        return s
    for quote in ('"', "'", ):
        if s[0] == quote and s[-1] == quote:
            return s[1:-1]
    return s
def url_basename(url):
    """Return the last path component of a URL (query/fragment excluded)."""
    url_path = compat_urlparse.urlparse(url).path
    stripped = url_path.strip('/')
    return stripped.split('/')[-1]
1706 return re.match(r'https?://[^?#&]+/', url).group()
def urljoin(base, path):
    """Join base and path into an absolute URL; return None when either
    part is unusable."""
    if not isinstance(path, compat_str) or not path:
        return None
    if re.match(r'^(?:https?:)?//', path):
        # path is already an absolute (possibly protocol-relative) URL
        return path
    if not isinstance(base, compat_str) or not re.match(r'^(?:https?:)?//', base):
        return None
    return compat_urlparse.urljoin(base, path)
class HEADRequest(compat_urllib_request.Request):
    # Request subclass that issues HEAD instead of GET
    def get_method(self):
        return 'HEAD'
class PUTRequest(compat_urllib_request.Request):
    # Request subclass that issues PUT instead of GET/POST
    def get_method(self):
        return 'PUT'
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Coerce v to int, scaled by invscale/scale; return default on failure."""
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError):
        return default
def str_or_none(v, default=None):
    """Coerce v to compat_str, passing None through as `default`."""
    if v is None:
        return default
    return compat_str(v)
def str_to_int(int_str):
    """ A more relaxed version of int_or_none: strips ',', '.' and '+'. """
    if int_str is None:
        return None
    int_str = re.sub(r'[,\.\+]', '', int_str)
    return int(int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Coerce v to float, scaled by invscale/scale; return default on failure."""
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        return default
def strip_or_none(v):
    """Return v.strip(), passing None through unchanged."""
    if v is None:
        return None
    return v.strip()
def parse_duration(s):
    """Parse a duration string ('1:23:45', '2h30m', '90 min', ...) into
    seconds (float), or None when unparseable."""
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    days, hours, mins, secs, ms = [None] * 5
    # [[DD:]HH:]MM:SS[.ms]
    m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?$', s)
    if m:
        days, hours, mins, secs, ms = m.groups()
    else:
        # ISO-8601-like / verbose forms: '1d 2h 3min 4.5s', 'PT1H2M'
        m = re.match(
            r'''(?ix)(?:P?T)?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                )?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''', s)
        if m:
            days, hours, mins, secs, ms = m.groups()
        else:
            # Fractional '1.5 hours' / '90 min'
            m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
            if m:
                hours, mins = m.groups()
            else:
                return None

    duration = 0
    if secs:
        duration += float(secs)
    if mins:
        duration += float(mins) * 60
    if hours:
        duration += float(hours) * 60 * 60
    if days:
        duration += float(days) * 24 * 60 * 60
    if ms:
        duration += float(ms)
    return duration
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert `ext` before the filename's extension; when the real extension
    does not match `expected_real_ext`, append `ext` after the whole name."""
    name, real_ext = os.path.splitext(filename)
    if not expected_real_ext or real_ext[1:] == expected_real_ext:
        return '{0}.{1}{2}'.format(name, ext, real_ext)
    return '{0}.{1}'.format(filename, ext)
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the filename's extension with `ext`; when the real extension
    does not match `expected_real_ext`, append `ext` to the whole name."""
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        name = filename
    return '{0}.{1}'.format(name, ext)
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    # NOTE: args is never mutated, so the mutable default is safe here.
    try:
        subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        return False
    return exe
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if youtube-dl is run in the background.
        # See https://github.com/rg3/youtube-dl/issues/955#issuecomment-209789656
        out, _ = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
    except OSError:
        return False
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from command output using version_re;
    return `unrecognized` when no match is found."""
    assert isinstance(output, compat_str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
    if m:
        return m.group(1)
    else:
        return unrecognized
class PagedList(object):
    """Base class for lazily-fetched paginated result lists; subclasses
    must implement getslice(start, end)."""

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())
class OnDemandPagedList(PagedList):
    """PagedList that fetches pages lazily via pagefunc(pagenum),
    optionally caching fetched pages."""

    def __init__(self, pagefunc, pagesize, use_cache=False):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache
        if use_cache:
            self._cache = {}

    def getslice(self, start=0, end=None):
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = None
            if self._use_cache:
                page_results = self._cache.get(pagenum)
            if page_results is None:
                page_results = list(self._pagefunc(pagenum))
            if self._use_cache:
                self._cache[pagenum] = page_results

            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
class InAdvancePagedList(PagedList):
    """PagedList whose total page count is known up front."""

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        res = []
        start_page = start // self._pagesize
        end_page = (
            self._pagecount if end is None else (end // self._pagesize + 1))
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                # Only the first fetched page needs its leading items dropped
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
def uppercase_escape(s):
    """Decode \\UXXXXXXXX escape sequences embedded in s."""
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: unicode_escape(m.group(0))[0],
        s)
def lowercase_escape(s):
    """Decode \\uXXXX escape sequences embedded in s."""
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\u[0-9a-fA-F]{4}',
        lambda m: unicode_escape(m.group(0))[0],
        s)
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # On Python 2, quote() cannot handle unicode with non-ASCII characters,
    # so encode to UTF-8 bytes first.
    if sys.version_info < (3, 0) and isinstance(s, compat_str):
        s = s.encode('utf-8')
    # The safe-characters set keeps all RFC 3986 reserved characters intact.
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    url_parsed = compat_urllib_parse_urlparse(url)
    return url_parsed._replace(
        # Non-ASCII hostnames must go through IDNA, not percent-escaping
        netloc=url_parsed.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(url_parsed.path),
        params=escape_rfc3986(url_parsed.params),
        query=escape_rfc3986(url_parsed.query),
        fragment=escape_rfc3986(url_parsed.fragment)
    ).geturl()
def read_batch_urls(batch_fd):
    """Read URLs from a batch file object, skipping comments and blanks;
    closes batch_fd when done."""
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # Strip a UTF-8 byte-order mark that survived decoding
        BOM_UTF8 = '\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        url = url.strip()
        # '#', ';' and ']' start comment lines
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    # URL-encode POST data and return it as ASCII bytes, the type
    # urllib request objects expect for the data argument.
    return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
def update_url_query(url, query):
    """Merge `query` (a dict) into the query string of `url`, overriding
    existing parameters with the same name."""
    if not query:
        return url
    parsed_url = compat_urlparse.urlparse(url)
    qs = compat_parse_qs(parsed_url.query)
    qs.update(query)
    return compat_urlparse.urlunparse(parsed_url._replace(
        query=compat_urllib_parse_urlencode(qs, True)))
def update_Request(req, url=None, data=None, headers={}, query={}):
    """Return a copy of `req` with url/data/headers/query updated, keeping
    the original HTTP method (HEAD/PUT/GET...)."""
    req_headers = req.headers.copy()
    req_headers.update(headers)
    req_data = data or req.data
    req_url = update_url_query(url or req.get_full_url(), query)
    req_get_method = req.get_method()
    # Preserve non-default methods by picking the matching Request subclass
    if req_get_method == 'HEAD':
        req_type = HEADRequest
    elif req_get_method == 'PUT':
        req_type = PUTRequest
    else:
        req_type = compat_urllib_request.Request
    new_req = req_type(
        req_url, data=req_data, headers=req_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Like dict.get, but key_or_keys may be a list/tuple tried in order;
    None (and, optionally, falsy) values are skipped."""
    if isinstance(key_or_keys, (list, tuple)):
        for key in key_or_keys:
            if key not in d or d[key] is None or skip_false_values and not d[key]:
                continue
            return d[key]
        return default
    return d.get(key_or_keys, default)
def try_get(src, getter, expected_type=None):
    """Apply getter(src), swallowing common lookup errors; return the value
    only when it matches expected_type (when given), else None."""
    try:
        v = getter(src)
    except (AttributeError, KeyError, TypeError, IndexError):
        pass
    else:
        if expected_type is None or isinstance(v, expected_type):
            return v
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    # Coerce bytes (or other objects) to compat_str.  Note the default
    # encoding is evaluated once, at module import time.
    return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
2076 TV_PARENTAL_GUIDELINES = {
def parse_age_limit(s):
    """Normalize an age limit (int, 'NN+', US rating or TV guideline)
    into an integer age, or None."""
    if type(s) == int:
        return s if 0 <= s <= 21 else None
    if not isinstance(s, compat_basestring):
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if m:
        return int(m.group('age'))
    if s in US_RATINGS:
        return US_RATINGS[s]
    return TV_PARENTAL_GUIDELINES.get(s)
def strip_jsonp(code):
    """Strip a JSONP wrapper (callback(...);) and return the inner JSON."""
    return re.sub(
        r'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
def js_to_json(code):
    """Convert a JavaScript object literal into valid JSON text."""
    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v.startswith('/*') or v == ',':
            # Drop comments and trailing commas
            return ""

        if v[0] in ("'", '"'):
            # Re-quote string content with JSON-safe escapes
            v = re.sub(r'(?s)\\.|"', lambda m: {
                '"': '\\"',
                "\\'": "'",
                '\\\n': '',
                '\\x': '\\u00',
            }.get(m.group(0), m.group(0)), v[1:-1])

        # Hex and octal integers (optionally used as object keys)
        INTEGER_TABLE = (
            (r'^(0[xX][0-9a-fA-F]+)\s*:?$', 16),
            (r'^(0+[0-7]+)\s*:?$', 8),
        )

        for regex, base in INTEGER_TABLE:
            im = re.match(regex, v)
            if im:
                i = int(im.group(1), base)
                return '"%d":' % i if v.endswith(':') else '%d' % i

        return '"%s"' % v

    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        /\*.*?\*/|,(?=\s*[\]}])|
        [a-zA-Z_][.a-zA-Z_0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:\s*:)?|
        [0-9]+(?=\s*:)
        ''', fix_kv, code)
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        try:
            return quality_ids.index(qid)
        except ValueError:
            # Unknown quality ranks below every known one
            return -1
    return q
2153 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) > length:
        return s[:length - len(ELLIPSES)] + ELLIPSES
    return s
def version_tuple(v):
    """Split a dotted/dashed version string into a tuple of ints."""
    parts = re.split(r'[-.]', v)
    return tuple(int(part) for part in parts)
def is_outdated_version(version, limit, assume_new=True):
    """Compare two version strings; when either is missing/unparseable,
    fall back to `assume_new`."""
    if not version:
        return not assume_new
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return not assume_new
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    loader = globals().get('__loader__')
    if isinstance(loader, zipimporter):
        return True
    return hasattr(sys, 'frozen')
def args_to_str(args):
    # Get a short string representation for a subprocess command
    return ' '.join(compat_shlex_quote(a) for a in args)
def error_to_compat_str(err):
    """Stringify an exception, decoding byte messages on Python 2."""
    err_str = str(err)
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    if sys.version_info[0] < 3:
        err_str = err_str.decode(preferredencoding())
    return err_str
def mimetype2ext(mt):
    """Map a MIME type to the usual file extension.

    NOTE(review): the mapping tables below may be abridged relative to
    upstream — verify against the full youtube-dl table."""
    if mt is None:
        return None

    ext = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
    }.get(mt)
    if ext is not None:
        return ext

    # Fall back to the subtype (with any ';charset=...' suffix removed)
    _, _, res = mt.rpartition('/')
    res = res.split(';')[0].strip().lower()

    return {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-wmv': 'wmv',
        'mpegurl': 'm3u8',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.ms-sstr+xml': 'ism',
        'quicktime': 'mov',
    }.get(res, res)
def parse_codecs(codecs_str):
    """Split an RFC 6381 codecs string into {'vcodec': ..., 'acodec': ...}."""
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}
    splited_codecs = list(filter(None, map(
        lambda str: str.strip(), codecs_str.strip().strip(',').split(','))))
    vcodec, acodec = None, None
    for full_codec in splited_codecs:
        codec = full_codec.split('.')[0]
        if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v'):
            if not vcodec:
                vcodec = full_codec
        elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3'):
            if not acodec:
                acodec = full_codec
        else:
            write_string('WARNING: Unknown codec %s' % full_codec, sys.stderr)
    if not vcodec and not acodec:
        # NOTE(review): positional fallback for unrecognized codec names —
        # verify field set against upstream.
        if len(splited_codecs) == 2:
            return {
                'vcodec': splited_codecs[0],
                'acodec': splited_codecs[1],
            }
        elif len(splited_codecs) == 1:
            return {
                'vcodec': 'none',
                'acodec': splited_codecs[0],
            }
    else:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
        }
    return {}
def urlhandle_detect_ext(url_handle):
    """Guess a file extension from an HTTP response: first the
    Content-Disposition filename, then the Content-Type."""
    getheader = url_handle.headers.get

    cd = getheader('Content-Disposition')
    if cd:
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
        if m:
            e = determine_ext(m.group('filename'), default_ext=None)
            if e:
                return e

    return mimetype2ext(getheader('Content-Type'))
def encode_data_uri(data, mime_type):
    """Build an RFC 2397 data: URI from raw bytes and a MIME type."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """

    if age_limit is None:  # No limit set
        return False
    if content_limit is None:
        return False  # Content available for everyone
    return age_limit < content_limit
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            s = first_bytes[len(bom):].decode(enc, 'replace')
            break
    else:
        # No BOM: assume UTF-8
        s = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', s)
def determine_protocol(info_dict):
    """Work out the download protocol for a format dict, from its explicit
    'protocol' field, the URL scheme, or the file extension."""
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    if url.startswith('rtmp'):
        return 'rtmp'
    elif url.startswith('mms'):
        return 'mms'
    elif url.startswith('rtsp'):
        return 'rtsp'

    ext = determine_ext(url)
    if ext == 'm3u8':
        return 'm3u8'
    elif ext == 'f4m':
        return 'f4m'

    return compat_urllib_parse_urlparse(url).scheme
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    table = [header_row] + data
    # Width of each column: the longest cell in that column
    max_lens = [max(len(compat_str(v)) for v in col) for col in zip(*table)]
    # Left-align every column but the last, padding to width + 1
    format_str = ' '.join('%-' + compat_str(ml + 1) + 's' for ml in max_lens[:-1]) + '%s'
    return '\n'.join(format_str % tuple(row) for row in table)
2352 def _match_one(filter_part, dct):
2353 COMPARISON_OPERATORS = {
2361 operator_rex = re.compile(r'''(?x)\s*
2363 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
2365 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
2366 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
2369 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
2370 m = operator_rex.search(filter_part)
2372 op = COMPARISON_OPERATORS[m.group('op')]
2373 actual_value = dct.get(m.group('key'))
2374 if (m.group('strval') is not None or
2375 # If the original field is a string and matching comparisonvalue is
2376 # a number we should respect the origin of the original field
2377 # and process comparison value as a string (see
2378 # https://github.com/rg3/youtube-dl/issues/11082).
2379 actual_value is not None and m.group('intval') is not None and
2380 isinstance(actual_value, compat_str)):
2381 if m.group('op') not in ('=', '!='):
2383 'Operator %s does not support string values!' % m.group('op'))
2384 comparison_value = m.group('strval') or m.group('intval')
2387 comparison_value = int(m.group('intval'))
2389 comparison_value = parse_filesize(m.group('intval'))
2390 if comparison_value is None:
2391 comparison_value = parse_filesize(m.group('intval') + 'B')
2392 if comparison_value is None:
2394 'Invalid integer value %r in filter part %r' % (
2395 m.group('intval'), filter_part))
2396 if actual_value is None:
2397 return m.group('none_inclusive')
2398 return op(actual_value, comparison_value)
2401 '': lambda v: v is not None,
2402 '!': lambda v: v is None,
2404 operator_rex = re.compile(r'''(?x)\s*
2405 (?P<op>%s)\s*(?P<key>[a-z_]+)
2407 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
2408 m = operator_rex.search(filter_part)
2410 op = UNARY_OPERATORS[m.group('op')]
2411 actual_value = dct.get(m.group('key'))
2412 return op(actual_value)
2414 raise ValueError('Invalid filter part %r' % filter_part)
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """

    # All '&'-separated parts must match
    return all(
        _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
def match_filter_func(filter_str):
    """Build a match-filter callable: returns None when the video passes,
    or a human-readable skip message otherwise."""
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        else:
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
def parse_dfxp_time_expr(time_expr):
    """Parse a TTML/DFXP time expression ('12.3s' or 'HH:MM:SS[.f]')
    into seconds, or None."""
    if not time_expr:
        return

    mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if mobj:
        return float(mobj.group('time_offset'))

    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if mobj:
        # ':' before the fraction denotes frames; treat as a decimal part
        return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
def srt_subtitles_timecode(seconds):
    """Format a duration in seconds as an SRT 'HH:MM:SS,mmm' timecode."""
    hours = seconds / 3600
    minutes = (seconds % 3600) / 60
    secs = seconds % 60
    millis = (seconds % 1) * 1000
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
def dfxp2srt(dfxp_data):
    """Convert DFXP/TTML subtitle markup into SRT text."""
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
        'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',
    })

    class TTMLPElementParser(object):
        # Flatten a <p> element to plain text, mapping <br/> to newlines
        out = ''

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
                self.out += '\n'

        def end(self, tag):
            pass

        def data(self, data):
            self.out += data

        def close(self):
            return self.out.strip()

    def parse_node(node):
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall(_x('.//ttaf1_0604:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            if not dur:
                continue
            # Derive the end time from begin + duration
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
def cli_option(params, command_option, param):
    """Build ['--opt', value] for a CLI tool from a params dict entry;
    [] when the parameter is None/absent."""
    param = params.get(param)
    if param:
        param = compat_str(param)
    return [command_option, param] if param is not None else []
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Build a boolean CLI option ('--opt true' or '--opt=true')."""
    param = params.get(param)
    assert isinstance(param, bool)
    if separator:
        return [command_option + separator + (true_value if param else false_value)]
    return [command_option, true_value if param else false_value]
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Emit [command_option] when params[param] equals expected_value, else []."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
def cli_configuration_args(params, param, default=[]):
    """Return the extra-args list stored under `param`, or `default`."""
    ex_args = params.get(param)
    if ex_args is None:
        return default
    assert isinstance(ex_args, list)
    return ex_args
class ISO639Utils(object):
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    # NOTE(review): abridged to common codes here — verify against the
    # full ISO-639-2 table referenced above.
    _lang_map = {
        'ar': 'ara',
        'de': 'deu',
        'en': 'eng',
        'es': 'spa',
        'fr': 'fra',
        'it': 'ita',
        'ja': 'jpn',
        'ko': 'kor',
        'nl': 'nld',
        'pl': 'pol',
        'pt': 'por',
        'ru': 'rus',
        'tr': 'tur',
        'zh': 'zho',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        return cls._lang_map.get(code[:2])

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        for short_name, long_name in cls._lang_map.items():
            if long_name == code:
                return short_name
class ISO3166Utils(object):
    # From http://data.okfn.org/data/core/country-list
    # NOTE(review): abridged relative to the full ISO 3166 list — verify
    # completeness against the source referenced above.
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AS': 'American Samoa',
        'AG': 'Antigua and Barbuda',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BV': 'Bouvet Island',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BF': 'Burkina Faso',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CI': 'Côte d\'Ivoire',
        'CZ': 'Czech Republic',
        'DO': 'Dominican Republic',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GW': 'Guinea-Bissau',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'IR': 'Iran, Islamic Republic of',
        'IM': 'Isle of Man',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'LA': 'Lao People\'s Democratic Republic',
        'LI': 'Liechtenstein',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MH': 'Marshall Islands',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'PS': 'Palestine, State of',
        'PG': 'Papua New Guinea',
        'PH': 'Philippines',
        'PR': 'Puerto Rico',
        'RU': 'Russian Federation',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SL': 'Sierra Leone',
        'SX': 'Sint Maarten (Dutch part)',
        'SB': 'Solomon Islands',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'SJ': 'Svalbard and Jan Mayen',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TZ': 'Tanzania, United Republic of',
        'TL': 'Timor-Leste',
        'TT': 'Trinidad and Tobago',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
    }

    @classmethod
    def short2full(cls, code):
        """Convert an ISO 3166-2 country code to the corresponding full name"""
        return cls._country_map.get(code.upper())
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """Proxy handler that honours a per-request 'Ytdl-request-proxy' header."""

    def __init__(self, proxies=None):
        # Install default http/https openers that all funnel into proxy_open.
        # The lambda's default arguments bind the loop variable early so each
        # installed handler keeps its own scheme and method reference.
        for scheme in ('http', 'https'):
            setattr(self, '%s_open' % scheme,
                    lambda r, proxy='__noproxy__', type=scheme, meth=self.proxy_open:
                        meth(r, proxy, type))
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        # A per-request proxy overrides whatever default was configured.
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # explicit "no proxy" sentinel: fall through unproxied
        if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # youtube-dl's http/https handlers do wrapping the socket with socks
            return None
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
def ohdave_rsa_encrypt(data, exponent, modulus):
    """
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    """
    # The payload is read as a little-endian unsigned integer, hence the
    # byte reversal before hexlifying.
    plaintext_int = int(binascii.hexlify(data[::-1]), 16)
    ciphertext_int = pow(plaintext_int, exponent, modulus)
    return '%x' % ciphertext_int
def encode_base_n(num, n, table=None):
    """Render the non-negative integer `num` in base `n` using `table` as digits.

    When `table` is omitted, digits come from 0-9a-zA-Z (so n may be up to 62).
    Raises ValueError if the base exceeds the digit table, or if num is
    negative (previously this would loop forever, since floor division never
    drives a negative number to zero).
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if table is None:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num < 0:
        # Fail fast instead of hanging in the division loop below.
        raise ValueError('num must be non-negative')

    if num == 0:
        return table[0]

    ret = ''
    while num:
        # Peel off the least-significant digit and prepend it.
        ret = table[num % n] + ret
        num = num // n
    return ret
def decode_packed_codes(code):
    """Unpack JavaScript obfuscated with Dean Edwards' p.a.c.k.e.r."""
    mobj = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = mobj.groups()
    base = int(base)
    count = int(count)
    symbols = symbols.split('|')

    # Build the keyword -> symbol map; an empty symbol means the base-n
    # keyword stands for itself.
    symbol_table = {}
    for idx in range(count - 1, -1, -1):
        keyword = encode_base_n(idx, base)
        symbol_table[keyword] = symbols[idx] or keyword

    # Substitute every word token back with its original symbol.
    return re.sub(
        r'\b(\w+)\b', lambda m: symbol_table[m.group(0)],
        obfuscated_code)
def parse_m3u8_attributes(attrib):
    """Parse an M3U8 attribute list (RFC 8216, section 4.2) into a dict.

    Quoted values have their surrounding double quotes stripped; unquoted
    values are returned verbatim as strings. Returns {} for empty input.
    """
    info = {}
    # A value is either a quoted string -- which may contain commas and may
    # legally be EMPTY (hence "[^"]*", not "+", which silently dropped
    # attributes like URI="") -- or a run of non-comma characters.
    for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]*"|[^",]+)(?:,|$)', attrib):
        if val.startswith('"'):
            val = val[1:-1]
        info[key] = val
    return info
def urshift(val, n):
    """Unsigned 32-bit right shift of `val` by `n` (JavaScript's >>> operator)."""
    if val < 0:
        # Map the negative value onto its 32-bit two's-complement image first.
        val += 0x100000000
    return val >> n
3087 # Based on png2str() written by @gdkchan and improved by @yokrysty
3088 # Originally posted at https://github.com/rg3/youtube-dl/issues/9706
3089 def decode_png(png_data):
# Decode a PNG byte string into (width, height, pixels) where pixels is a
# list of scanline rows of decoded byte values.
# NOTE(review): this view of the file is elided -- several original lines
# (chunk-loop framing, the _get_pixel body, Paeth predictor setup) are
# missing between the numbered lines below; do not assume they are absent
# in the real file.
3090 # Reference: https://www.w3.org/TR/PNG/
3091 header = png_data[8:]
# Validate the fixed 8-byte PNG signature and require IHDR as first chunk.
3093 if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
3094 raise IOError('Not a valid PNG file.')
# Big-endian unsigned-int reader keyed by field width (1/2/4 bytes).
3096 int_map = {1: '>B', 2: '>H', 4: '>I'}
3097 unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]
# Chunk layout per the PNG spec: length(4) + type(4) + data(length) + CRC(4).
3102 length = unpack_integer(header[:4])
3105 chunk_type = header[:4]
3108 chunk_data = header[:length]
3109 header = header[length:]
3111 header = header[4:] # Skip CRC
# IHDR is the first collected chunk; width/height are its first two fields.
3119 ihdr = chunks[0]['data']
3121 width = unpack_integer(ihdr[:4])
3122 height = unpack_integer(ihdr[4:8])
# All IDAT payloads concatenate into one zlib stream.
3126 for chunk in chunks:
3127 if chunk['type'] == b'IDAT':
3128 idat += chunk['data']
# Raised when no IDAT data was collected above.
3131 raise IOError('Unable to read PNG data.')
3133 decompressed_data = bytearray(zlib.decompress(idat))
# Helper to read an already-decoded byte by flat index (body elided here --
# presumably returns 0 for out-of-range indices; confirm against upstream).
3138 def _get_pixel(idx):
# Each scanline is one filter-type byte followed by `stride` filtered bytes.
3143 for y in range(height):
3144 basePos = y * (1 + stride)
3145 filter_type = decompressed_data[basePos]
3149 pixels.append(current_row)
3151 for x in range(stride):
3152 color = decompressed_data[1 + basePos + x]
3153 basex = y * stride + x
# "left" is 3 bytes back, i.e. the same channel of the previous pixel --
# this assumes 3 bytes per pixel (TODO confirm: RGB, no alpha); "up" is
# the same offset one scanline above.
3158 left = _get_pixel(basex - 3)
3160 up = _get_pixel(basex - stride)
# Undo the per-scanline filter (PNG spec, filter types 0-4).
3162 if filter_type == 1: # Sub
3163 color = (color + left) & 0xff
3164 elif filter_type == 2: # Up
3165 color = (color + up) & 0xff
3166 elif filter_type == 3: # Average
3167 color = (color + ((left + up) >> 1)) & 0xff
3168 elif filter_type == 4: # Paeth
3174 c = _get_pixel(basex - stride - 3)
# Paeth predictor: add whichever of left (a), up (b), upper-left (c) lies
# closest to the initial estimate (pa/pb/pc computed in elided lines).
3182 if pa <= pb and pa <= pc:
3183 color = (color + a) & 0xff
3185 color = (color + b) & 0xff
3187 color = (color + c) & 0xff
3189 current_row.append(color)
3191 return width, height, pixels
3194 def write_xattr(path, key, value):
# Set extended attribute `key` = `value` (bytes) on `path`, trying in order:
# the pyxattr/xattr Python modules, NTFS Alternate Data Streams on Windows,
# then the setfattr/xattr command-line tools.
# Raises XAttrMetadataError when a backend fails and XAttrUnavailableError
# when no usable backend exists.
# NOTE(review): this view of the file is elided -- the try/except/import
# framing between the numbered lines below is missing; do not assume it is
# absent in the real file.
3195 # This mess below finds the best xattr tool for the job
3197 # try the pyxattr module...
# Both the pyxattr and xattr PyPI packages import as `xattr`;
# hasattr(xattr, 'set') identifies pyxattr.
3200 if hasattr(xattr, 'set'): # pyxattr
3201 # Unicode arguments are not supported in python-pyxattr until
3203 # See https://github.com/rg3/youtube-dl/issues/5498
3204 pyxattr_required_version = '0.5.0'
3205 if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
3206 # TODO: fallback to CLI tools
3207 raise XAttrUnavailableError(
3208 'python-pyxattr is detected but is too old. '
3209 'youtube-dl requires %s or above while your version is %s. '
3210 'Falling back to other xattr implementations' % (
3211 pyxattr_required_version, xattr.__version__))
3213 setxattr = xattr.set
# (elided else-branch) the xattr package exposes setxattr instead of set.
3215 setxattr = xattr.setxattr
# Translate the OS-level failure into the project's metadata error.
3218 setxattr(path, key, value)
3219 except EnvironmentError as e:
3220 raise XAttrMetadataError(e.errno, e.strerror)
# Fallback path when no xattr module could be imported:
3223 if compat_os_name == 'nt':
3224 # Write xattrs to NTFS Alternate Data Streams:
3225 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
3226 assert ':' not in key
3227 assert os.path.exists(path)
3229 ads_fn = path + ':' + key
3231 with open(ads_fn, 'wb') as f:
3233 except EnvironmentError as e:
3234 raise XAttrMetadataError(e.errno, e.strerror)
# POSIX CLI fallback: setfattr (GNU attr) or the xattr binary.
3236 user_has_setfattr = check_executable('setfattr', ['--version'])
3237 user_has_xattr = check_executable('xattr', ['-h'])
3239 if user_has_setfattr or user_has_xattr:
# The CLI tools take the value as a text argument, so decode the bytes.
3241 value = value.decode('utf-8')
3242 if user_has_setfattr:
3243 executable = 'setfattr'
3244 opts = ['-n', key, '-v', value]
3245 elif user_has_xattr:
3246 executable = 'xattr'
3247 opts = ['-w', key, value]
# Build the argv list with filesystem-encoded program and path names.
3249 cmd = ([encodeFilename(executable, True)] +
3250 [encodeArgument(o) for o in opts] +
3251 [encodeFilename(path, True)])
3254 p = subprocess.Popen(
3255 cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
3256 except EnvironmentError as e:
3257 raise XAttrMetadataError(e.errno, e.strerror)
3258 stdout, stderr = p.communicate()
3259 stderr = stderr.decode('utf-8', 'replace')
# A non-zero exit status from the CLI tool is surfaced with its stderr.
3260 if p.returncode != 0:
3261 raise XAttrMetadataError(p.returncode, stderr)
3264 # On Unix, and can't find pyxattr, setfattr, or xattr.
3265 if sys.platform.startswith('linux'):
3266 raise XAttrUnavailableError(
3267 "Couldn't find a tool to set the xattrs. "
3268 "Install either the python 'pyxattr' or 'xattr' "
3269 "modules, or the GNU 'attr' package "
3270 "(which contains the 'setfattr' tool).")
3272 raise XAttrUnavailableError(
3273 "Couldn't find a tool to set the xattrs. "
3274 "Install either the python 'xattr' module, "
3275 "or the 'xattr' binary.")