4 from __future__ import unicode_literals
33 import xml.etree.ElementTree
40 compat_etree_fromstring,
42 compat_html_entities_html5,
48 compat_socket_create_connection,
54 compat_urllib_parse_urlencode,
55 compat_urllib_parse_urlparse,
56 compat_urllib_parse_unquote_plus,
57 compat_urllib_request,
def register_socks_protocols():
    """Make sure the SOCKS URL schemes are listed in ``uses_netloc``.

    In Python < 2.6.5, urlsplit() suffers from bug
    https://bugs.python.org/issue7904: URLs with protocols not in
    urlparse.uses_netloc are not handled correctly. Schemes that are
    already registered are left alone, so repeated calls add nothing.
    """
    netloc_schemes = compat_urlparse.uses_netloc
    for socks_scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if socks_scheme in netloc_schemes:
            continue
        netloc_schemes.append(socks_scheme)
77 # This is not clearly defined otherwise
78 compiled_regex_type = type(re.compile(''))
81 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/47.0 (Chrome)',
82 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
83 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
84 'Accept-Encoding': 'gzip, deflate',
85 'Accept-Language': 'en-us,en;q=0.5',
90 'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
# English month names in calendar order (index 0 is January).
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March',
    'April', 'May', 'June',
    'July', 'August', 'September',
    'October', 'November', 'December',
]
101 'en': ENGLISH_MONTH_NAMES,
103 'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
104 'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
108 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
109 'flv', 'f4v', 'f4a', 'f4b',
110 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
111 'mkv', 'mka', 'mk3d',
120 'f4f', 'f4m', 'm3u8', 'smil')
# needed for sanitizing filenames in restricted mode
# Maps each accented character to its ASCII replacement. The replacements are
# paired positionally with the characters of the first string; multi-letter
# replacements ('AE', 'OE', 'ss', 'ae', 'oe') are wrapped in lists so that
# chain() yields them as one item instead of splitting them into letters.
ACCENT_CHARS = dict(zip(
    'ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
    itertools.chain(
        'AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))
148 '%Y-%m-%d %H:%M:%S.%f',
151 '%Y-%m-%dT%H:%M:%SZ',
152 '%Y-%m-%dT%H:%M:%S.%fZ',
153 '%Y-%m-%dT%H:%M:%S.%f0Z',
155 '%Y-%m-%dT%H:%M:%S.%f',
158 '%b %d %Y at %H:%M:%S',
161 DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
162 DATE_FORMATS_DAY_FIRST.extend([
171 DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
172 DATE_FORMATS_MONTH_FIRST.extend([
180 PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
183 def preferredencoding():
184 """Get preferred encoding.
186 Returns the best encoding scheme for the system, based on
187 locale.getpreferredencoding() and some further tweaks.
190 pref = locale.getpreferredencoding()
198 def write_json_file(obj, fn):
199 """ Encode obj as JSON and write it to fn, atomically if possible """
201 fn = encodeFilename(fn)
202 if sys.version_info < (3, 0) and sys.platform != 'win32':
203 encoding = get_filesystem_encoding()
204 # os.path.basename returns a bytes object, but NamedTemporaryFile
205 # will fail if the filename contains non ascii characters unless we
206 # use a unicode object
207 path_basename = lambda f: os.path.basename(fn).decode(encoding)
208 # the same for os.path.dirname
209 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
211 path_basename = os.path.basename
212 path_dirname = os.path.dirname
216 'prefix': path_basename(fn) + '.',
217 'dir': path_dirname(fn),
221 # In Python 2.x, json.dump expects a bytestream.
222 # In Python 3.x, it writes to a character stream
223 if sys.version_info < (3, 0):
231 tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
236 if sys.platform == 'win32':
237 # Need to remove existing file on Windows, else os.rename raises
238 # WindowsError or FileExistsError.
243 os.rename(tf.name, fn)
252 if sys.version_info >= (2, 7):
253 def find_xpath_attr(node, xpath, key, val=None):
254 """ Find the xpath xpath[@key=val] """
255 assert re.match(r'^[a-zA-Z_-]+$', key)
256 expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
257 return node.find(expr)
259 def find_xpath_attr(node, xpath, key, val=None):
260 for f in node.findall(compat_xpath(xpath)):
261 if key not in f.attrib:
263 if val is None or f.attrib.get(key) == val:
267 # On python2.6 the xml.etree.ElementTree.Element methods don't support
268 # the namespace parameter
271 def xpath_with_ns(path, ns_map):
272 components = [c.split(':') for c in path.split('/')]
276 replaced.append(c[0])
279 replaced.append('{%s}%s' % (ns_map[ns], tag))
280 return '/'.join(replaced)
283 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
284 def _find_xpath(xpath):
285 return node.find(compat_xpath(xpath))
287 if isinstance(xpath, (str, compat_str)):
288 n = _find_xpath(xpath)
296 if default is not NO_DEFAULT:
299 name = xpath if name is None else name
300 raise ExtractorError('Could not find XML element %s' % name)
306 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
307 n = xpath_element(node, xpath, name, fatal=fatal, default=default)
308 if n is None or n == default:
311 if default is not NO_DEFAULT:
314 name = xpath if name is None else name
315 raise ExtractorError('Could not find XML element\'s text %s' % name)
321 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
322 n = find_xpath_attr(node, xpath, key)
324 if default is not NO_DEFAULT:
327 name = '%s[@%s]' % (xpath, key) if name is None else name
328 raise ExtractorError('Could not find XML attribute %s' % name)
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # An ID lookup is just an attribute lookup on the "id" attribute.
    return get_element_by_attribute(attribute='id', value=id, html=html)
def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    matches = get_elements_by_class(class_name, html)
    if not matches:
        # No tag carries the class (or the lookup returned nothing)
        return None
    return matches[0]
def get_element_by_attribute(attribute, value, html, escape_value=True):
    """Return the first result of get_elements_by_attribute(), or None if there is none"""
    matches = get_elements_by_attribute(attribute, value, html, escape_value)
    if not matches:
        return None
    return matches[0]
def get_elements_by_class(class_name, html):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    # The class may be one of several names inside the attribute value, so
    # match it as a whole word surrounded by any non-quote characters. The
    # pattern is already a regex, hence escape_value=False.
    class_value_re = r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name)
    return get_elements_by_attribute(
        'class', class_value_re, html, escape_value=False)
357 def get_elements_by_attribute(attribute, value, html, escape_value=True):
358 """Return the content of the tag with the specified attribute in the passed HTML document"""
360 value = re.escape(value) if escape_value else value
363 for m in re.finditer(r'''(?xs)
365 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
367 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
371 ''' % (re.escape(attribute), value), html):
372 res = m.group('content')
374 if res.startswith('"') or res.startswith("'"):
377 retlist.append(unescapeHTML(res))
382 class HTMLAttributeParser(compat_HTMLParser):
383 """Trivial HTML parser to gather the attributes for a single element"""
386 compat_HTMLParser.__init__(self)
388 def handle_starttag(self, tag, attrs):
389 self.attrs = dict(attrs)
392 def extract_attributes(html_element):
393 """Given a string for an HTML element such as
395 a="foo" B="bar" c="&98;az" d=boz
396 empty= noval entity="&"
399 Decode and return a dictionary of attributes.
401 'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
402 'empty': '', 'noval': None, 'entity': '&',
403 'sq': '"', 'dq': '\''
405 NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
406 but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
408 parser = HTMLAttributeParser()
409 parser.feed(html_element)
414 def clean_html(html):
415 """Clean an HTML snippet into a readable string"""
417 if html is None: # Convenience for sanitizing descriptions etc.
421 html = html.replace('\n', ' ')
422 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
423 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
425 html = re.sub('<.*?>', '', html)
426 # Replace html entities
427 html = unescapeHTML(html)
431 def sanitize_open(filename, open_mode):
432 """Try to open the given filename, and slightly tweak it if this fails.
434 Attempts to open the given filename. If this fails, it tries to change
435 the filename slightly, step by step, until it's either able to open it
436 or it fails and raises a final exception, like the standard open()
439 It returns the tuple (stream, definitive_file_name).
443 if sys.platform == 'win32':
445 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
446 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
447 stream = open(encodeFilename(filename), open_mode)
448 return (stream, filename)
449 except (IOError, OSError) as err:
450 if err.errno in (errno.EACCES,):
453 # In case of error, try to remove win32 forbidden chars
454 alt_filename = sanitize_path(filename)
455 if alt_filename == filename:
458 # An exception here should be caught in the caller
459 stream = open(encodeFilename(alt_filename), open_mode)
460 return (stream, alt_filename)
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        # parsedate_tz() could not make sense of the string
        return None
    return email.utils.mktime_tz(parsed)
472 def sanitize_filename(s, restricted=False, is_id=False):
473 """Sanitizes a string so it could be used as part of a filename.
474 If restricted is set, use a stricter subset of allowed characters.
475 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
477 def replace_insane(char):
478 if restricted and char in ACCENT_CHARS:
479 return ACCENT_CHARS[char]
480 if char == '?' or ord(char) < 32 or ord(char) == 127:
483 return '' if restricted else '\''
485 return '_-' if restricted else ' -'
486 elif char in '\\/|*<>':
488 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
490 if restricted and ord(char) > 127:
495 s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
496 result = ''.join(map(replace_insane, s))
498 while '__' in result:
499 result = result.replace('__', '_')
500 result = result.strip('_')
501 # Common case of "Foreign band name - English song title"
502 if restricted and result.startswith('-_'):
504 if result.startswith('-'):
505 result = '_' + result[len('-'):]
506 result = result.lstrip('.')
512 def sanitize_path(s):
513 """Sanitizes and normalizes path on Windows"""
514 if sys.platform != 'win32':
516 drive_or_unc, _ = os.path.splitdrive(s)
517 if sys.version_info < (2, 7) and not drive_or_unc:
518 drive_or_unc, _ = os.path.splitunc(s)
519 norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
523 path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
524 for path_part in norm_path]
526 sanitized_path.insert(0, drive_or_unc + os.path.sep)
527 return os.path.join(*sanitized_path)
# Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
# unwanted failures due to missing protocol
def sanitize_url(url):
    """Return url with 'http:' prepended if it starts with '//', else unchanged."""
    if url.startswith('//'):
        return 'http:%s' % url
    return url
def sanitized_Request(url, *args, **kwargs):
    """Build a urllib Request for url after passing it through sanitize_url().

    All further positional and keyword arguments are forwarded to Request.
    """
    safe_url = sanitize_url(url)
    return compat_urllib_request.Request(safe_url, *args, **kwargs)
540 def orderedSet(iterable):
541 """ Remove all duplicates from the input iterable """
549 def _htmlentity_transform(entity_with_semicolon):
550 """Transforms an HTML entity to a character."""
551 entity = entity_with_semicolon[:-1]
553 # Known non-numeric HTML entity
554 if entity in compat_html_entities.name2codepoint:
555 return compat_chr(compat_html_entities.name2codepoint[entity])
557 # TODO: HTML5 allows entities without a semicolon. For example,
558 # '&Eacuteric' should be decoded as 'Éric'.
559 if entity_with_semicolon in compat_html_entities_html5:
560 return compat_html_entities_html5[entity_with_semicolon]
562 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
564 numstr = mobj.group(1)
565 if numstr.startswith('x'):
567 numstr = '0%s' % numstr
570 # See https://github.com/rg3/youtube-dl/issues/7518
572 return compat_chr(int(numstr, base))
576 # Unknown entity in name, return its literal representation
577 return '&%s;' % entity
583 assert type(s) == compat_str
586 r'&([^;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
589 def get_subprocess_encoding():
590 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
591 # For subprocess calls, encode with locale encoding
592 # Refer to http://stackoverflow.com/a/9951851/35070
593 encoding = preferredencoding()
595 encoding = sys.getfilesystemencoding()
601 def encodeFilename(s, for_subprocess=False):
603 @param s The name of the file
606 assert type(s) == compat_str
608 # Python 3 has a Unicode API
609 if sys.version_info >= (3, 0):
612 # Pass '' directly to use Unicode APIs on Windows 2000 and up
613 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
614 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
615 if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
618 # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
619 if sys.platform.startswith('java'):
622 return s.encode(get_subprocess_encoding(), 'ignore')
625 def decodeFilename(b, for_subprocess=False):
627 if sys.version_info >= (3, 0):
630 if not isinstance(b, bytes):
633 return b.decode(get_subprocess_encoding(), 'ignore')
def encodeArgument(s):
    """Encode s with encodeFilename() in its for_subprocess mode.

    Byte strings are still accepted for legacy callers and are decoded as
    ASCII first.
    """
    if isinstance(s, compat_str):
        return encodeFilename(s, True)
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    return encodeFilename(s.decode('ascii'), True)
def decodeArgument(b):
    """Decode b with decodeFilename() in its for_subprocess mode."""
    return decodeFilename(b, for_subprocess=True)
649 def decodeOption(optval):
652 if isinstance(optval, bytes):
653 optval = optval.decode(preferredencoding())
655 assert isinstance(optval, compat_str)
659 def formatSeconds(secs):
661 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
663 return '%d:%02d' % (secs // 60, secs % 60)
668 def make_HTTPS_handler(params, **kwargs):
669 opts_no_check_certificate = params.get('nocheckcertificate', False)
670 if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9
671 context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
672 if opts_no_check_certificate:
673 context.check_hostname = False
674 context.verify_mode = ssl.CERT_NONE
676 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
679 # (create_default_context present but HTTPSHandler has no context=)
682 if sys.version_info < (3, 2):
683 return YoutubeDLHTTPSHandler(params, **kwargs)
685 context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
686 context.verify_mode = (ssl.CERT_NONE
687 if opts_no_check_certificate
688 else ssl.CERT_REQUIRED)
689 context.set_default_verify_paths()
690 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
def bug_reports_message():
    """Return the text appended to error messages asking the user to report a bug.

    The update hint depends on whether ytdl_is_updateable() says this
    installation can update itself.
    """
    update_cmd = (
        'type youtube-dl -U to update' if ytdl_is_updateable()
        else 'see https://yt-dl.org/update on how to update')
    return ''.join((
        '; please report this issue on https://yt-dl.org/bug .',
        ' Make sure you are using the latest version; %s.' % update_cmd,
        ' Be sure to call youtube-dl with the --verbose flag and include its complete output.',
    ))
704 class ExtractorError(Exception):
705 """Error during info extraction."""
707 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
708 """ tb, if given, is the original traceback (so that it can be printed out).
709 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
712 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
714 if video_id is not None:
715 msg = video_id + ': ' + msg
717 msg += ' (caused by %r)' % cause
719 msg += bug_reports_message()
720 super(ExtractorError, self).__init__(msg)
723 self.exc_info = sys.exc_info() # preserve original exception
725 self.video_id = video_id
727 def format_traceback(self):
728 if self.traceback is None:
730 return ''.join(traceback.format_tb(self.traceback))
733 class UnsupportedError(ExtractorError):
734 def __init__(self, url):
735 super(UnsupportedError, self).__init__(
736 'Unsupported URL: %s' % url, expected=True)
740 class RegexNotFoundError(ExtractorError):
741 """Error when a regex didn't match"""
745 class DownloadError(Exception):
746 """Download Error exception.
748 This exception may be thrown by FileDownloader objects if they are not
749 configured to continue on errors. They will contain the appropriate
753 def __init__(self, msg, exc_info=None):
754 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
755 super(DownloadError, self).__init__(msg)
756 self.exc_info = exc_info
759 class SameFileError(Exception):
760 """Same File exception.
762 This exception will be thrown by FileDownloader objects if they detect
763 multiple files would have to be downloaded to the same file on disk.
768 class PostProcessingError(Exception):
769 """Post Processing exception.
771 This exception may be raised by PostProcessor's .run() method to
772 indicate an error in the postprocessing task.
775 def __init__(self, msg):
779 class MaxDownloadsReached(Exception):
780 """ --max-downloads limit has been reached. """
784 class UnavailableVideoError(Exception):
785 """Unavailable Format exception.
787 This exception will be thrown when a video is requested
788 in a format that is not available for that video.
793 class ContentTooShortError(Exception):
794 """Content Too Short exception.
796 This exception may be raised by FileDownloader objects when a file they
797 download is too small for what the server announced first, indicating
798 the connection was probably interrupted.
801 def __init__(self, downloaded, expected):
803 self.downloaded = downloaded
804 self.expected = expected
class XAttrMetadataError(Exception):
    """Error raised for a failed extended-attribute (xattr) metadata operation.

    Attributes:
        code: the errno-style error code given by the caller, or None.
        msg: the error message.
        reason: coarse classification derived from code and msg, one of
            'NO_SPACE', 'VALUE_TOO_LONG' or 'NOT_SUPPORTED'.
    """

    def __init__(self, code=None, msg='Unknown error'):
        super(XAttrMetadataError, self).__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        # The message is checked as well as the code, because a caller may
        # only have the textual error. The substrings must match the OS
        # wording, e.g. "Disk quota exceeded".
        if (self.code in (errno.ENOSPC, errno.EDQUOT) or
                'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'
823 class XAttrUnavailableError(Exception):
827 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
828 # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
829 # expected HTTP responses to meet HTTP/1.0 or later (see also
830 # https://github.com/rg3/youtube-dl/issues/6727)
831 if sys.version_info < (3, 0):
832 kwargs[b'strict'] = True
833 hc = http_class(*args, **kwargs)
834 source_address = ydl_handler._params.get('source_address')
835 if source_address is not None:
836 sa = (source_address, 0)
837 if hasattr(hc, 'source_address'): # Python 2.7+
838 hc.source_address = sa
840 def _hc_connect(self, *args, **kwargs):
841 sock = compat_socket_create_connection(
842 (self.host, self.port), self.timeout, sa)
844 self.sock = ssl.wrap_socket(
845 sock, self.key_file, self.cert_file,
846 ssl_version=ssl.PROTOCOL_TLSv1)
849 hc.connect = functools.partial(_hc_connect, hc)
def handle_youtubedl_headers(headers):
    """Apply the internal "Youtubedl-no-compression" marker to a header mapping.

    Without the marker, headers is returned as is (the very same object).
    With it, a new dict is returned that contains neither the marker nor
    any Accept-Encoding header (matched case-insensitively); the mapping
    passed in is not modified.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers

    without_encoding = dict(
        (name, value) for name, value in headers.items()
        if name.lower() != 'accept-encoding')
    del without_encoding['Youtubedl-no-compression']
    return without_encoding
864 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
865 """Handler for HTTP requests and responses.
867 This class, when installed with an OpenerDirector, automatically adds
868 the standard headers to every HTTP request and handles gzipped and
869 deflated responses from web servers. If compression is to be avoided in
870 a particular request, the original request in the program code only has
871 to include the HTTP header "Youtubedl-no-compression", which will be
872 removed before making the real request.
874 Part of this code was copied from:
876 http://techknack.net/python-urllib2-handlers/
878 Andrew Rowls, the author of that code, agreed to release it to the
882 def __init__(self, params, *args, **kwargs):
883 compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
884 self._params = params
886 def http_open(self, req):
887 conn_class = compat_http_client.HTTPConnection
889 socks_proxy = req.headers.get('Ytdl-socks-proxy')
891 conn_class = make_socks_conn_class(conn_class, socks_proxy)
892 del req.headers['Ytdl-socks-proxy']
894 return self.do_open(functools.partial(
895 _create_http_connection, self, conn_class, False),
901 return zlib.decompress(data, -zlib.MAX_WBITS)
903 return zlib.decompress(data)
906 def addinfourl_wrapper(stream, headers, url, code):
907 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
908 return compat_urllib_request.addinfourl(stream, headers, url, code)
909 ret = compat_urllib_request.addinfourl(stream, headers, url)
913 def http_request(self, req):
914 # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
915 # always respected by websites, some tend to give out URLs with non percent-encoded
916 # non-ASCII characters (see telemb.py, ard.py [#3412])
917 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
918 # To work around aforementioned issue we will replace request's original URL with
919 # percent-encoded one
920 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
921 # the code of this workaround has been moved here from YoutubeDL.urlopen()
922 url = req.get_full_url()
923 url_escaped = escape_url(url)
925 # Substitute URL if any change after escaping
926 if url != url_escaped:
927 req = update_Request(req, url=url_escaped)
929 for h, v in std_headers.items():
930 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
931 # The dict keys are capitalized because of this bug by urllib
932 if h.capitalize() not in req.headers:
935 req.headers = handle_youtubedl_headers(req.headers)
937 if sys.version_info < (2, 7) and '#' in req.get_full_url():
938 # Python 2.6 is brain-dead when it comes to fragments
939 req._Request__original = req._Request__original.partition('#')[0]
940 req._Request__r_type = req._Request__r_type.partition('#')[0]
944 def http_response(self, req, resp):
947 if resp.headers.get('Content-encoding', '') == 'gzip':
948 content = resp.read()
949 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
951 uncompressed = io.BytesIO(gz.read())
952 except IOError as original_ioerror:
953 # There may be junk at the end of the file
954 # See http://stackoverflow.com/q/4928560/35070 for details
955 for i in range(1, 1024):
957 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
958 uncompressed = io.BytesIO(gz.read())
963 raise original_ioerror
964 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
965 resp.msg = old_resp.msg
966 del resp.headers['Content-encoding']
968 if resp.headers.get('Content-encoding', '') == 'deflate':
969 gz = io.BytesIO(self.deflate(resp.read()))
970 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
971 resp.msg = old_resp.msg
972 del resp.headers['Content-encoding']
973 # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
974 # https://github.com/rg3/youtube-dl/issues/6457).
975 if 300 <= resp.code < 400:
976 location = resp.headers.get('Location')
978 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
979 if sys.version_info >= (3, 0):
980 location = location.encode('iso-8859-1').decode('utf-8')
982 location = location.decode('utf-8')
983 location_escaped = escape_url(location)
984 if location != location_escaped:
985 del resp.headers['Location']
986 if sys.version_info < (3, 0):
987 location_escaped = location_escaped.encode('utf-8')
988 resp.headers['Location'] = location_escaped
991 https_request = http_request
992 https_response = http_response
995 def make_socks_conn_class(base_class, socks_proxy):
996 assert issubclass(base_class, (
997 compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))
999 url_components = compat_urlparse.urlparse(socks_proxy)
1000 if url_components.scheme.lower() == 'socks5':
1001 socks_type = ProxyType.SOCKS5
1002 elif url_components.scheme.lower() in ('socks', 'socks4'):
1003 socks_type = ProxyType.SOCKS4
1004 elif url_components.scheme.lower() == 'socks4a':
1005 socks_type = ProxyType.SOCKS4A
1007 def unquote_if_non_empty(s):
1010 return compat_urllib_parse_unquote_plus(s)
1014 url_components.hostname, url_components.port or 1080,
1016 unquote_if_non_empty(url_components.username),
1017 unquote_if_non_empty(url_components.password),
1020 class SocksConnection(base_class):
1022 self.sock = sockssocket()
1023 self.sock.setproxy(*proxy_args)
1024 if type(self.timeout) in (int, float):
1025 self.sock.settimeout(self.timeout)
1026 self.sock.connect((self.host, self.port))
1028 if isinstance(self, compat_http_client.HTTPSConnection):
1029 if hasattr(self, '_context'): # Python > 2.6
1030 self.sock = self._context.wrap_socket(
1031 self.sock, server_hostname=self.host)
1033 self.sock = ssl.wrap_socket(self.sock)
1035 return SocksConnection
1038 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
1039 def __init__(self, params, https_conn_class=None, *args, **kwargs):
1040 compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
1041 self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
1042 self._params = params
1044 def https_open(self, req):
1046 conn_class = self._https_conn_class
1048 if hasattr(self, '_context'): # python > 2.6
1049 kwargs['context'] = self._context
1050 if hasattr(self, '_check_hostname'): # python 3.x
1051 kwargs['check_hostname'] = self._check_hostname
1053 socks_proxy = req.headers.get('Ytdl-socks-proxy')
1055 conn_class = make_socks_conn_class(conn_class, socks_proxy)
1056 del req.headers['Ytdl-socks-proxy']
1058 return self.do_open(functools.partial(
1059 _create_http_connection, self, conn_class, True),
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor that reuses its HTTP request/response hooks for HTTPS."""

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # (The workaround below is currently commented out.)
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        base_processor = compat_urllib_request.HTTPCookieProcessor
        return base_processor.http_response(self, request, response)

    # HTTPS traffic gets exactly the same cookie handling as HTTP
    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
1087 def extract_timezone(date_str):
1089 r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
1092 timezone = datetime.timedelta()
1094 date_str = date_str[:-len(m.group('tz'))]
1095 if not m.group('sign'):
1096 timezone = datetime.timedelta()
1098 sign = 1 if m.group('sign') == '+' else -1
1099 timezone = datetime.timedelta(
1100 hours=sign * int(m.group('hours')),
1101 minutes=sign * int(m.group('minutes')))
1102 return timezone, date_str
1105 def parse_iso8601(date_str, delimiter='T', timezone=None):
1106 """ Return a UNIX timestamp from the given date """
1108 if date_str is None:
1111 date_str = re.sub(r'\.[0-9]+', '', date_str)
1113 if timezone is None:
1114 timezone, date_str = extract_timezone(date_str)
1117 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
1118 dt = datetime.datetime.strptime(date_str, date_format) - timezone
1119 return calendar.timegm(dt.timetuple())
def date_formats(day_first=True):
    """Return DATE_FORMATS_DAY_FIRST if day_first is true, else DATE_FORMATS_MONTH_FIRST."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1128 def unified_strdate(date_str, day_first=True):
1129 """Return a string with the date in the format YYYYMMDD"""
1131 if date_str is None:
1135 date_str = date_str.replace(',', ' ')
1136 # Remove AM/PM + timezone
1137 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1138 _, date_str = extract_timezone(date_str)
1140 for expression in date_formats(day_first):
1142 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
1145 if upload_date is None:
1146 timetuple = email.utils.parsedate_tz(date_str)
1149 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
1152 if upload_date is not None:
1153 return compat_str(upload_date)
1156 def unified_timestamp(date_str, day_first=True):
1157 if date_str is None:
1160 date_str = date_str.replace(',', ' ')
1162 pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
1163 timezone, date_str = extract_timezone(date_str)
1165 # Remove AM/PM + timezone
1166 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1168 for expression in date_formats(day_first):
1170 dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
1171 return calendar.timegm(dt.timetuple())
1174 timetuple = email.utils.parsedate_tz(date_str)
1176 return calendar.timegm(timetuple) + pm_delta * 3600
1179 def determine_ext(url, default_ext='unknown_video'):
1182 guess = url.partition('?')[0].rpartition('.')[2]
1183 if re.match(r'^[A-Za-z0-9]+$', guess):
1185 # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
1186 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
1187 return guess.rstrip('/')
def subtitles_filename(filename, sub_lang, sub_format):
    """Build a subtitle file name: '<filename up to its last dot>.<sub_lang>.<sub_format>'.

    If filename contains no dot, the whole name is kept as the stem.
    """
    stem = filename.rsplit('.', 1)[0]
    return '.'.join((stem, sub_lang, sub_format))
1196 def date_from_str(date_str):
1198 Return a datetime object from a string in the format YYYYMMDD or
1199 (now|today)[+-][0-9](day|week|month|year)(s)?"""
1200 today = datetime.date.today()
1201 if date_str in ('now', 'today'):
1203 if date_str == 'yesterday':
1204 return today - datetime.timedelta(days=1)
1205 match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
1206 if match is not None:
1207 sign = match.group('sign')
1208 time = int(match.group('time'))
1211 unit = match.group('unit')
1212 # A bad approximation?
1216 elif unit == 'year':
1220 delta = datetime.timedelta(**{unit: time})
1221 return today + delta
1222 return datetime.datetime.strptime(date_str, '%Y%m%d').date()
1225 def hyphenate_date(date_str):
1227 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1228 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1229 if match is not None:
1230 return '-'.join(match.groups())
1235 class DateRange(object):
1236 """Represents a time interval between two dates"""
1238 def __init__(self, start=None, end=None):
1239 """start and end must be strings in the format accepted by date"""
1240 if start is not None:
1241 self.start = date_from_str(start)
1243 self.start = datetime.datetime.min.date()
1245 self.end = date_from_str(end)
1247 self.end = datetime.datetime.max.date()
1248 if self.start > self.end:
1249 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
1253 """Returns a range that only contains the given day"""
1254 return cls(day, day)
1256 def __contains__(self, date):
1257 """Check if the date is in the range"""
1258 if not isinstance(date, datetime.date):
1259 date = date_from_str(date)
1260 return self.start <= date <= self.end
1263 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
1266 def platform_name():
1267 """ Returns the platform name as a compat_str """
1268 res = platform.platform()
1269 if isinstance(res, bytes):
1270 res = res.decode(preferredencoding())
1272 assert isinstance(res, compat_str)
1276 def _windows_write_string(s, out):
1277 """ Returns True if the string was written using special methods,
1278 False if it has yet to be written out."""
1279 # Adapted from http://stackoverflow.com/a/3259271/35070
1282 import ctypes.wintypes
1290 fileno = out.fileno()
1291 except AttributeError:
1292 # If the output stream doesn't have a fileno, it's virtual
1294 except io.UnsupportedOperation:
1295 # Some strange Windows pseudo files?
1297 if fileno not in WIN_OUTPUT_IDS:
1300 GetStdHandle = ctypes.WINFUNCTYPE(
1301 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
1302 (b'GetStdHandle', ctypes.windll.kernel32))
1303 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
1305 WriteConsoleW = ctypes.WINFUNCTYPE(
1306 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
1307 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
1308 ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
1309 written = ctypes.wintypes.DWORD(0)
1311 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
1312 FILE_TYPE_CHAR = 0x0002
1313 FILE_TYPE_REMOTE = 0x8000
1314 GetConsoleMode = ctypes.WINFUNCTYPE(
1315 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
1316 ctypes.POINTER(ctypes.wintypes.DWORD))(
1317 (b'GetConsoleMode', ctypes.windll.kernel32))
1318 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
1320 def not_a_console(handle):
1321 if handle == INVALID_HANDLE_VALUE or handle is None:
1323 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
1324 GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
1326 if not_a_console(h):
1329 def next_nonbmp_pos(s):
1331 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
1332 except StopIteration:
1336 count = min(next_nonbmp_pos(s), 1024)
1338 ret = WriteConsoleW(
1339 h, s, count if count else 2, ctypes.byref(written), None)
1341 raise OSError('Failed to write string')
1342 if not count: # We just wrote a non-BMP character
1343 assert written.value == 2
1346 assert written.value > 0
1347 s = s[written.value:]
1351 def write_string(s, out=None, encoding=None):
1354 assert type(s) == compat_str
1356 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
1357 if _windows_write_string(s, out):
1360 if ('b' in getattr(out, 'mode', '') or
1361 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
1362 byt = s.encode(encoding or preferredencoding(), 'ignore')
1364 elif hasattr(out, 'buffer'):
1365 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1366 byt = s.encode(enc, 'ignore')
1367 out.buffer.write(byt)
1373 def bytes_to_intlist(bs):
1376 if isinstance(bs[0], int): # Python 3
1379 return [ord(c) for c in bs]
1382 def intlist_to_bytes(xs):
1385 return compat_struct_pack('%dB' % len(xs), *xs)
1388 # Cross-platform file locking
1389 if sys.platform == 'win32':
1390 import ctypes.wintypes
1393 class OVERLAPPED(ctypes.Structure):
1395 ('Internal', ctypes.wintypes.LPVOID),
1396 ('InternalHigh', ctypes.wintypes.LPVOID),
1397 ('Offset', ctypes.wintypes.DWORD),
1398 ('OffsetHigh', ctypes.wintypes.DWORD),
1399 ('hEvent', ctypes.wintypes.HANDLE),
1402 kernel32 = ctypes.windll.kernel32
1403 LockFileEx = kernel32.LockFileEx
1404 LockFileEx.argtypes = [
1405 ctypes.wintypes.HANDLE, # hFile
1406 ctypes.wintypes.DWORD, # dwFlags
1407 ctypes.wintypes.DWORD, # dwReserved
1408 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1409 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1410 ctypes.POINTER(OVERLAPPED) # Overlapped
1412 LockFileEx.restype = ctypes.wintypes.BOOL
1413 UnlockFileEx = kernel32.UnlockFileEx
1414 UnlockFileEx.argtypes = [
1415 ctypes.wintypes.HANDLE, # hFile
1416 ctypes.wintypes.DWORD, # dwReserved
1417 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1418 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1419 ctypes.POINTER(OVERLAPPED) # Overlapped
1421 UnlockFileEx.restype = ctypes.wintypes.BOOL
1422 whole_low = 0xffffffff
1423 whole_high = 0x7fffffff
1425 def _lock_file(f, exclusive):
1426 overlapped = OVERLAPPED()
1427 overlapped.Offset = 0
1428 overlapped.OffsetHigh = 0
1429 overlapped.hEvent = 0
1430 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1431 handle = msvcrt.get_osfhandle(f.fileno())
1432 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
1433 whole_low, whole_high, f._lock_file_overlapped_p):
1434 raise OSError('Locking file failed: %r' % ctypes.FormatError())
1436 def _unlock_file(f):
1437 assert f._lock_file_overlapped_p
1438 handle = msvcrt.get_osfhandle(f.fileno())
1439 if not UnlockFileEx(handle, 0,
1440 whole_low, whole_high, f._lock_file_overlapped_p):
1441 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1444 # Some platforms, such as Jython, are missing fcntl
1448 def _lock_file(f, exclusive):
1449 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1451 def _unlock_file(f):
1452 fcntl.flock(f, fcntl.LOCK_UN)
1454 UNSUPPORTED_MSG = 'file locking is not supported on this platform'
1456 def _lock_file(f, exclusive):
1457 raise IOError(UNSUPPORTED_MSG)
1459 def _unlock_file(f):
1460 raise IOError(UNSUPPORTED_MSG)
1463 class locked_file(object):
1464 def __init__(self, filename, mode, encoding=None):
1465 assert mode in ['r', 'a', 'w']
1466 self.f = io.open(filename, mode, encoding=encoding)
1469 def __enter__(self):
1470 exclusive = self.mode != 'r'
1472 _lock_file(self.f, exclusive)
1478 def __exit__(self, etype, value, traceback):
1480 _unlock_file(self.f)
1487 def write(self, *args):
1488 return self.f.write(*args)
1490 def read(self, *args):
1491 return self.f.read(*args)
def get_filesystem_encoding():
    """Return the file system encoding, falling back to 'utf-8'.

    The fallback is used when ``sys.getfilesystemencoding()`` reports None.
    """
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1499 def shell_quote(args):
1501 encoding = get_filesystem_encoding()
1503 if isinstance(a, bytes):
1504 # We may get a filename encoded with 'encodeFilename'
1505 a = a.decode(encoding)
1506 quoted_args.append(pipes.quote(a))
1507 return ' '.join(quoted_args)
1510 def smuggle_url(url, data):
1511 """ Pass additional data in a URL for internal use. """
1513 url, idata = unsmuggle_url(url, {})
1515 sdata = compat_urllib_parse_urlencode(
1516 {'__youtubedl_smuggle': json.dumps(data)})
1517 return url + '#' + sdata
1520 def unsmuggle_url(smug_url, default=None):
1521 if '#__youtubedl_smuggle' not in smug_url:
1522 return smug_url, default
1523 url, _, sdata = smug_url.rpartition('#')
1524 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
1525 data = json.loads(jsond)
1529 def format_bytes(bytes):
1532 if type(bytes) is str:
1533 bytes = float(bytes)
1537 exponent = int(math.log(bytes, 1024.0))
1538 suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
1539 converted = float(bytes) / float(1024 ** exponent)
1540 return '%.2f%s' % (converted, suffix)
1543 def lookup_unit_table(unit_table, s):
1544 units_re = '|'.join(re.escape(u) for u in unit_table)
1546 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
1549 num_str = m.group('num').replace(',', '.')
1550 mult = unit_table[m.group('unit')]
1551 return int(float(num_str) * mult)
1554 def parse_filesize(s):
1558 # The lower-case forms are of course incorrect and unofficial,
1559 # but we support those too
1576 'megabytes': 1000 ** 2,
1577 'mebibytes': 1024 ** 2,
1583 'gigabytes': 1000 ** 3,
1584 'gibibytes': 1024 ** 3,
1590 'terabytes': 1000 ** 4,
1591 'tebibytes': 1024 ** 4,
1597 'petabytes': 1000 ** 5,
1598 'pebibytes': 1024 ** 5,
1604 'exabytes': 1000 ** 6,
1605 'exbibytes': 1024 ** 6,
1611 'zettabytes': 1000 ** 7,
1612 'zebibytes': 1024 ** 7,
1618 'yottabytes': 1000 ** 8,
1619 'yobibytes': 1024 ** 8,
1622 return lookup_unit_table(_UNIT_TABLE, s)
1631 if re.match(r'^[\d,.]+$', s):
1632 return str_to_int(s)
1643 return lookup_unit_table(_UNIT_TABLE, s)
1646 def month_by_name(name, lang='en'):
1647 """ Return the number of a month by (locale-independently) English name """
1649 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
1652 return month_names.index(name) + 1
1657 def month_by_abbreviation(abbrev):
1658 """ Return the number of a month by (locale-independently) English
1662 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
1667 def fix_xml_ampersands(xml_str):
1668 """Replace all the '&' by '&' in XML"""
1670 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1675 def setproctitle(title):
1676 assert isinstance(title, compat_str)
1678 # ctypes in Jython is not complete
1679 # http://bugs.jython.org/issue2148
1680 if sys.platform.startswith('java'):
1684 libc = ctypes.cdll.LoadLibrary('libc.so.6')
1688 # LoadLibrary in Windows Python 2.7.13 only expects
1689 # a bytestring, but since unicode_literals turns
1690 # every string into a unicode string, it fails.
1692 title_bytes = title.encode('utf-8')
1693 buf = ctypes.create_string_buffer(len(title_bytes))
1694 buf.value = title_bytes
1696 libc.prctl(15, buf, 0, 0, 0)
1697 except AttributeError:
1698 return # Strange libc, just skip this
def remove_start(s, start):
    """Return ``s`` without the leading ``start``.

    ``s`` is returned unchanged when it is None or does not begin with
    ``start``.
    """
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
def remove_end(s, end):
    """Return ``s`` without the trailing ``end``.

    ``s`` is returned unchanged when it is None, does not finish with
    ``end``, or when ``end`` is empty.
    """
    if s is None or not s.endswith(end):
        return s
    # An empty suffix must be special-cased: s[:-0] is s[:0], i.e. the
    # empty string, which would silently discard the whole value.
    return s[:-len(end)] if end else s
1709 def remove_quotes(s):
1710 if s is None or len(s) < 2:
1712 for quote in ('"', "'", ):
1713 if s[0] == quote and s[-1] == quote:
def url_basename(url):
    """Return the last segment of the path component of ``url``.

    Leading and trailing slashes of the path are ignored before splitting.
    """
    url_path = compat_urlparse.urlparse(url).path
    segments = url_path.strip('/').split('/')
    return segments[-1]
1724 return re.match(r'https?://[^?#&]+/', url).group()
1727 def urljoin(base, path):
1728 if not isinstance(path, compat_str) or not path:
1730 if re.match(r'^(?:https?:)?//', path):
1732 if not isinstance(base, compat_str) or not re.match(r'^(?:https?:)?//', base):
1734 return compat_urlparse.urljoin(base, path)
1737 class HEADRequest(compat_urllib_request.Request):
1738 def get_method(self):
1742 class PUTRequest(compat_urllib_request.Request):
1743 def get_method(self):
1747 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
1750 v = getattr(v, get_attr, None)
1756 return int(v) * invscale // scale
def str_or_none(v, default=None):
    """Convert ``v`` to ``compat_str``, or return ``default`` if it is None."""
    if v is None:
        return default
    return compat_str(v)
1765 def str_to_int(int_str):
1766 """ A more relaxed version of int_or_none """
1769 int_str = re.sub(r'[,\.\+]', '', int_str)
1773 def float_or_none(v, scale=1, invscale=1, default=None):
1777 return float(v) * invscale / scale
def strip_or_none(v):
    """Return ``v.strip()``, passing None through untouched."""
    if v is None:
        return None
    return v.strip()
1786 def parse_duration(s):
1787 if not isinstance(s, compat_basestring):
1792 days, hours, mins, secs, ms = [None] * 5
1793 m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', s)
1795 days, hours, mins, secs, ms = m.groups()
1800 (?P<days>[0-9]+)\s*d(?:ays?)?\s*
1803 (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
1806 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
1809 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
1812 days, hours, mins, secs, ms = m.groups()
1814 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
1816 hours, mins = m.groups()
1822 duration += float(secs)
1824 duration += float(mins) * 60
1826 duration += float(hours) * 60 * 60
1828 duration += float(days) * 24 * 60 * 60
1830 duration += float(ms)
1834 def prepend_extension(filename, ext, expected_real_ext=None):
1835 name, real_ext = os.path.splitext(filename)
1837 '{0}.{1}{2}'.format(name, ext, real_ext)
1838 if not expected_real_ext or real_ext[1:] == expected_real_ext
1839 else '{0}.{1}'.format(filename, ext))
1842 def replace_extension(filename, ext, expected_real_ext=None):
1843 name, real_ext = os.path.splitext(filename)
1844 return '{0}.{1}'.format(
1845 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
1849 def check_executable(exe, args=[]):
1850 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1851 args can be a list of arguments for a short output (like -version) """
1853 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1859 def get_exe_version(exe, args=['--version'],
1860 version_re=None, unrecognized='present'):
1861 """ Returns the version of the specified executable,
1862 or False if the executable is not present """
1864 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
1865 # SIGTTOU if youtube-dl is run in the background.
1866 # See https://github.com/rg3/youtube-dl/issues/955#issuecomment-209789656
1867 out, _ = subprocess.Popen(
1868 [encodeArgument(exe)] + args,
1869 stdin=subprocess.PIPE,
1870 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
1873 if isinstance(out, bytes): # Python 2.x
1874 out = out.decode('ascii', 'ignore')
1875 return detect_exe_version(out, version_re, unrecognized)
1878 def detect_exe_version(output, version_re=None, unrecognized='present'):
1879 assert isinstance(output, compat_str)
1880 if version_re is None:
1881 version_re = r'version\s+([-0-9._a-zA-Z]+)'
1882 m = re.search(version_re, output)
1889 class PagedList(object):
1891 # This is only useful for tests
1892 return len(self.getslice())
1895 class OnDemandPagedList(PagedList):
1896 def __init__(self, pagefunc, pagesize, use_cache=False):
1897 self._pagefunc = pagefunc
1898 self._pagesize = pagesize
1899 self._use_cache = use_cache
1903 def getslice(self, start=0, end=None):
1905 for pagenum in itertools.count(start // self._pagesize):
1906 firstid = pagenum * self._pagesize
1907 nextfirstid = pagenum * self._pagesize + self._pagesize
1908 if start >= nextfirstid:
1913 page_results = self._cache.get(pagenum)
1914 if page_results is None:
1915 page_results = list(self._pagefunc(pagenum))
1917 self._cache[pagenum] = page_results
1920 start % self._pagesize
1921 if firstid <= start < nextfirstid
1925 ((end - 1) % self._pagesize) + 1
1926 if (end is not None and firstid <= end <= nextfirstid)
1929 if startv != 0 or endv is not None:
1930 page_results = page_results[startv:endv]
1931 res.extend(page_results)
1933 # A little optimization - if current page is not "full", ie. does
1934 # not contain page_size videos then we can assume that this page
1935 # is the last one - there are no more ids on further pages -
1936 # i.e. no need to query again.
1937 if len(page_results) + startv < self._pagesize:
1940 # If we got the whole page, but the next page is not interesting,
1941 # break out early as well
1942 if end == nextfirstid:
1947 class InAdvancePagedList(PagedList):
1948 def __init__(self, pagefunc, pagecount, pagesize):
1949 self._pagefunc = pagefunc
1950 self._pagecount = pagecount
1951 self._pagesize = pagesize
1953 def getslice(self, start=0, end=None):
1955 start_page = start // self._pagesize
1957 self._pagecount if end is None else (end // self._pagesize + 1))
1958 skip_elems = start - start_page * self._pagesize
1959 only_more = None if end is None else end - start
1960 for pagenum in range(start_page, end_page):
1961 page = list(self._pagefunc(pagenum))
1963 page = page[skip_elems:]
1965 if only_more is not None:
1966 if len(page) < only_more:
1967 only_more -= len(page)
1969 page = page[:only_more]
1976 def uppercase_escape(s):
1977 unicode_escape = codecs.getdecoder('unicode_escape')
1979 r'\\U[0-9a-fA-F]{8}',
1980 lambda m: unicode_escape(m.group(0))[0],
1984 def lowercase_escape(s):
1985 unicode_escape = codecs.getdecoder('unicode_escape')
1987 r'\\u[0-9a-fA-F]{4}',
1988 lambda m: unicode_escape(m.group(0))[0],
def escape_rfc3986(s):
    """Percent-encode non-ASCII characters as suggested by RFC 3986.

    Characters in the safe set below (reserved/delimiter characters and
    '%') are left untouched.
    """
    running_py2 = sys.version_info < (3, 0)
    if running_py2 and isinstance(s, compat_str):
        # On Python 2 the text is handed to quote() as UTF-8 bytes.
        s = s.encode('utf-8')
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe_chars)
1999 def escape_url(url):
2000 """Escape URL as suggested by RFC 3986"""
2001 url_parsed = compat_urllib_parse_urlparse(url)
2002 return url_parsed._replace(
2003 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
2004 path=escape_rfc3986(url_parsed.path),
2005 params=escape_rfc3986(url_parsed.params),
2006 query=escape_rfc3986(url_parsed.query),
2007 fragment=escape_rfc3986(url_parsed.fragment)
2011 def read_batch_urls(batch_fd):
2013 if not isinstance(url, compat_str):
2014 url = url.decode('utf-8', 'replace')
2015 BOM_UTF8 = '\xef\xbb\xbf'
2016 if url.startswith(BOM_UTF8):
2017 url = url[len(BOM_UTF8):]
2019 if url.startswith(('#', ';', ']')):
2023 with contextlib.closing(batch_fd) as fd:
2024 return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode the arguments and return the result as ASCII bytes.

    All arguments are forwarded to ``compat_urllib_parse_urlencode``.
    """
    encoded_query = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded_query.encode('ascii')
2031 def update_url_query(url, query):
2034 parsed_url = compat_urlparse.urlparse(url)
2035 qs = compat_parse_qs(parsed_url.query)
2037 return compat_urlparse.urlunparse(parsed_url._replace(
2038 query=compat_urllib_parse_urlencode(qs, True)))
2041 def update_Request(req, url=None, data=None, headers={}, query={}):
2042 req_headers = req.headers.copy()
2043 req_headers.update(headers)
2044 req_data = data or req.data
2045 req_url = update_url_query(url or req.get_full_url(), query)
2046 req_get_method = req.get_method()
2047 if req_get_method == 'HEAD':
2048 req_type = HEADRequest
2049 elif req_get_method == 'PUT':
2050 req_type = PUTRequest
2052 req_type = compat_urllib_request.Request
2054 req_url, data=req_data, headers=req_headers,
2055 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
2056 if hasattr(req, 'timeout'):
2057 new_req.timeout = req.timeout
2061 def dict_get(d, key_or_keys, default=None, skip_false_values=True):
2062 if isinstance(key_or_keys, (list, tuple)):
2063 for key in key_or_keys:
2064 if key not in d or d[key] is None or skip_false_values and not d[key]:
2068 return d.get(key_or_keys, default)
2071 def try_get(src, getter, expected_type=None):
2074 except (AttributeError, KeyError, TypeError, IndexError):
2077 if expected_type is None or isinstance(v, expected_type):
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return ``string`` as ``compat_str``.

    Values that already are ``compat_str`` are returned as-is; anything
    else is converted via ``compat_str(string, encoding, errors)``.
    """
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
2094 TV_PARENTAL_GUIDELINES = {
2104 def parse_age_limit(s):
2106 return s if 0 <= s <= 21 else None
2107 if not isinstance(s, compat_basestring):
2109 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
2111 return int(m.group('age'))
2113 return US_RATINGS[s]
2114 return TV_PARENTAL_GUIDELINES.get(s)
2117 def strip_jsonp(code):
2119 r'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
2122 def js_to_json(code):
2123 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*'
2124 SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
2126 (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
2127 (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
2132 if v in ('true', 'false', 'null'):
2134 elif v.startswith('/*') or v.startswith('//') or v == ',':
2137 if v[0] in ("'", '"'):
2138 v = re.sub(r'(?s)\\.|"', lambda m: {
2143 }.get(m.group(0), m.group(0)), v[1:-1])
2145 for regex, base in INTEGER_TABLE:
2146 im = re.match(regex, v)
2148 i = int(im.group(1), base)
2149 return '"%d":' % i if v.endswith(':') else '%d' % i
2153 return re.sub(r'''(?sx)
2154 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
2155 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
2156 {comment}|,(?={skip}[\]}}])|
2157 [a-zA-Z_][.a-zA-Z_0-9]*|
2158 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
2160 '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
2163 def qualities(quality_ids):
2164 """ Get a numeric quality value out of a list of possible values """
2167 return quality_ids.index(qid)
2173 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
2176 def limit_length(s, length):
2177 """ Add ellipses to overly long strings """
2182 return s[:length - len(ELLIPSES)] + ELLIPSES
def version_tuple(v):
    """Split a version string on '-' and '.' into a tuple of ints."""
    components = re.split(r'[-.]', v)
    return tuple(map(int, components))
2190 def is_outdated_version(version, limit, assume_new=True):
2192 return not assume_new
2194 return version_tuple(version) < version_tuple(limit)
2196 return not assume_new
def ytdl_is_updateable():
    """ Tell whether youtube-dl can be updated with -U.

    That is the case when this module was loaded by a zipimporter or when
    ``sys`` carries a ``frozen`` attribute.
    """
    from zipimport import zipimporter

    module_loader = globals().get('__loader__')
    if isinstance(module_loader, zipimporter):
        return True
    return hasattr(sys, 'frozen')
def args_to_str(args):
    """Return a short, shell-quoted string for a subprocess command."""
    quoted_args = []
    for arg in args:
        quoted_args.append(compat_shlex_quote(arg))
    return ' '.join(quoted_args)
2211 def error_to_compat_str(err):
2213 # On python 2 error byte string must be decoded with proper
2214 # encoding rather than ascii
2215 if sys.version_info[0] < 3:
2216 err_str = err_str.decode(preferredencoding())
2220 def mimetype2ext(mt):
2226 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
2227 # it's the most popular one
2228 'audio/mpeg': 'mp3',
2233 _, _, res = mt.rpartition('/')
2234 res = res.split(';')[0].strip().lower()
2238 'smptett+xml': 'tt',
2244 'x-mp4-fragmented': 'mp4',
2247 'x-mpegurl': 'm3u8',
2248 'vnd.apple.mpegurl': 'm3u8',
2253 'vnd.ms-sstr+xml': 'ism',
2258 def parse_codecs(codecs_str):
2259 # http://tools.ietf.org/html/rfc6381
2262 splited_codecs = list(filter(None, map(
2263 lambda str: str.strip(), codecs_str.strip().strip(',').split(','))))
2264 vcodec, acodec = None, None
2265 for full_codec in splited_codecs:
2266 codec = full_codec.split('.')[0]
2267 if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v'):
2270 elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3'):
2274 write_string('WARNING: Unknown codec %s' % full_codec, sys.stderr)
2275 if not vcodec and not acodec:
2276 if len(splited_codecs) == 2:
2281 elif len(splited_codecs) == 1:
2288 'vcodec': vcodec or 'none',
2289 'acodec': acodec or 'none',
2294 def urlhandle_detect_ext(url_handle):
2295 getheader = url_handle.headers.get
2297 cd = getheader('Content-Disposition')
2299 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
2301 e = determine_ext(m.group('filename'), default_ext=None)
2305 return mimetype2ext(getheader('Content-Type'))
def encode_data_uri(data, mime_type):
    """Return a base64 ``data:`` URI carrying ``data`` with ``mime_type``."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
2312 def age_restricted(content_limit, age_limit):
2313 """ Returns True iff the content should be blocked """
2315 if age_limit is None: # No limit set
2317 if content_limit is None:
2318 return False # Content available for everyone
2319 return age_limit < content_limit
2322 def is_html(first_bytes):
2323 """ Detect whether a file contains HTML by examining its first bytes. """
2326 (b'\xef\xbb\xbf', 'utf-8'),
2327 (b'\x00\x00\xfe\xff', 'utf-32-be'),
2328 (b'\xff\xfe\x00\x00', 'utf-32-le'),
2329 (b'\xff\xfe', 'utf-16-le'),
2330 (b'\xfe\xff', 'utf-16-be'),
2332 for bom, enc in BOMS:
2333 if first_bytes.startswith(bom):
2334 s = first_bytes[len(bom):].decode(enc, 'replace')
2337 s = first_bytes.decode('utf-8', 'replace')
2339 return re.match(r'^\s*<', s)
2342 def determine_protocol(info_dict):
2343 protocol = info_dict.get('protocol')
2344 if protocol is not None:
2347 url = info_dict['url']
2348 if url.startswith('rtmp'):
2350 elif url.startswith('mms'):
2352 elif url.startswith('rtsp'):
2355 ext = determine_ext(url)
2361 return compat_urllib_parse_urlparse(url).scheme
def render_table(header_row, data):
    """ Render a header row plus a list of rows as left-aligned text columns.

    Each column is padded to the width of its longest cell plus one; the
    last column is not padded.
    """
    rows = [header_row] + data
    widths = []
    for column in zip(*rows):
        widths.append(max(len(compat_str(cell)) for cell in column))
    # Every column except the last gets a fixed-width, left-aligned spec.
    column_specs = ['%-' + compat_str(width + 1) + 's' for width in widths[:-1]]
    row_format = ' '.join(column_specs) + '%s'
    rendered_rows = [row_format % tuple(row) for row in rows]
    return '\n'.join(rendered_rows)
2372 def _match_one(filter_part, dct):
2373 COMPARISON_OPERATORS = {
2381 operator_rex = re.compile(r'''(?x)\s*
2383 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
2385 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
2386 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
2389 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
2390 m = operator_rex.search(filter_part)
2392 op = COMPARISON_OPERATORS[m.group('op')]
2393 actual_value = dct.get(m.group('key'))
2394 if (m.group('strval') is not None or
2395 # If the original field is a string and matching comparisonvalue is
2396 # a number we should respect the origin of the original field
2397 # and process comparison value as a string (see
2398 # https://github.com/rg3/youtube-dl/issues/11082).
2399 actual_value is not None and m.group('intval') is not None and
2400 isinstance(actual_value, compat_str)):
2401 if m.group('op') not in ('=', '!='):
2403 'Operator %s does not support string values!' % m.group('op'))
2404 comparison_value = m.group('strval') or m.group('intval')
2407 comparison_value = int(m.group('intval'))
2409 comparison_value = parse_filesize(m.group('intval'))
2410 if comparison_value is None:
2411 comparison_value = parse_filesize(m.group('intval') + 'B')
2412 if comparison_value is None:
2414 'Invalid integer value %r in filter part %r' % (
2415 m.group('intval'), filter_part))
2416 if actual_value is None:
2417 return m.group('none_inclusive')
2418 return op(actual_value, comparison_value)
2421 '': lambda v: v is not None,
2422 '!': lambda v: v is None,
2424 operator_rex = re.compile(r'''(?x)\s*
2425 (?P<op>%s)\s*(?P<key>[a-z_]+)
2427 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
2428 m = operator_rex.search(filter_part)
2430 op = UNARY_OPERATORS[m.group('op')]
2431 actual_value = dct.get(m.group('key'))
2432 return op(actual_value)
2434 raise ValueError('Invalid filter part %r' % filter_part)
2437 def match_str(filter_str, dct):
2438 """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
2441 _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
2444 def match_filter_func(filter_str):
2445 def _match_func(info_dict):
2446 if match_str(filter_str, info_dict):
2449 video_title = info_dict.get('title', info_dict.get('id', 'video'))
2450 return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
2454 def parse_dfxp_time_expr(time_expr):
2458 mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
2460 return float(mobj.group('time_offset'))
2462 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
2464 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
def srt_subtitles_timecode(seconds):
    """Format a number of seconds as an SRT timecode ``HH:MM:SS,mmm``.

    The value is first rounded to whole milliseconds.  Formatting the
    fractional part directly truncates binary floating point noise (for
    1.001, ``(seconds % 1) * 1000`` is 0.9999..., which prints as ``000``),
    so the split into fields is done on an integer millisecond count.
    """
    total_millis = int(round(seconds * 1000))
    total_secs, millis = divmod(total_millis, 1000)
    total_mins, secs = divmod(total_secs, 60)
    hours, mins = divmod(total_mins, 60)
    return '%02d:%02d:%02d,%03d' % (hours, mins, secs, millis)
2471 def dfxp2srt(dfxp_data):
2472 _x = functools.partial(xpath_with_ns, ns_map={
2473 'ttml': 'http://www.w3.org/ns/ttml',
2474 'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
2475 'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',
2478 class TTMLPElementParser(object):
2481 def start(self, tag, attrib):
2482 if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
2488 def data(self, data):
2492 return self.out.strip()
2494 def parse_node(node):
2495 target = TTMLPElementParser()
2496 parser = xml.etree.ElementTree.XMLParser(target=target)
2497 parser.feed(xml.etree.ElementTree.tostring(node))
2498 return parser.close()
2500 dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
2502 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall(_x('.//ttaf1_0604:p')) or dfxp.findall('.//p')
2505 raise ValueError('Invalid dfxp/TTML subtitle')
2507 for para, index in zip(paras, itertools.count(1)):
2508 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
2509 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
2510 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
2511 if begin_time is None:
2516 end_time = begin_time + dur
2517 out.append('%d\n%s --> %s\n%s\n\n' % (
2519 srt_subtitles_timecode(begin_time),
2520 srt_subtitles_timecode(end_time),
2526 def cli_option(params, command_option, param):
2527 param = params.get(param)
2529 param = compat_str(param)
2530 return [command_option, param] if param is not None else []
2533 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
2534 param = params.get(param)
2535 assert isinstance(param, bool)
2537 return [command_option + separator + (true_value if param else false_value)]
2538 return [command_option, true_value if param else false_value]
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return ``[command_option]`` if ``params[param]`` equals ``expected_value``.

    Otherwise (including a missing key that does not compare equal to
    ``expected_value``) an empty list is returned.
    """
    if params.get(param) == expected_value:
        return [command_option]
    return []
2546 def cli_configuration_args(params, param, default=[]):
2547 ex_args = params.get(param)
2550 assert isinstance(ex_args, list)
2554 class ISO639Utils(object):
2555 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
2744 def short2long(cls, code):
2745 """Convert language code from ISO 639-1 to ISO 639-2/T"""
2746 return cls._lang_map.get(code[:2])
2749 def long2short(cls, code):
2750 """Convert language code from ISO 639-2/T to ISO 639-1"""
2751 for short_name, long_name in cls._lang_map.items():
2752 if long_name == code:
2756 class ISO3166Utils(object):
2757 # From http://data.okfn.org/data/core/country-list
2759 'AF': 'Afghanistan',
2760 'AX': 'Åland Islands',
2763 'AS': 'American Samoa',
2768 'AG': 'Antigua and Barbuda',
2785 'BO': 'Bolivia, Plurinational State of',
2786 'BQ': 'Bonaire, Sint Eustatius and Saba',
2787 'BA': 'Bosnia and Herzegovina',
2789 'BV': 'Bouvet Island',
2791 'IO': 'British Indian Ocean Territory',
2792 'BN': 'Brunei Darussalam',
2794 'BF': 'Burkina Faso',
2800 'KY': 'Cayman Islands',
2801 'CF': 'Central African Republic',
2805 'CX': 'Christmas Island',
2806 'CC': 'Cocos (Keeling) Islands',
2810 'CD': 'Congo, the Democratic Republic of the',
2811 'CK': 'Cook Islands',
2813 'CI': 'Côte d\'Ivoire',
2818 'CZ': 'Czech Republic',
2822 'DO': 'Dominican Republic',
2825 'SV': 'El Salvador',
2826 'GQ': 'Equatorial Guinea',
2830 'FK': 'Falkland Islands (Malvinas)',
2831 'FO': 'Faroe Islands',
2835 'GF': 'French Guiana',
2836 'PF': 'French Polynesia',
2837 'TF': 'French Southern Territories',
2852 'GW': 'Guinea-Bissau',
2855 'HM': 'Heard Island and McDonald Islands',
2856 'VA': 'Holy See (Vatican City State)',
2863 'IR': 'Iran, Islamic Republic of',
2866 'IM': 'Isle of Man',
2876 'KP': 'Korea, Democratic People\'s Republic of',
2877 'KR': 'Korea, Republic of',
2880 'LA': 'Lao People\'s Democratic Republic',
2886 'LI': 'Liechtenstein',
2890 'MK': 'Macedonia, the Former Yugoslav Republic of',
2897 'MH': 'Marshall Islands',
2903 'FM': 'Micronesia, Federated States of',
2904 'MD': 'Moldova, Republic of',
2915 'NL': 'Netherlands',
2916 'NC': 'New Caledonia',
2917 'NZ': 'New Zealand',
2922 'NF': 'Norfolk Island',
2923 'MP': 'Northern Mariana Islands',
2928 'PS': 'Palestine, State of',
2930 'PG': 'Papua New Guinea',
2933 'PH': 'Philippines',
2937 'PR': 'Puerto Rico',
2941 'RU': 'Russian Federation',
2943 'BL': 'Saint Barthélemy',
2944 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
2945 'KN': 'Saint Kitts and Nevis',
2946 'LC': 'Saint Lucia',
2947 'MF': 'Saint Martin (French part)',
2948 'PM': 'Saint Pierre and Miquelon',
2949 'VC': 'Saint Vincent and the Grenadines',
2952 'ST': 'Sao Tome and Principe',
2953 'SA': 'Saudi Arabia',
2957 'SL': 'Sierra Leone',
2959 'SX': 'Sint Maarten (Dutch part)',
2962 'SB': 'Solomon Islands',
2964 'ZA': 'South Africa',
2965 'GS': 'South Georgia and the South Sandwich Islands',
2966 'SS': 'South Sudan',
2971 'SJ': 'Svalbard and Jan Mayen',
2974 'CH': 'Switzerland',
2975 'SY': 'Syrian Arab Republic',
2976 'TW': 'Taiwan, Province of China',
2978 'TZ': 'Tanzania, United Republic of',
2980 'TL': 'Timor-Leste',
2984 'TT': 'Trinidad and Tobago',
2987 'TM': 'Turkmenistan',
2988 'TC': 'Turks and Caicos Islands',
2992 'AE': 'United Arab Emirates',
2993 'GB': 'United Kingdom',
2994 'US': 'United States',
2995 'UM': 'United States Minor Outlying Islands',
2999 'VE': 'Venezuela, Bolivarian Republic of',
3001 'VG': 'Virgin Islands, British',
3002 'VI': 'Virgin Islands, U.S.',
3003 'WF': 'Wallis and Futuna',
3004 'EH': 'Western Sahara',
def short2full(cls, code):
    """Return the full country name for a two-letter (ISO 3166-1 alpha-2) code.

    The lookup is case-insensitive; None is returned for unknown codes.
    """
    normalized_code = code.upper()
    return cls._country_map.get(normalized_code)
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler variant that lets a single request pick its own proxy.

    A request may carry a 'Ytdl-request-proxy' header; when present, its
    value replaces whatever proxy the handler was configured with for that
    request only, and the header is removed before the request goes out.
    """

    def __init__(self, proxies=None):
        # Install http/https openers that report "no proxy" first; the base
        # class initialiser runs afterwards with the given proxies.
        # The lambda keeps the same parameter names as the stock handlers.
        for scheme in ('http', 'https'):
            setattr(self, '%s_open' % scheme,
                    lambda r, proxy='__noproxy__', type=scheme, meth=self.proxy_open:
                        meth(r, proxy, type))
        compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        """Route *req* through *proxy*, honouring a per-request override.

        Returns None when no proxy applies or when a SOCKS proxy was
        recorded on the request; otherwise defers to the base class.
        """
        per_request_proxy = req.headers.get('Ytdl-request-proxy')
        if per_request_proxy is not None:
            proxy = per_request_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy

        proxy_scheme = compat_urlparse.urlparse(proxy).scheme.lower()
        if proxy_scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
            # youtube-dl's http/https handlers do wrapping the socket with
            # socks, so only record the proxy on the request here
            req.add_header('Ytdl-socks-proxy', proxy)
            return None

        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
def ohdave_rsa_encrypt(data, exponent, modulus):
    """
    Encrypt one block with OHDave's RSA scheme. See http://www.ohdave.com/rsa/

    Input:
        data: plaintext, bytes-like object
        exponent, modulus: the RSA parameters e and N, both integers
    Output: the ciphertext as a lowercase hex string

    Limitation: only a single block is encrypted; data is never split
    """
    # The bytes are reversed before being read as one big hexadecimal number
    reversed_hex = binascii.hexlify(data[::-1])
    plaintext_number = int(reversed_hex, 16)
    return '%x' % pow(plaintext_number, exponent, modulus)
def encode_base_n(num, n, table=None):
    """Encode the integer *num* as a string in base *n*.

    table: optional string of digit symbols; defaults to the first *n*
        characters of 0-9, a-z, A-Z (so bases up to 62 are supported).
    Negative numbers are encoded as '-' followed by the encoding of their
    absolute value.

    Raises ValueError if *n* is below 2 or exceeds the table length.
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if n < 2:
        # With base 1 the division below never shrinks the number
        raise ValueError('base %d is too small' % n)

    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    # Floor division never brings a negative number to zero (-1 // n == -1),
    # so encode the magnitude and put the sign back afterwards
    sign = ''
    if num < 0:
        sign = '-'
        num = -num

    digits = []
    while num:
        num, remainder = divmod(num, n)
        digits.append(table[remainder])
    return sign + ''.join(reversed(digits))
def decode_packed_codes(code):
    """Expand code whose words were replaced by indices into a symbol list.

    PACKED_CODES_RE must match *code* and capture, in order: the obfuscated
    source, the numeric base, the symbol count and the '|'-separated symbol
    list. Every word of the obfuscated source is looked up by its base-n
    spelling; an empty symbol means the word stands for itself.
    """
    match = re.search(PACKED_CODES_RE, code)
    packed_source, radix, total, words = match.groups()
    radix = int(radix)
    total = int(total)
    words = words.split('|')

    lookup = {}
    # Walk the indices from the highest down to zero
    for index in reversed(range(total)):
        token = encode_base_n(index, radix)
        lookup[token] = words[index] or token

    return re.sub(
        r'\b(\w+)\b', lambda word: lookup[word.group(0)],
        packed_source)
def parse_m3u8_attributes(attrib):
    """Parse an M3U8 attribute list such as 'KEY=value,OTHER="quoted, value"'.

    Returns a dict mapping attribute names to their values as strings, with
    the surrounding double quotes of quoted values removed. If a name occurs
    more than once, the last occurrence wins.
    """
    attribute_re = r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)'
    parsed = {}
    for match in re.finditer(attribute_re, attrib):
        name = match.group('key')
        raw_value = match.group('val')
        # Quoted values may contain commas; strip the quotes themselves
        parsed[name] = raw_value[1:-1] if raw_value.startswith('"') else raw_value
    return parsed
def urshift(val, n):
    """Unsigned (zero-fill) right shift of *val* by *n* bits.

    Negative values are first reinterpreted as unsigned 32-bit integers.
    Masking, rather than adding 2**32 once, keeps the result non-negative
    even for values below -2**32. Non-negative values are shifted as-is.
    """
    if val >= 0:
        return val >> n
    return (val & 0xFFFFFFFF) >> n
3107 # Based on png2str() written by @gdkchan and improved by @yokrysty
3108 # Originally posted at https://github.com/rg3/youtube-dl/issues/9706
def decode_png(png_data):
    """Decode the raw bytes of a PNG file into rows of colour samples.

    Returns a (width, height, pixels) tuple where pixels is a list of rows
    and each row is a flat list of integers, three per pixel.

    The decoder assumes three bytes per pixel: the bit depth, colour type
    and interlace fields of IHDR are not inspected, and chunk CRCs are not
    verified.

    Raises IOError if the signature or the leading IHDR chunk is missing,
    or if the file contains no IDAT data.
    """
    # Reference: https://www.w3.org/TR/PNG/
    if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or png_data[12:16] != b'IHDR':
        raise IOError('Not a valid PNG file.')

    int_map = {1: '>B', 2: '>H', 4: '>I'}

    def unpack_integer(raw):
        return compat_struct_unpack(int_map[len(raw)], raw)[0]

    # Walk the chunk list with an offset instead of re-slicing the remaining
    # data after every field, which copied the rest of the file each time
    ihdr = None
    idat_parts = []
    pos = 8
    end = len(png_data)
    while pos < end:
        length = unpack_integer(png_data[pos:pos + 4])
        chunk_type = png_data[pos + 4:pos + 8]
        chunk_data = png_data[pos + 8:pos + 8 + length]
        pos += 8 + length + 4  # length + type fields, payload, then skip CRC

        if ihdr is None:
            # The first chunk was checked to be IHDR above
            ihdr = chunk_data
        if chunk_type == b'IDAT':
            idat_parts.append(chunk_data)

    width = unpack_integer(ihdr[:4])
    height = unpack_integer(ihdr[4:8])

    idat = b''.join(idat_parts)
    if not idat:
        raise IOError('Unable to read PNG data.')

    decompressed_data = bytearray(zlib.decompress(idat))

    stride = width * 3
    pixels = []

    for y in range(height):
        # Every scanline is prefixed with one filter-type byte
        row_start = y * (1 + stride)
        filter_type = decompressed_data[row_start]

        previous_row = pixels[y - 1] if y > 0 else None
        current_row = []
        pixels.append(current_row)

        for x in range(stride):
            color = decompressed_data[1 + row_start + x]

            # Neighbours are the already reconstructed samples one pixel
            # (3 bytes) to the left and one row up; 0 outside the image
            left = current_row[x - 3] if x > 2 else 0
            up = previous_row[x] if y > 0 else 0

            if filter_type == 1:  # Sub
                color = (color + left) & 0xff
            elif filter_type == 2:  # Up
                color = (color + up) & 0xff
            elif filter_type == 3:  # Average
                color = (color + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                a = left
                b = up
                c = previous_row[x - 3] if x > 2 and y > 0 else 0

                p = a + b - c

                pa = abs(p - a)
                pb = abs(p - b)
                pc = abs(p - c)

                if pa <= pb and pa <= pc:
                    color = (color + a) & 0xff
                elif pb <= pc:
                    color = (color + b) & 0xff
                else:
                    color = (color + c) & 0xff

            current_row.append(color)

    return width, height, pixels
def write_xattr(path, key, value):
    """Set the extended attribute *key* of the file at *path* to *value*.

    The first available backend is used: the Python xattr/pyxattr module,
    NTFS Alternate Data Streams on Windows, then the 'setfattr' or 'xattr'
    command line tools.

    Raises XAttrUnavailableError when no usable backend is found (or
    pyxattr is too old) and XAttrMetadataError when writing fails.
    """
    # This mess below finds the best xattr tool for the job
    try:
        # try the pyxattr module...
        import xattr

        if not hasattr(xattr, 'set'):  # xattr
            set_attribute = xattr.setxattr
        else:  # pyxattr
            # Unicode arguments are not supported in python-pyxattr until
            # version 0.5.0
            # See https://github.com/rg3/youtube-dl/issues/5498
            pyxattr_required_version = '0.5.0'
            if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
                # TODO: fallback to CLI tools
                raise XAttrUnavailableError(
                    'python-pyxattr is detected but is too old. '
                    'youtube-dl requires %s or above while your version is %s. '
                    'Falling back to other xattr implementations' % (
                        pyxattr_required_version, xattr.__version__))

            set_attribute = xattr.set

        try:
            set_attribute(path, key, value)
        except EnvironmentError as e:
            raise XAttrMetadataError(e.errno, e.strerror)

    except ImportError:
        if compat_os_name == 'nt':
            # Write xattrs to NTFS Alternate Data Streams:
            # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
            assert ':' not in key
            assert os.path.exists(path)

            stream_name = path + ':' + key
            try:
                with open(stream_name, 'wb') as stream:
                    stream.write(value)
            except EnvironmentError as e:
                raise XAttrMetadataError(e.errno, e.strerror)
            return

        user_has_setfattr = check_executable('setfattr', ['--version'])
        user_has_xattr = check_executable('xattr', ['-h'])

        if not (user_has_setfattr or user_has_xattr):
            # On Unix, and can't find pyxattr, setfattr, or xattr.
            if sys.platform.startswith('linux'):
                raise XAttrUnavailableError(
                    "Couldn't find a tool to set the xattrs. "
                    "Install either the python 'pyxattr' or 'xattr' "
                    "modules, or the GNU 'attr' package "
                    "(which contains the 'setfattr' tool).")
            raise XAttrUnavailableError(
                "Couldn't find a tool to set the xattrs. "
                "Install either the python 'xattr' module, "
                "or the 'xattr' binary.")

        # The command line tools take the value as text
        value = value.decode('utf-8')
        if user_has_setfattr:
            executable = 'setfattr'
            opts = ['-n', key, '-v', value]
        elif user_has_xattr:
            executable = 'xattr'
            opts = ['-w', key, value]

        cmd = [encodeFilename(executable, True)]
        cmd += [encodeArgument(o) for o in opts]
        cmd += [encodeFilename(path, True)]

        try:
            proc = subprocess.Popen(
                cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
        except EnvironmentError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        stdout, stderr = proc.communicate()
        stderr = stderr.decode('utf-8', 'replace')
        if proc.returncode != 0:
            raise XAttrMetadataError(proc.returncode, stderr)