4 from __future__ import unicode_literals
34 import xml.etree.ElementTree
38 compat_HTMLParseError,
43 compat_ctypes_WINFUNCTYPE,
44 compat_etree_fromstring,
47 compat_html_entities_html5,
58 compat_urllib_parse_urlencode,
59 compat_urllib_parse_urlparse,
60 compat_urllib_parse_unquote_plus,
61 compat_urllib_request,
def register_socks_protocols():
    """Add the SOCKS proxy schemes to urlparse's list of netloc-aware schemes.

    Works around https://bugs.python.org/issue7904 (Python < 2.6.5), where
    urlsplit() does not handle URLs whose scheme is missing from
    urlparse.uses_netloc correctly.
    """
    netloc_schemes = compat_urlparse.uses_netloc
    not_yet_registered = [
        proto for proto in ('socks', 'socks4', 'socks4a', 'socks5')
        if proto not in netloc_schemes]
    netloc_schemes.extend(not_yet_registered)
81 # This is not clearly defined otherwise
82 compiled_regex_type = type(re.compile(''))
85 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:59.0) Gecko/20100101 Firefox/59.0',
86 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
87 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
88 'Accept-Encoding': 'gzip, deflate',
89 'Accept-Language': 'en-us,en;q=0.5',
94 'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
# English month names in calendar order, January first
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March',
    'April', 'May', 'June',
    'July', 'August', 'September',
    'October', 'November', 'December',
]
105 'en': ENGLISH_MONTH_NAMES,
107 'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
108 'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
112 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
113 'flv', 'f4v', 'f4a', 'f4b',
114 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
115 'mkv', 'mka', 'mk3d',
124 'f4f', 'f4m', 'm3u8', 'smil')
# Maps each accented character to its ASCII replacement (one or two
# characters); used when sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip(
    'ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
    list('AAAAAA') + ['AE'] + list('CEEEEIIIIDNOOOOOOO') + ['OE'] + list('UUUUUYP') + ['ss']
    + list('aaaaaa') + ['ae'] + list('ceeeeiiiionooooooo') + ['oe'] + list('uuuuuypy')))
152 '%Y-%m-%d %H:%M:%S.%f',
155 '%Y-%m-%dT%H:%M:%SZ',
156 '%Y-%m-%dT%H:%M:%S.%fZ',
157 '%Y-%m-%dT%H:%M:%S.%f0Z',
159 '%Y-%m-%dT%H:%M:%S.%f',
162 '%b %d %Y at %H:%M:%S',
164 '%B %d %Y at %H:%M:%S',
167 DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
168 DATE_FORMATS_DAY_FIRST.extend([
177 DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
178 DATE_FORMATS_MONTH_FIRST.extend([
186 PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
187 JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'
190 def preferredencoding():
191 """Get preferred encoding.
193 Returns the best encoding scheme for the system, based on
194 locale.getpreferredencoding() and some further tweaks.
197 pref = locale.getpreferredencoding()
205 def write_json_file(obj, fn):
206 """ Encode obj as JSON and write it to fn, atomically if possible """
208 fn = encodeFilename(fn)
209 if sys.version_info < (3, 0) and sys.platform != 'win32':
210 encoding = get_filesystem_encoding()
211 # os.path.basename returns a bytes object, but NamedTemporaryFile
212 # will fail if the filename contains non ascii characters unless we
213 # use a unicode object
214 path_basename = lambda f: os.path.basename(fn).decode(encoding)
215 # the same for os.path.dirname
216 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
218 path_basename = os.path.basename
219 path_dirname = os.path.dirname
223 'prefix': path_basename(fn) + '.',
224 'dir': path_dirname(fn),
228 # In Python 2.x, json.dump expects a bytestream.
229 # In Python 3.x, it writes to a character stream
230 if sys.version_info < (3, 0):
238 tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
243 if sys.platform == 'win32':
244 # Need to remove existing file on Windows, else os.rename raises
245 # WindowsError or FileExistsError.
250 os.rename(tf.name, fn)
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """Find the first element matching xpath[@key] or xpath[@key=val].

        node is the element to search from, xpath the path expression, key
        the attribute name and val the required attribute value (any value
        is accepted when val is None).
        Returns the matching element, or None if there is none.
        """
        assert re.match(r'^[a-zA-Z_-]+$', key)
        if val is None:
            return node.find('%s[@%s]' % (xpath, key))
        val_text = '%s' % (val,)
        # ElementTree predicates have no escape syntax, so delimit the value
        # with whichever quote character it does not contain
        if "'" not in val_text:
            return node.find("%s[@%s='%s']" % (xpath, key, val_text))
        if '"' not in val_text:
            return node.find('%s[@%s="%s"]' % (xpath, key, val_text))
        # Both quote characters occur in the value: no predicate can express
        # it, so compare the attribute by hand
        for candidate in node.findall(xpath):
            if candidate.get(key) == val_text:
                return candidate
        return None
else:
    def find_xpath_attr(node, xpath, key, val=None):
        """Find the first element matching xpath that carries attribute key
        (equal to val when val is given); returns None if there is none."""
        for f in node.findall(compat_xpath(xpath)):
            if key not in f.attrib:
                continue
            if val is None or f.attrib.get(key) == val:
                return f
        return None
274 # On python2.6 the xml.etree.ElementTree.Element methods don't support
275 # the namespace parameter
278 def xpath_with_ns(path, ns_map):
279 components = [c.split(':') for c in path.split('/')]
283 replaced.append(c[0])
286 replaced.append('{%s}%s' % (ns_map[ns], tag))
287 return '/'.join(replaced)
290 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
291 def _find_xpath(xpath):
292 return node.find(compat_xpath(xpath))
294 if isinstance(xpath, (str, compat_str)):
295 n = _find_xpath(xpath)
303 if default is not NO_DEFAULT:
306 name = xpath if name is None else name
307 raise ExtractorError('Could not find XML element %s' % name)
313 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
314 n = xpath_element(node, xpath, name, fatal=fatal, default=default)
315 if n is None or n == default:
318 if default is not NO_DEFAULT:
321 name = xpath if name is None else name
322 raise ExtractorError('Could not find XML element\'s text %s' % name)
328 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
329 n = find_xpath_attr(node, xpath, key)
331 if default is not NO_DEFAULT:
334 name = '%s[@%s]' % (xpath, key) if name is None else name
335 raise ExtractorError('Could not find XML attribute %s' % name)
def get_element_by_id(id, html):
    """Return the content of the tag whose id attribute equals the given ID.

    Delegates to get_element_by_attribute(), which yields None when the
    passed HTML document contains no such tag.
    """
    content = get_element_by_attribute('id', id, html)
    return content
def get_element_by_class(class_name, html):
    """Return the content of the first tag carrying the given class in the
    passed HTML document, or None when no tag matches"""
    matches = get_elements_by_class(class_name, html)
    if not matches:
        return None
    return matches[0]
def get_element_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of the first tag whose attribute has the given
    value in the passed HTML document, or None when no tag matches"""
    matches = get_elements_by_attribute(attribute, value, html, escape_value)
    if not matches:
        return None
    return matches[0]
def get_elements_by_class(class_name, html):
    """Return, as a list, the content of every tag in the passed HTML
    document whose class attribute contains class_name as a whole word"""
    # The class attribute may hold several names, so allow any
    # non-quote characters around the (escaped) wanted name
    class_value_re = r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name)
    return get_elements_by_attribute('class', class_value_re, html, escape_value=False)
364 def get_elements_by_attribute(attribute, value, html, escape_value=True):
365 """Return the content of the tag with the specified attribute in the passed HTML document"""
367 value = re.escape(value) if escape_value else value
370 for m in re.finditer(r'''(?xs)
372 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
374 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
378 ''' % (re.escape(attribute), value), html):
379 res = m.group('content')
381 if res.startswith('"') or res.startswith("'"):
384 retlist.append(unescapeHTML(res))
389 class HTMLAttributeParser(compat_HTMLParser):
390 """Trivial HTML parser to gather the attributes for a single element"""
393 compat_HTMLParser.__init__(self)
395 def handle_starttag(self, tag, attrs):
396 self.attrs = dict(attrs)
399 def extract_attributes(html_element):
400 """Given a string for an HTML element such as
402 a="foo" B="bar" c="&98;az" d=boz
403 empty= noval entity="&"
406 Decode and return a dictionary of attributes.
408 'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
409 'empty': '', 'noval': None, 'entity': '&',
410 'sq': '"', 'dq': '\''
412 NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
413 but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
415 parser = HTMLAttributeParser()
417 parser.feed(html_element)
419 # Older Python may throw HTMLParseError in case of malformed HTML
420 except compat_HTMLParseError:
425 def clean_html(html):
426 """Clean an HTML snippet into a readable string"""
428 if html is None: # Convenience for sanitizing descriptions etc.
432 html = html.replace('\n', ' ')
433 html = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n', html)
434 html = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
436 html = re.sub('<.*?>', '', html)
437 # Replace html entities
438 html = unescapeHTML(html)
442 def sanitize_open(filename, open_mode):
443 """Try to open the given filename, and slightly tweak it if this fails.
445 Attempts to open the given filename. If this fails, it tries to change
446 the filename slightly, step by step, until it's either able to open it
447 or it fails and raises a final exception, like the standard open()
450 It returns the tuple (stream, definitive_file_name).
454 if sys.platform == 'win32':
456 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
457 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
458 stream = open(encodeFilename(filename), open_mode)
459 return (stream, filename)
460 except (IOError, OSError) as err:
461 if err.errno in (errno.EACCES,):
464 # In case of error, try to remove win32 forbidden chars
465 alt_filename = sanitize_path(filename)
466 if alt_filename == filename:
469 # An exception here should be caught in the caller
470 stream = open(encodeFilename(alt_filename), open_mode)
471 return (stream, alt_filename)
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp

    Returns the timestamp computed by email.utils.mktime_tz(), or None when
    timestr is empty or cannot be parsed as an RFC 2822 date.
    """
    if not timestr:
        # Nothing to parse; also keeps None away from parsedate_tz()
        return None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is None:
        return None
    return email.utils.mktime_tz(timetuple)
483 def sanitize_filename(s, restricted=False, is_id=False):
484 """Sanitizes a string so it could be used as part of a filename.
485 If restricted is set, use a stricter subset of allowed characters.
486 Set is_id if this is not an arbitrary string, but an ID that should be kept
489 def replace_insane(char):
490 if restricted and char in ACCENT_CHARS:
491 return ACCENT_CHARS[char]
492 if char == '?' or ord(char) < 32 or ord(char) == 127:
495 return '' if restricted else '\''
497 return '_-' if restricted else ' -'
498 elif char in '\\/|*<>':
500 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
502 if restricted and ord(char) > 127:
507 s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
508 result = ''.join(map(replace_insane, s))
510 while '__' in result:
511 result = result.replace('__', '_')
512 result = result.strip('_')
513 # Common case of "Foreign band name - English song title"
514 if restricted and result.startswith('-_'):
516 if result.startswith('-'):
517 result = '_' + result[len('-'):]
518 result = result.lstrip('.')
524 def sanitize_path(s):
525 """Sanitizes and normalizes path on Windows"""
526 if sys.platform != 'win32':
528 drive_or_unc, _ = os.path.splitdrive(s)
529 if sys.version_info < (2, 7) and not drive_or_unc:
530 drive_or_unc, _ = os.path.splitunc(s)
531 norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
535 path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
536 for path_part in norm_path]
538 sanitized_path.insert(0, drive_or_unc + os.path.sep)
539 return os.path.join(*sanitized_path)
def sanitize_url(url):
    """Repair a few well-known URL defects and return the corrected URL.

    Protocol-relative URLs ('//host/...') get an 'http:' scheme, and the
    misspelled schemes listed in COMMON_TYPOS are corrected. Any other URL
    is returned unchanged.
    """
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url.startswith('//'):
        return 'http:%s' % url
    # Fix some common typos seen so far
    COMMON_TYPOS = (
        # https://github.com/rg3/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in COMMON_TYPOS:
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url
def sanitized_Request(url, *args, **kwargs):
    """Build a urllib Request for url after running it through sanitize_url();
    all remaining arguments are handed to the Request constructor untouched"""
    cleaned_url = sanitize_url(url)
    return compat_urllib_request.Request(cleaned_url, *args, **kwargs)
565 """Expand shell variables and ~"""
566 return os.path.expandvars(compat_expanduser(s))
569 def orderedSet(iterable):
570 """ Remove all duplicates from the input iterable """
578 def _htmlentity_transform(entity_with_semicolon):
579 """Transforms an HTML entity to a character."""
580 entity = entity_with_semicolon[:-1]
582 # Known non-numeric HTML entity
583 if entity in compat_html_entities.name2codepoint:
584 return compat_chr(compat_html_entities.name2codepoint[entity])
586 # TODO: HTML5 allows entities without a semicolon. For example,
587 # 'Éric' should be decoded as 'Éric'.
588 if entity_with_semicolon in compat_html_entities_html5:
589 return compat_html_entities_html5[entity_with_semicolon]
591 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
593 numstr = mobj.group(1)
594 if numstr.startswith('x'):
596 numstr = '0%s' % numstr
599 # See https://github.com/rg3/youtube-dl/issues/7518
601 return compat_chr(int(numstr, base))
605 # Unknown entity in name, return its literal representation
606 return '&%s;' % entity
612 assert type(s) == compat_str
615 r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
618 def get_subprocess_encoding():
619 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
620 # For subprocess calls, encode with locale encoding
621 # Refer to http://stackoverflow.com/a/9951851/35070
622 encoding = preferredencoding()
624 encoding = sys.getfilesystemencoding()
630 def encodeFilename(s, for_subprocess=False):
632 @param s The name of the file
635 assert type(s) == compat_str
637 # Python 3 has a Unicode API
638 if sys.version_info >= (3, 0):
641 # Pass '' directly to use Unicode APIs on Windows 2000 and up
642 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
643 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
644 if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
647 # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
648 if sys.platform.startswith('java'):
651 return s.encode(get_subprocess_encoding(), 'ignore')
654 def decodeFilename(b, for_subprocess=False):
656 if sys.version_info >= (3, 0):
659 if not isinstance(b, bytes):
662 return b.decode(get_subprocess_encoding(), 'ignore')
def encodeArgument(s):
    """Encode s for use as a subprocess argument via encodeFilename()."""
    if isinstance(s, compat_str):
        return encodeFilename(s, True)
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    as_text = s.decode('ascii')
    return encodeFilename(as_text, True)
def decodeArgument(b):
    """Decode a subprocess argument: decodeFilename() with for_subprocess set."""
    return decodeFilename(b, for_subprocess=True)
678 def decodeOption(optval):
681 if isinstance(optval, bytes):
682 optval = optval.decode(preferredencoding())
684 assert isinstance(optval, compat_str)
688 def formatSeconds(secs):
690 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
692 return '%d:%02d' % (secs // 60, secs % 60)
697 def make_HTTPS_handler(params, **kwargs):
698 opts_no_check_certificate = params.get('nocheckcertificate', False)
699 if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9
700 context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
701 if opts_no_check_certificate:
702 context.check_hostname = False
703 context.verify_mode = ssl.CERT_NONE
705 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
708 # (create_default_context present but HTTPSHandler has no context=)
711 if sys.version_info < (3, 2):
712 return YoutubeDLHTTPSHandler(params, **kwargs)
714 context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
715 context.verify_mode = (ssl.CERT_NONE
716 if opts_no_check_certificate
717 else ssl.CERT_REQUIRED)
718 context.set_default_verify_paths()
719 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
def bug_reports_message():
    """Return the text appended to error messages that asks the user to
    report the issue, together with a hint on how to update youtube-dl.

    The update hint depends on ytdl_is_updateable().
    """
    if ytdl_is_updateable():
        update_cmd = 'type youtube-dl -U to update'
    else:
        update_cmd = 'see https://yt-dl.org/update on how to update'
    msg = '; please report this issue on https://yt-dl.org/bug .'
    msg += ' Make sure you are using the latest version; %s.' % update_cmd
    msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
    return msg
733 class YoutubeDLError(Exception):
734 """Base exception for YoutubeDL errors."""
738 class ExtractorError(YoutubeDLError):
739 """Error during info extraction."""
741 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
742 """ tb, if given, is the original traceback (so that it can be printed out).
743 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
746 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
748 if video_id is not None:
749 msg = video_id + ': ' + msg
751 msg += ' (caused by %r)' % cause
753 msg += bug_reports_message()
754 super(ExtractorError, self).__init__(msg)
757 self.exc_info = sys.exc_info() # preserve original exception
759 self.video_id = video_id
761 def format_traceback(self):
762 if self.traceback is None:
764 return ''.join(traceback.format_tb(self.traceback))
767 class UnsupportedError(ExtractorError):
768 def __init__(self, url):
769 super(UnsupportedError, self).__init__(
770 'Unsupported URL: %s' % url, expected=True)
774 class RegexNotFoundError(ExtractorError):
775 """Error when a regex didn't match"""
779 class GeoRestrictedError(ExtractorError):
780 """Geographic restriction Error exception.
782 This exception may be thrown when a video is not available from your
783 geographic location due to geographic restrictions imposed by a website.
785 def __init__(self, msg, countries=None):
786 super(GeoRestrictedError, self).__init__(msg, expected=True)
788 self.countries = countries
class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        # Kept so that callers can inspect or re-raise the underlying error
        self.exc_info = exc_info
805 class SameFileError(YoutubeDLError):
806 """Same File exception.
808 This exception will be thrown by FileDownloader objects if they detect
809 multiple files would have to be downloaded to the same file on disk.
814 class PostProcessingError(YoutubeDLError):
815 """Post Processing exception.
817 This exception may be raised by PostProcessor's .run() method to
818 indicate an error in the postprocessing task.
821 def __init__(self, msg):
822 super(PostProcessingError, self).__init__(msg)
826 class MaxDownloadsReached(YoutubeDLError):
827 """ --max-downloads limit has been reached. """
831 class UnavailableVideoError(YoutubeDLError):
832 """Unavailable Format exception.
834 This exception will be thrown when a video is requested
835 in a format that is not available for that video.
class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        """downloaded is the number of bytes received, expected the number
        of bytes that had been announced."""
        super(ContentTooShortError, self).__init__(
            'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected)
        )
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected
class XAttrMetadataError(YoutubeDLError):
    """Error raised for a failed extended-attribute (xattr) operation.

    The error is classified from its errno code and message into
    self.reason, which is one of 'NO_SPACE', 'VALUE_TOO_LONG' or
    'NOT_SUPPORTED'.
    """

    def __init__(self, code=None, msg='Unknown error'):
        """code is the errno value (if known), msg the error message."""
        super(XAttrMetadataError, self).__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        # The correctly spelled 'Disk quota exceeded' is what the OS reports;
        # the old misspelling is still accepted for backward compatibility
        if (self.code in (errno.ENOSPC, errno.EDQUOT) or
                'No space left' in self.msg or
                'Disk quota exceeded' in self.msg or
                'Disk quota excedded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'
873 class XAttrUnavailableError(YoutubeDLError):
877 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
878 # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
879 # expected HTTP responses to meet HTTP/1.0 or later (see also
880 # https://github.com/rg3/youtube-dl/issues/6727)
881 if sys.version_info < (3, 0):
882 kwargs['strict'] = True
883 hc = http_class(*args, **compat_kwargs(kwargs))
884 source_address = ydl_handler._params.get('source_address')
886 if source_address is not None:
887 # This is to workaround _create_connection() from socket where it will try all
888 # address data from getaddrinfo() including IPv6. This filters the result from
889 # getaddrinfo() based on the source_address value.
890 # This is based on the cpython socket.create_connection() function.
891 # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
892 def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
895 addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
896 af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
897 ip_addrs = [addr for addr in addrs if addr[0] == af]
898 if addrs and not ip_addrs:
899 ip_version = 'v4' if af == socket.AF_INET else 'v6'
901 "No remote IP%s addresses available for connect, can't use '%s' as source address"
902 % (ip_version, source_address[0]))
904 af, socktype, proto, canonname, sa = res
907 sock = socket.socket(af, socktype, proto)
908 if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
909 sock.settimeout(timeout)
910 sock.bind(source_address)
912 err = None # Explicitly break reference cycle
914 except socket.error as _:
921 raise socket.error('getaddrinfo returns an empty list')
922 if hasattr(hc, '_create_connection'):
923 hc._create_connection = _create_connection
924 sa = (source_address, 0)
925 if hasattr(hc, 'source_address'): # Python 2.7+
926 hc.source_address = sa
928 def _hc_connect(self, *args, **kwargs):
929 sock = _create_connection(
930 (self.host, self.port), self.timeout, sa)
932 self.sock = ssl.wrap_socket(
933 sock, self.key_file, self.cert_file,
934 ssl_version=ssl.PROTOCOL_TLSv1)
937 hc.connect = functools.partial(_hc_connect, hc)
def handle_youtubedl_headers(headers):
    """Apply youtube-dl's internal marker headers to a header mapping.

    When the 'Youtubedl-no-compression' marker is present, a new dict is
    returned without the marker and without any Accept-Encoding header
    (matched case-insensitively). Otherwise headers is returned as is.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers

    return dict(
        (name, value) for name, value in headers.items()
        if name.lower() != 'accept-encoding' and name != 'Youtubedl-no-compression')
952 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
953 """Handler for HTTP requests and responses.
955 This class, when installed with an OpenerDirector, automatically adds
956 the standard headers to every HTTP request and handles gzipped and
957 deflated responses from web servers. If compression is to be avoided in
958 a particular request, the original request in the program code only has
959 to include the HTTP header "Youtubedl-no-compression", which will be
960 removed before making the real request.
962 Part of this code was copied from:
964 http://techknack.net/python-urllib2-handlers/
966 Andrew Rowls, the author of that code, agreed to release it to the
970 def __init__(self, params, *args, **kwargs):
971 compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
972 self._params = params
974 def http_open(self, req):
975 conn_class = compat_http_client.HTTPConnection
977 socks_proxy = req.headers.get('Ytdl-socks-proxy')
979 conn_class = make_socks_conn_class(conn_class, socks_proxy)
980 del req.headers['Ytdl-socks-proxy']
982 return self.do_open(functools.partial(
983 _create_http_connection, self, conn_class, False),
989 return zlib.decompress(data, -zlib.MAX_WBITS)
991 return zlib.decompress(data)
993 def http_request(self, req):
994 # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
995 # always respected by websites, some tend to give out URLs with non percent-encoded
996 # non-ASCII characters (see telemb.py, ard.py [#3412])
997 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
998 # To work around aforementioned issue we will replace request's original URL with
999 # percent-encoded one
1000 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
1001 # the code of this workaround has been moved here from YoutubeDL.urlopen()
1002 url = req.get_full_url()
1003 url_escaped = escape_url(url)
1005 # Substitute URL if any change after escaping
1006 if url != url_escaped:
1007 req = update_Request(req, url=url_escaped)
1009 for h, v in std_headers.items():
1010 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
1011 # The dict keys are capitalized because of this bug by urllib
1012 if h.capitalize() not in req.headers:
1013 req.add_header(h, v)
1015 req.headers = handle_youtubedl_headers(req.headers)
1017 if sys.version_info < (2, 7) and '#' in req.get_full_url():
1018 # Python 2.6 is brain-dead when it comes to fragments
1019 req._Request__original = req._Request__original.partition('#')[0]
1020 req._Request__r_type = req._Request__r_type.partition('#')[0]
1024 def http_response(self, req, resp):
1027 if resp.headers.get('Content-encoding', '') == 'gzip':
1028 content = resp.read()
1029 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
1031 uncompressed = io.BytesIO(gz.read())
1032 except IOError as original_ioerror:
1033 # There may be junk add the end of the file
1034 # See http://stackoverflow.com/q/4928560/35070 for details
1035 for i in range(1, 1024):
1037 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
1038 uncompressed = io.BytesIO(gz.read())
1043 raise original_ioerror
1044 resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
1045 resp.msg = old_resp.msg
1046 del resp.headers['Content-encoding']
1048 if resp.headers.get('Content-encoding', '') == 'deflate':
1049 gz = io.BytesIO(self.deflate(resp.read()))
1050 resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
1051 resp.msg = old_resp.msg
1052 del resp.headers['Content-encoding']
1053 # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
1054 # https://github.com/rg3/youtube-dl/issues/6457).
1055 if 300 <= resp.code < 400:
1056 location = resp.headers.get('Location')
1058 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
1059 if sys.version_info >= (3, 0):
1060 location = location.encode('iso-8859-1').decode('utf-8')
1062 location = location.decode('utf-8')
1063 location_escaped = escape_url(location)
1064 if location != location_escaped:
1065 del resp.headers['Location']
1066 if sys.version_info < (3, 0):
1067 location_escaped = location_escaped.encode('utf-8')
1068 resp.headers['Location'] = location_escaped
1071 https_request = http_request
1072 https_response = http_response
1075 def make_socks_conn_class(base_class, socks_proxy):
1076 assert issubclass(base_class, (
1077 compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))
1079 url_components = compat_urlparse.urlparse(socks_proxy)
1080 if url_components.scheme.lower() == 'socks5':
1081 socks_type = ProxyType.SOCKS5
1082 elif url_components.scheme.lower() in ('socks', 'socks4'):
1083 socks_type = ProxyType.SOCKS4
1084 elif url_components.scheme.lower() == 'socks4a':
1085 socks_type = ProxyType.SOCKS4A
1087 def unquote_if_non_empty(s):
1090 return compat_urllib_parse_unquote_plus(s)
1094 url_components.hostname, url_components.port or 1080,
1096 unquote_if_non_empty(url_components.username),
1097 unquote_if_non_empty(url_components.password),
1100 class SocksConnection(base_class):
1102 self.sock = sockssocket()
1103 self.sock.setproxy(*proxy_args)
1104 if type(self.timeout) in (int, float):
1105 self.sock.settimeout(self.timeout)
1106 self.sock.connect((self.host, self.port))
1108 if isinstance(self, compat_http_client.HTTPSConnection):
1109 if hasattr(self, '_context'): # Python > 2.6
1110 self.sock = self._context.wrap_socket(
1111 self.sock, server_hostname=self.host)
1113 self.sock = ssl.wrap_socket(self.sock)
1115 return SocksConnection
1118 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
1119 def __init__(self, params, https_conn_class=None, *args, **kwargs):
1120 compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
1121 self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
1122 self._params = params
1124 def https_open(self, req):
1126 conn_class = self._https_conn_class
1128 if hasattr(self, '_context'): # python > 2.6
1129 kwargs['context'] = self._context
1130 if hasattr(self, '_check_hostname'): # python 3.x
1131 kwargs['check_hostname'] = self._check_hostname
1133 socks_proxy = req.headers.get('Ytdl-socks-proxy')
1135 conn_class = make_socks_conn_class(conn_class, socks_proxy)
1136 del req.headers['Ytdl-socks-proxy']
1138 return self.do_open(functools.partial(
1139 _create_http_connection, self, conn_class, True),
1143 class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
1144 def save(self, filename=None, ignore_discard=False, ignore_expires=False):
1145 # Store session cookies with `expires` set to 0 instead of an empty
1148 if cookie.expires is None:
1150 compat_cookiejar.MozillaCookieJar.save(self, filename, ignore_discard, ignore_expires)
1152 def load(self, filename=None, ignore_discard=False, ignore_expires=False):
1153 compat_cookiejar.MozillaCookieJar.load(self, filename, ignore_discard, ignore_expires)
1154 # Session cookies are denoted by either `expires` field set to
1155 # an empty string or 0. MozillaCookieJar only recognizes the former
1156 # (see [1]). So we need force the latter to be recognized as session
1157 # cookies on our own.
1158 # Session cookies may be important for cookies-based authentication,
1159 # e.g. usually, when user does not check 'Remember me' check box while
1160 # logging in on a site, some important cookies are stored as session
1161 # cookies so that not recognizing them will result in failed login.
1162 # 1. https://bugs.python.org/issue17164
1164 # Treat `expires=0` cookies as session cookies
1165 if cookie.expires == 0:
1166 cookie.expires = None
1167 cookie.discard = True
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor whose HTTPS hooks are bound to the same handlers as
    the HTTP ones."""

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # The workaround is currently disabled and kept here for reference:
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        processed = compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)
        return processed

    https_response = http_response
    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
def extract_timezone(date_str):
    """Split a trailing timezone designator off a date string.

    Recognizes a final 'Z' or a numeric offset such as '+0100' / '-05:30'
    (optionally preceded by one space) that follows at least eight
    characters.

    Returns a tuple (timezone, date_str): timezone is the offset as a
    datetime.timedelta (zero for 'Z' or when nothing was recognized) and
    date_str is the input with the designator removed.
    """
    m = re.search(
        r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)
    if not m:
        return datetime.timedelta(), date_str

    date_str = date_str[:-len(m.group('tz'))]
    if not m.group('sign'):
        # Plain 'Z' designator, i.e. UTC
        return datetime.timedelta(), date_str

    sign = 1 if m.group('sign') == '+' else -1
    timezone = datetime.timedelta(
        hours=sign * int(m.group('hours')),
        minutes=sign * int(m.group('minutes')))
    return timezone, date_str
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date, or None when the
    input is None or does not match the expected layout """
    if date_str is None:
        return None

    # Fractional seconds are discarded
    trimmed = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, trimmed = extract_timezone(trimmed)

    try:
        layout = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
        utc_dt = datetime.datetime.strptime(trimmed, layout) - timezone
        return calendar.timegm(utc_dt.timetuple())
    except ValueError:
        return None
def date_formats(day_first=True):
    """ Return the strptime format list matching the requested
    day/month ordering """
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""
    if date_str is None:
        return None

    # Replace commas
    cleaned = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    cleaned = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', cleaned)
    cleaned = extract_timezone(cleaned)[1]

    result = None
    # Every known format is tried; the last one that parses wins
    for fmt in date_formats(day_first):
        try:
            result = datetime.datetime.strptime(cleaned, fmt).strftime('%Y%m%d')
        except ValueError:
            continue

    if result is None:
        # Fall back to RFC 2822 style dates
        parsed = email.utils.parsedate_tz(cleaned)
        if parsed:
            try:
                result = datetime.datetime(*parsed[:6]).strftime('%Y%m%d')
            except ValueError:
                pass

    return None if result is None else compat_str(result)
def unified_timestamp(date_str, day_first=True):
    """ Return a UNIX timestamp parsed from a date string in one of the
    known formats, or None if nothing matches """
    if date_str is None:
        return None

    text = re.sub(r'[,|]', '', date_str)

    # PM must be detected before the marker is stripped below
    pm_hours = 12 if re.search(r'(?i)PM', text) else 0
    timezone, text = extract_timezone(text)

    # Remove AM/PM + timezone
    text = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', text)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    tz_tail = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', text)
    if tz_tail:
        text = text[:-len(tz_tail.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    nano = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', text)
    if nano:
        text = nano.group(1)

    for fmt in date_formats(day_first):
        try:
            moment = datetime.datetime.strptime(text, fmt) - timezone + datetime.timedelta(hours=pm_hours)
            return calendar.timegm(moment.timetuple())
        except ValueError:
            continue

    # Last resort: RFC 2822 style dates
    parsed = email.utils.parsedate_tz(text)
    if parsed:
        return calendar.timegm(parsed) + pm_hours * 3600
    return None
def determine_ext(url, default_ext='unknown_video'):
    """ Guess a file extension from a URL, falling back to default_ext """
    if url is None or '.' not in url:
        return default_ext

    without_query = url.partition('?')[0]
    candidate = without_query.rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate

    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = candidate.rstrip('/')
    return trimmed if trimmed in KNOWN_EXTENSIONS else default_ext
def subtitles_filename(filename, sub_lang, sub_format):
    """ Build '<name>.<lang>.<format>' from a media filename, replacing
    its last extension (if any) """
    stem = filename.rsplit('.', 1)[0]
    return '.'.join((stem, sub_lang, sub_format))
def date_from_str(date_str):
    """
    Return a datetime.date from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?
    ('yesterday' is accepted as well)"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)

    relative = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if relative is None:
        return datetime.datetime.strptime(date_str, '%Y%m%d').date()

    amount = int(relative.group('time'))
    if relative.group('sign') == '-':
        amount = -amount
    unit = relative.group('unit')
    # A bad approximation?
    days_per_unit = {'month': 30, 'year': 365}
    if unit in days_per_unit:
        amount *= days_per_unit[unit]
        unit = 'day'
    # timedelta takes plural keyword names (days, weeks)
    return today + datetime.timedelta(**{unit + 's': amount})
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format;
    anything else is returned unchanged"""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    return '-'.join(parts.groups())
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by
        date_from_str; None means unbounded on that side"""
        if start is None:
            self.start = datetime.datetime.min.date()
        else:
            self.start = date_from_str(start)
        if end is None:
            self.end = datetime.datetime.max.date()
        else:
            self.end = date_from_str(end)
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        candidate = date if isinstance(date, datetime.date) else date_from_str(date)
        return self.start <= candidate <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    # Decode when the platform module hands back a byte string
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
1393 def _windows_write_string(s, out):
1394 """ Returns True if the string was written using special methods,
1395 False if it has yet to be written out."""
1396 # Adapted from http://stackoverflow.com/a/3259271/35070
1399 import ctypes.wintypes
1407 fileno = out.fileno()
1408 except AttributeError:
1409 # If the output stream doesn't have a fileno, it's virtual
1411 except io.UnsupportedOperation:
1412 # Some strange Windows pseudo files?
1414 if fileno not in WIN_OUTPUT_IDS:
1417 GetStdHandle = compat_ctypes_WINFUNCTYPE(
1418 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
1419 ('GetStdHandle', ctypes.windll.kernel32))
1420 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
1422 WriteConsoleW = compat_ctypes_WINFUNCTYPE(
1423 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
1424 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
1425 ctypes.wintypes.LPVOID)(('WriteConsoleW', ctypes.windll.kernel32))
1426 written = ctypes.wintypes.DWORD(0)
1428 GetFileType = compat_ctypes_WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(('GetFileType', ctypes.windll.kernel32))
1429 FILE_TYPE_CHAR = 0x0002
1430 FILE_TYPE_REMOTE = 0x8000
1431 GetConsoleMode = compat_ctypes_WINFUNCTYPE(
1432 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
1433 ctypes.POINTER(ctypes.wintypes.DWORD))(
1434 ('GetConsoleMode', ctypes.windll.kernel32))
1435 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
1437 def not_a_console(handle):
1438 if handle == INVALID_HANDLE_VALUE or handle is None:
1440 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
1441 GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
1443 if not_a_console(h):
1446 def next_nonbmp_pos(s):
1448 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
1449 except StopIteration:
1453 count = min(next_nonbmp_pos(s), 1024)
1455 ret = WriteConsoleW(
1456 h, s, count if count else 2, ctypes.byref(written), None)
1458 raise OSError('Failed to write string')
1459 if not count: # We just wrote a non-BMP character
1460 assert written.value == 2
1463 assert written.value > 0
1464 s = s[written.value:]
def write_string(s, out=None, encoding=None):
    """ Write the text s to out (sys.stderr by default) and flush it,
    encoding it first when the stream expects bytes """
    stream = sys.stderr if out is None else out
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(stream, 'fileno'):
        if _windows_write_string(s, stream):
            return

    # Python 2 lies about mode of sys.stderr
    expects_bytes = 'b' in getattr(stream, 'mode', '') or sys.version_info[0] < 3
    if expects_bytes:
        stream.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(stream, 'buffer'):
        codec = encoding or getattr(stream, 'encoding', None) or preferredencoding()
        stream.buffer.write(s.encode(codec, 'ignore'))
    else:
        stream.write(s)
    stream.flush()
def bytes_to_intlist(bs):
    """ Convert a byte string into a list of integer byte values """
    if not bs:
        return []
    if not isinstance(bs[0], int):
        # Indexing yields characters here, so map them through ord()
        return [ord(ch) for ch in bs]
    return list(bs)  # Python 3
def intlist_to_bytes(xs):
    """ Pack a sequence of integer byte values into a byte string """
    return compat_struct_pack('%dB' % len(xs), *xs) if xs else b''
1505 # Cross-platform file locking
1506 if sys.platform == 'win32':
1507 import ctypes.wintypes
1510 class OVERLAPPED(ctypes.Structure):
1512 ('Internal', ctypes.wintypes.LPVOID),
1513 ('InternalHigh', ctypes.wintypes.LPVOID),
1514 ('Offset', ctypes.wintypes.DWORD),
1515 ('OffsetHigh', ctypes.wintypes.DWORD),
1516 ('hEvent', ctypes.wintypes.HANDLE),
1519 kernel32 = ctypes.windll.kernel32
1520 LockFileEx = kernel32.LockFileEx
1521 LockFileEx.argtypes = [
1522 ctypes.wintypes.HANDLE, # hFile
1523 ctypes.wintypes.DWORD, # dwFlags
1524 ctypes.wintypes.DWORD, # dwReserved
1525 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1526 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1527 ctypes.POINTER(OVERLAPPED) # Overlapped
1529 LockFileEx.restype = ctypes.wintypes.BOOL
1530 UnlockFileEx = kernel32.UnlockFileEx
1531 UnlockFileEx.argtypes = [
1532 ctypes.wintypes.HANDLE, # hFile
1533 ctypes.wintypes.DWORD, # dwReserved
1534 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1535 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1536 ctypes.POINTER(OVERLAPPED) # Overlapped
1538 UnlockFileEx.restype = ctypes.wintypes.BOOL
1539 whole_low = 0xffffffff
1540 whole_high = 0x7fffffff
1542 def _lock_file(f, exclusive):
1543 overlapped = OVERLAPPED()
1544 overlapped.Offset = 0
1545 overlapped.OffsetHigh = 0
1546 overlapped.hEvent = 0
1547 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1548 handle = msvcrt.get_osfhandle(f.fileno())
1549 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
1550 whole_low, whole_high, f._lock_file_overlapped_p):
1551 raise OSError('Locking file failed: %r' % ctypes.FormatError())
1553 def _unlock_file(f):
1554 assert f._lock_file_overlapped_p
1555 handle = msvcrt.get_osfhandle(f.fileno())
1556 if not UnlockFileEx(handle, 0,
1557 whole_low, whole_high, f._lock_file_overlapped_p):
1558 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
    # Some platforms, such as Jython, are missing fcntl
1565 def _lock_file(f, exclusive):
1566 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1568 def _unlock_file(f):
1569 fcntl.flock(f, fcntl.LOCK_UN)
1571 UNSUPPORTED_MSG = 'file locking is not supported on this platform'
1573 def _lock_file(f, exclusive):
1574 raise IOError(UNSUPPORTED_MSG)
1576 def _unlock_file(f):
1577 raise IOError(UNSUPPORTED_MSG)
class locked_file(object):
    """ File wrapper usable as a context manager: the file is locked on
    entry (shared for 'r', exclusive otherwise) and unlocked and closed
    on exit """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        want_exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, want_exclusive)
        except IOError:
            # Do not leak the descriptor when locking is impossible
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
def get_filesystem_encoding():
    """ Return the filesystem encoding, defaulting to 'utf-8' when the
    interpreter does not report one """
    reported = sys.getfilesystemencoding()
    if reported is None:
        return 'utf-8'
    return reported
def shell_quote(args):
    """ Quote every argument for the shell and join them with spaces """
    fs_encoding = get_filesystem_encoding()

    def _as_text(arg):
        # We may get a filename encoded with 'encodeFilename'
        return arg.decode(fs_encoding) if isinstance(arg, bytes) else arg

    return ' '.join(compat_shlex_quote(_as_text(arg)) for arg in args)
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    Data already smuggled in url is merged into data (which is updated
    in place) before re-encoding. """
    plain_url, previous = unsmuggle_url(url, {})
    data.update(previous)
    payload = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return plain_url + '#' + payload
def unsmuggle_url(smug_url, default=None):
    """ Return (url, data) for a URL produced by smuggle_url, or
    (smug_url, default) when nothing was smuggled """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, fragment = smug_url.rpartition('#')
    encoded = compat_parse_qs(fragment)['__youtubedl_smuggle'][0]
    return url, json.loads(encoded)
def format_bytes(bytes):
    """ Format a byte count with a binary (1024-based) unit suffix.

    bytes may be None (gives 'N/A'), a number, or a str holding a number.
    The result has two decimals, e.g. '1.00KiB'. """
    if bytes is None:
        return 'N/A'
    if isinstance(bytes, str):
        bytes = float(bytes)

    suffixes = ('B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB')
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Clamp to the table: values below 1 would otherwise index from the
        # end of the list and huge values would run past it
        exponent = max(0, min(exponent, len(suffixes) - 1))
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffixes[exponent])
def lookup_unit_table(unit_table, s):
    """ Parse '<number><unit>' at the start of s and return the number
    scaled by unit_table[unit] as an int, or None when s does not match """
    unit_alternatives = '|'.join(re.escape(unit) for unit in unit_table)
    found = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % unit_alternatives, s)
    if found is None:
        return None
    # A comma is accepted as the decimal separator
    number = float(found.group('num').replace(',', '.'))
    return int(number * unit_table[found.group('unit')])
1671 def parse_filesize(s):
1675 # The lower-case forms are of course incorrect and unofficial,
1676 # but we support those too
1693 'megabytes': 1000 ** 2,
1694 'mebibytes': 1024 ** 2,
1700 'gigabytes': 1000 ** 3,
1701 'gibibytes': 1024 ** 3,
1707 'terabytes': 1000 ** 4,
1708 'tebibytes': 1024 ** 4,
1714 'petabytes': 1000 ** 5,
1715 'pebibytes': 1024 ** 5,
1721 'exabytes': 1000 ** 6,
1722 'exbibytes': 1024 ** 6,
1728 'zettabytes': 1000 ** 7,
1729 'zebibytes': 1024 ** 7,
1735 'yottabytes': 1000 ** 8,
1736 'yobibytes': 1024 ** 8,
1739 return lookup_unit_table(_UNIT_TABLE, s)
1748 if re.match(r'^[\d,.]+$', s):
1749 return str_to_int(s)
1760 return lookup_unit_table(_UNIT_TABLE, s)
def parse_resolution(s):
    """ Extract width/height information from a string such as
    '1920x1080', '720p' or '4k'; returns a dict (empty if nothing found) """
    if s is None:
        return {}

    dims = re.search(r'\b(?P<w>\d+)\s*[xX×]\s*(?P<h>\d+)\b', s)
    if dims:
        width, height = int(dims.group('w')), int(dims.group('h'))
        return {
            'width': width,
            'height': height,
        }

    lines = re.search(r'\b(\d+)[pPiI]\b', s)
    if lines:
        return {'height': int(lines.group(1))}

    # 4k / 8k: height is 540 lines per "k"
    kilo = re.search(r'\b([48])[kK]\b', s)
    if kilo:
        return {'height': int(kilo.group(1)) * 540}

    return {}
def month_by_name(name, lang='en'):
    """ Return the number (1-12) of a month given its full name in the
    requested language (English when the language is unknown), or None """
    names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
    if name not in names:
        return None
    return names.index(name) + 1
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations, or None if the abbreviation is unknown """
    for number, month in enumerate(ENGLISH_MONTH_NAMES, 1):
        if month[:3] == abbrev:
            return number
    return None
def fix_xml_ampersands(xml_str):
    """Replace all the bare '&' by '&amp;' in XML.

    Ampersands that already start a predefined entity (amp, lt, gt, apos,
    quot) or a numeric character reference are left untouched. Numeric
    references may have up to 6 hex or 7 decimal digits so that code points
    above U+FFFF (e.g. '&#x1F600;') are recognised too."""
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,6};|#[0-9]{,7};)',
        '&amp;',
        xml_str)
def setproctitle(title):
    """ Best-effort attempt to set the process name through libc's prctl.

    Does nothing when libc.so.6 cannot be loaded, when prctl is not
    available, or on Jython. """
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    except TypeError:
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        return

    # Initialising the buffer from the bytes makes it one item larger than
    # the title, so the string handed to prctl is always NUL-terminated
    buf = ctypes.create_string_buffer(title.encode('utf-8'))
    PR_SET_NAME = 15
    try:
        libc.prctl(PR_SET_NAME, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
def remove_start(s, start):
    """ Strip the prefix start from s when present; None is passed through """
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
def remove_end(s, end):
    """ Strip the suffix end from s when present; None is passed through.

    An empty end leaves s untouched. """
    if s is None or not s.endswith(end):
        return s
    # Slice by absolute length: s[:-len(end)] would be s[:0] for an empty end
    return s[:len(s) - len(end)]
def remove_quotes(s):
    """ Drop one pair of matching single or double quotes surrounding s """
    if s is None or len(s) < 2:
        return s
    first, last = s[0], s[-1]
    if first == last and first in ('"', "'"):
        return s[1:-1]
    return s
def url_basename(url):
    """ Return the last path component of a URL, ignoring surrounding slashes """
    trimmed_path = compat_urlparse.urlparse(url).path.strip('/')
    return trimmed_path.rpartition('/')[2]
def base_url(url):
    """ Return the http(s) URL up to and including the last '/' that
    precedes any query, fragment or '&' """
    prefix = re.match(r'https?://[^?#&]+/', url)
    return prefix.group()
def urljoin(base, path):
    """ Join path onto base.

    Returns path itself when it is already absolute or protocol-relative,
    and None when path is empty or not a string, or when base is not an
    http(s) or protocol-relative URL. """
    if isinstance(path, bytes):
        path = path.decode('utf-8')
    if not path or not isinstance(path, compat_str):
        return None
    # Already absolute (any scheme) or protocol-relative
    if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
        return path

    if isinstance(base, bytes):
        base = base.decode('utf-8')
    if not isinstance(base, compat_str):
        return None
    if not re.match(r'^(?:https?:)?//', base):
        return None
    return compat_urlparse.urljoin(base, path)
class HEADRequest(compat_urllib_request.Request):
    """ Request whose HTTP method is always HEAD """

    def get_method(self):
        return 'HEAD'
class PUTRequest(compat_urllib_request.Request):
    """ Request whose HTTP method is always PUT """

    def get_method(self):
        return 'PUT'
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """ Convert v to int, scaled as int(v) * invscale // scale.

    When get_attr is given, the value is first read from that attribute
    of v. Returns default when v is None or '', and also when it cannot be
    converted (including non-numeric types and infinities). """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v is None or v == '':
        return default
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError, OverflowError):
        # TypeError: lists, dicts etc.; OverflowError: int(float('inf'))
        return default
def str_or_none(v, default=None):
    """ Return v converted to compat_str, or default when v is None """
    if v is None:
        return default
    return compat_str(v)
def str_to_int(int_str):
    """ A more relaxed version of int_or_none: commas, dots and plus
    signs are dropped before conversion """
    if int_str is None:
        return None
    digits = re.sub(r'[,\.\+]', '', int_str)
    return int(digits)
def float_or_none(v, scale=1, invscale=1, default=None):
    """ Convert v to float, scaled as float(v) * invscale / scale.

    Returns default when v is None or cannot be converted (including
    values of non-numeric types). """
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        # TypeError covers lists, dicts and other non-numeric objects
        return default
def bool_or_none(v, default=None):
    """ Return v only if it is a real bool, otherwise default """
    if isinstance(v, bool):
        return v
    return default
def strip_or_none(v):
    """ Return v.strip(), passing None through """
    if v is None:
        return None
    return v.strip()
def url_or_none(url):
    """ Return the stripped url when it is absolute or protocol-relative,
    otherwise None """
    if not url or not isinstance(url, compat_str):
        return None
    candidate = url.strip()
    if re.match(r'^(?:[a-zA-Z][\da-zA-Z.+-]*:)?//', candidate):
        return candidate
    return None
1941 def parse_duration(s):
1942 if not isinstance(s, compat_basestring):
1947 days, hours, mins, secs, ms = [None] * 5
1948 m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', s)
1950 days, hours, mins, secs, ms = m.groups()
1955 [0-9]+\s*y(?:ears?)?\s*
1958 [0-9]+\s*m(?:onths?)?\s*
1961 [0-9]+\s*w(?:eeks?)?\s*
1964 (?P<days>[0-9]+)\s*d(?:ays?)?\s*
1968 (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
1971 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
1974 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
1977 days, hours, mins, secs, ms = m.groups()
1979 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
1981 hours, mins = m.groups()
1987 duration += float(secs)
1989 duration += float(mins) * 60
1991 duration += float(hours) * 60 * 60
1993 duration += float(days) * 24 * 60 * 60
1995 duration += float(ms)
def prepend_extension(filename, ext, expected_real_ext=None):
    """ Insert ext before the real extension of filename.

    When expected_real_ext is given and does not match the actual
    extension, ext is appended to the whole filename instead. """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}{2}'.format(name, ext, real_ext)
def replace_extension(filename, ext, expected_real_ext=None):
    """ Replace the extension of filename with ext.

    When expected_real_ext is given and does not match the actual
    extension, ext is appended to the whole filename instead. """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        stem = filename
    else:
        stem = name
    return '{0}.{1}'.format(stem, ext)
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version).
    Returns False when the binary cannot be started. """
    command = [exe] + args
    try:
        process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        process.communicate()
    except OSError:
        return False
    return exe
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    command = [encodeArgument(exe)] + args
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if youtube-dl is run in the background.
        # See https://github.com/rg3/youtube-dl/issues/955#issuecomment-209789656
        process = subprocess.Popen(
            command,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        output = process.communicate()[0]
    except OSError:
        return False
    if isinstance(output, bytes):  # Python 2.x
        output = output.decode('ascii', 'ignore')
    return detect_exe_version(output, version_re, unrecognized)
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """ Extract a version string from a program's output using version_re
    (first group); returns unrecognized when the pattern is not found """
    assert isinstance(output, compat_str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    found = re.search(pattern, output)
    return found.group(1) if found else unrecognized
class PagedList(object):
    """ Base class for paged lists; subclasses provide getslice() """

    def __len__(self):
        # This is only useful for tests
        everything = self.getslice()
        return len(everything)
class OnDemandPagedList(PagedList):
    """ Paged list that fetches pages through pagefunc(pagenum) only when
    a slice needs them, optionally caching fetched pages """

    def __init__(self, pagefunc, pagesize, use_cache=True):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache
        if use_cache:
            self._cache = {}

    def getslice(self, start=0, end=None):
        """ Return the items with indices in [start, end) as a list """
        collected = []
        for pagenum in itertools.count(start // self._pagesize):
            page_first = pagenum * self._pagesize
            next_page_first = page_first + self._pagesize
            if start >= next_page_first:
                continue

            page = self._cache.get(pagenum) if self._use_cache else None
            if page is None:
                page = list(self._pagefunc(pagenum))
            if self._use_cache:
                self._cache[pagenum] = page

            # Offsets of the requested window inside this page
            if page_first <= start < next_page_first:
                lo = start % self._pagesize
            else:
                lo = 0
            if end is not None and page_first <= end <= next_page_first:
                hi = ((end - 1) % self._pagesize) + 1
            else:
                hi = None

            if lo != 0 or hi is not None:
                page = page[lo:hi]
            collected.extend(page)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page) + lo < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == next_page_first:
                break
        return collected
class InAdvancePagedList(PagedList):
    """ Paged list whose number of pages is known up front """

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """ Return the items with indices in [start, end) as a list """
        collected = []
        first_page = start // self._pagesize
        if end is None:
            stop_page = self._pagecount
            remaining = None
        else:
            stop_page = end // self._pagesize + 1
            remaining = end - start
        # Items to drop from the front of the first fetched page
        to_skip = start - first_page * self._pagesize
        for pagenum in range(first_page, stop_page):
            page = list(self._pagefunc(pagenum))
            if to_skip:
                page = page[to_skip:]
                to_skip = None
            if remaining is not None:
                if len(page) >= remaining:
                    collected.extend(page[:remaining])
                    break
                remaining -= len(page)
            collected.extend(page)
        return collected
def uppercase_escape(s):
    """ Expand literal \\UXXXXXXXX escape sequences in s to the characters
    they denote """
    decode = codecs.getdecoder('unicode_escape')

    def _expand(match):
        return decode(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', _expand, s)
def lowercase_escape(s):
    """ Expand literal \\uXXXX escape sequences in s to the characters
    they denote """
    decode = codecs.getdecoder('unicode_escape')

    def _expand(match):
        return decode(match.group(0))[0]

    return re.sub(r'\\u[0-9a-fA-F]{4}', _expand, s)
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    needs_bytes = sys.version_info < (3, 0) and isinstance(s, compat_str)
    if needs_bytes:
        # Python 2 quote() operates on byte strings
        s = s.encode('utf-8')
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe_chars)
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = compat_urllib_parse_urlparse(url)
    # The host is IDNA-encoded, every other component percent-encoded
    ascii_netloc = parts.netloc.encode('idna').decode('ascii')
    escaped_path = escape_rfc3986(parts.path)
    escaped_params = escape_rfc3986(parts.params)
    escaped_query = escape_rfc3986(parts.query)
    escaped_fragment = escape_rfc3986(parts.fragment)
    rebuilt = parts._replace(
        netloc=ascii_netloc,
        path=escaped_path,
        params=escaped_params,
        query=escaped_query,
        fragment=escaped_fragment)
    return rebuilt.geturl()
def read_batch_urls(batch_fd):
    """ Read URLs from a batch file object, one per line.

    Lines are decoded as UTF-8 when needed and stripped; empty lines and
    lines starting with '#', ';' or ']' are skipped. batch_fd is closed
    before returning. """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # '\ufeff' is the BOM once decoded; '\xef\xbb\xbf' is how its raw
        # bytes appear in text that was read with a single-byte encoding
        for bom in ('\xef\xbb\xbf', '\ufeff'):
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """ URL-encode the given data and return it as ASCII bytes, ready to
    be used as a request body """
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
def update_url_query(url, query):
    """ Return url with the parameters from query merged into its query
    string (existing keys are overwritten) """
    if not query:
        return url
    parts = compat_urlparse.urlparse(url)
    params = compat_parse_qs(parts.query)
    params.update(query)
    new_query = compat_urllib_parse_urlencode(params, True)
    return compat_urlparse.urlunparse(parts._replace(query=new_query))
def update_Request(req, url=None, data=None, headers={}, query={}):
    """ Build a new request from req with the URL, data, headers and query
    parameters overridden or extended; the HTTP method is preserved for
    HEAD and PUT requests """
    merged_headers = req.headers.copy()
    merged_headers.update(headers)
    body = data or req.data
    target_url = update_url_query(url or req.get_full_url(), query)

    method = req.get_method()
    if method == 'HEAD':
        request_class = HEADRequest
    elif method == 'PUT':
        request_class = PUTRequest
    else:
        request_class = compat_urllib_request.Request

    new_req = request_class(
        target_url, data=body, headers=merged_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    # timeout is only set on requests that have already been opened
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
def _multipart_encode_impl(data, boundary):
    """ Encode data as multipart/form-data using the given boundary.

    Returns (body, content_type); raises ValueError if the boundary occurs
    inside one of the encoded fields. """
    content_type = 'multipart/form-data; boundary=%s' % boundary

    body = b''
    for name, value in data.items():
        body += b'--' + boundary.encode('ascii') + b'\r\n'
        if isinstance(name, compat_str):
            name = name.encode('utf-8')
        if isinstance(value, compat_str):
            value = value.encode('utf-8')
        # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
        # suggests sending UTF-8 directly. Firefox sends UTF-8, too
        part = b'Content-Disposition: form-data; name="' + name + b'"\r\n\r\n' + value + b'\r\n'
        if boundary.encode('ascii') in part:
            raise ValueError('Boundary overlaps with data')
        body += part

    # Closing delimiter
    body += b'--' + boundary.encode('ascii') + b'--\r\n'

    return body, content_type
def multipart_encode(data, boundary=None):
    '''
    Encode a dict to RFC 7578-compliant form-data

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If specified a Unicode object, it's used as the boundary. Otherwise
        a random boundary is generated.

    Returns a (body, content_type) tuple. A ValueError is raised when a
    caller-supplied boundary collides with the data; random boundaries are
    simply regenerated.

    Reference: https://tools.ietf.org/html/rfc7578
    '''
    boundary_given = boundary is not None

    while True:
        if boundary is None:
            boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))

        try:
            return _multipart_encode_impl(data, boundary)
        except ValueError:
            if boundary_given:
                raise
            # Collision with a random boundary: draw a new one
            boundary = None
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """ Look up one key, or the first usable of several keys, in d.

    With a list/tuple of keys, missing and None values are skipped, and
    falsy values too unless skip_false_values is False. """
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)

    for key in key_or_keys:
        if key in d and d[key] is not None and (d[key] or not skip_false_values):
            return d[key]
    return default
def try_get(src, getter, expected_type=None):
    """ Apply getter (or each of a list/tuple of getters) to src and return
    the first result obtained without a lookup error that also matches
    expected_type (when given); returns None otherwise """
    getters = getter if isinstance(getter, (list, tuple)) else [getter]
    for attempt in getters:
        try:
            value = attempt(src)
        except (AttributeError, KeyError, TypeError, IndexError):
            continue
        if expected_type is None or isinstance(value, expected_type):
            return value
    return None
def merge_dicts(*dicts):
    """ Merge dicts left to right; earlier values win.

    None values are ignored, and an empty string already stored for a key
    may be replaced by a later non-empty string. """
    merged = {}
    for source in dicts:
        for key, value in source.items():
            if value is None:
                continue
            if key not in merged:
                merged[key] = value
                continue
            current = merged[key]
            fills_empty_string = (
                isinstance(value, compat_str) and value and
                isinstance(current, compat_str) and not current)
            if fills_empty_string:
                merged[key] = value
    return merged
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """ Return string as compat_str, decoding it with encoding/errors when
    it is not one already """
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
2328 TV_PARENTAL_GUIDELINES = {
def parse_age_limit(s):
    """ Parse an age limit from an int (0-21), a string like '18' or '18+',
    a US rating or a TV parental guideline; returns None if unrecognised """
    # bool is deliberately not accepted here, hence the exact type check
    if type(s) is int:
        if 0 <= s <= 21:
            return s
        return None
    if not isinstance(s, compat_basestring):
        return None

    age = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if age:
        return int(age.group('age'))
    if s in US_RATINGS:
        return US_RATINGS[s]

    guideline_names = '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES)
    guideline = re.match(r'^TV[_-]?(%s)$' % guideline_names, s)
    if guideline:
        return TV_PARENTAL_GUIDELINES['TV-' + guideline.group(1)]
    return None
def strip_jsonp(code):
    """ Strip a JSONP wrapper such as 'callback({...});' and return the
    payload between the parentheses; other input is returned unchanged """
    wrapper = re.compile(
        r'''(?sx)^
            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
            (?:\s*&&\s*(?P=func_name))?
            \s*\(\s*(?P<callback_data>.*)\);?
            \s*?(?://[^\n]*)*$''')
    return wrapper.sub(r'\g<callback_data>', code)
2364 def js_to_json(code):
2365 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*'
2366 SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
2368 (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
2369 (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
2374 if v in ('true', 'false', 'null'):
2376 elif v.startswith('/*') or v.startswith('//') or v == ',':
2379 if v[0] in ("'", '"'):
2380 v = re.sub(r'(?s)\\.|"', lambda m: {
2385 }.get(m.group(0), m.group(0)), v[1:-1])
2387 for regex, base in INTEGER_TABLE:
2388 im = re.match(regex, v)
2390 i = int(im.group(1), base)
2391 return '"%d":' % i if v.endswith(':') else '%d' % i
2395 return re.sub(r'''(?sx)
2396 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
2397 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
2398 {comment}|,(?={skip}[\]}}])|
2399 (?:(?<![0-9])[eE]|[a-df-zA-DF-Z_])[.a-zA-Z_0-9]*|
2400 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
2402 '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values.

    Returns a function mapping a quality id to its index in quality_ids,
    or -1 for unknown ids. """
    def rank(qid):
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return rank
2415 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
def limit_length(s, length):
    """ Add ellipses to overly long strings so that the result is at most
    length characters; None is passed through """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    keep = length - len(ELLIPSES)
    return s[:keep] + ELLIPSES
def version_tuple(v):
    """ Split a version string on '.' and '-' into a tuple of ints """
    components = re.split(r'[-.]', v)
    return tuple(map(int, components))
def is_outdated_version(version, limit, assume_new=True):
    """ Tell whether version is older than limit.

    A missing or unparsable version counts as new when assume_new is true,
    as outdated otherwise. """
    fallback = not assume_new
    if not version:
        return fallback
    try:
        current, minimum = version_tuple(version), version_tuple(limit)
    except ValueError:
        return fallback
    return current < minimum
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # Updateable when this module was loaded from a zip archive...
    loader = globals().get('__loader__')
    if isinstance(loader, zipimporter):
        return True
    # ...or when sys carries a 'frozen' attribute
    return hasattr(sys, 'frozen')
def args_to_str(args):
    """ Return a short, shell-quoted string representation of a
    subprocess command """
    quoted = [compat_shlex_quote(arg) for arg in args]
    return ' '.join(quoted)
def error_to_compat_str(err):
    """ Return the message of an exception as text """
    message = str(err)
    if sys.version_info[0] >= 3:
        return message
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    return message.decode(preferredencoding())
2462 def mimetype2ext(mt):
2468 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
2469 # it's the most popular one
2470 'audio/mpeg': 'mp3',
2475 _, _, res = mt.rpartition('/')
2476 res = res.split(';')[0].strip().lower()
2480 'smptett+xml': 'tt',
2484 'x-mp4-fragmented': 'mp4',
2485 'x-ms-sami': 'sami',
2488 'x-mpegurl': 'm3u8',
2489 'vnd.apple.mpegurl': 'm3u8',
2493 'vnd.ms-sstr+xml': 'ism',
2499 def parse_codecs(codecs_str):
2500 # http://tools.ietf.org/html/rfc6381
2503 splited_codecs = list(filter(None, map(
2504 lambda str: str.strip(), codecs_str.strip().strip(',').split(','))))
2505 vcodec, acodec = None, None
2506 for full_codec in splited_codecs:
2507 codec = full_codec.split('.')[0]
2508 if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v', 'hvc1', 'av01'):
2511 elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
2515 write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)
2516 if not vcodec and not acodec:
2517 if len(splited_codecs) == 2:
2522 elif len(splited_codecs) == 1:
2529 'vcodec': vcodec or 'none',
2530 'acodec': acodec or 'none',
def urlhandle_detect_ext(url_handle):
    """ Guess a file extension from a response: first from the filename in
    Content-Disposition, then from the Content-Type """
    header = url_handle.headers.get

    disposition = header('Content-Disposition')
    if disposition:
        attachment = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', disposition)
        if attachment:
            ext = determine_ext(attachment.group('filename'), default_ext=None)
            if ext:
                return ext

    return mimetype2ext(header('Content-Type'))
def encode_data_uri(data, mime_type):
    """ Build a base64 'data:' URI carrying the bytes data with the given
    MIME type """
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    # No limit set, or content available for everyone
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes.

    Returns a match object (truthy) when the decoded text starts with
    optional whitespace followed by '<', otherwise None. """
    # Longer BOMs come first so UTF-32 is not mistaken for UTF-16
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    text = None
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            text = first_bytes[len(bom):].decode(enc, 'replace')
            break
    if text is None:
        text = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', text)
def determine_protocol(info_dict):
    """ Return the download protocol for info_dict: its explicit
    'protocol' entry if set, otherwise a guess from the URL """
    explicit = info_dict.get('protocol')
    if explicit is not None:
        return explicit

    url = info_dict['url']
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    # Manifest formats are recognised by their extension
    ext = determine_ext(url)
    for manifest_ext in ('m3u8', 'f4m'):
        if ext == manifest_ext:
            return manifest_ext

    return compat_urllib_parse_urlparse(url).scheme
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    rows = [header_row] + data
    widths = [max(len(compat_str(cell)) for cell in column) for column in zip(*rows)]
    # Every column but the last is left-aligned and padded to its widest cell
    padded = ['%-' + compat_str(width + 1) + 's' for width in widths[:-1]]
    row_format = ' '.join(padded) + '%s'
    return '\n'.join(row_format % tuple(row) for row in rows)
def _match_one(filter_part, dct):
    """Evaluate a single filter expression against dct.

    Two forms are recognised:
      * comparison: `key OP value`, with OP one of < <= > >= = !=,
        optionally followed by `?` so that a missing key passes;
      * unary: `key` or `!key`.

    Returns the comparison result. For a comparison on a key that is
    missing (or None) the result is the matched `?` text or None.
    Raises ValueError for unparsable parts, for ordering operators applied
    to strings and for invalid numeric values.
    """
    COMPARISON_OPERATORS = {
        '<': operator.lt,
        '<=': operator.le,
        '>': operator.gt,
        '>=': operator.ge,
        '=': operator.eq,
        '!=': operator.ne,
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<key>[a-z_]+)
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?:
            (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
            (?P<quote>["\'])(?P<quotedstrval>(?:\\.|(?!(?P=quote)|\\).)+?)(?P=quote)|
            (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        )
        \s*$
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = COMPARISON_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        if (m.group('quotedstrval') is not None or
            m.group('strval') is not None or
            # If the original field is a string and matching comparisonvalue is
            # a number we should respect the origin of the original field
            # and process comparison value as a string (see
            # https://github.com/rg3/youtube-dl/issues/11082).
            actual_value is not None and m.group('intval') is not None and
                isinstance(actual_value, compat_str)):
            if m.group('op') not in ('=', '!='):
                raise ValueError(
                    'Operator %s does not support string values!' % m.group('op'))
            comparison_value = m.group('quotedstrval') or m.group('strval') or m.group('intval')
            quote = m.group('quote')
            if quote is not None:
                # Unescape the quote character inside a quoted value
                comparison_value = comparison_value.replace(r'\%s' % quote, quote)
        else:
            try:
                comparison_value = int(m.group('intval'))
            except ValueError:
                # Not a plain integer: try a file size, with or without 'B'
                comparison_value = parse_filesize(m.group('intval'))
                if comparison_value is None:
                    comparison_value = parse_filesize(m.group('intval') + 'B')
                if comparison_value is None:
                    raise ValueError(
                        'Invalid integer value %r in filter part %r' % (
                            m.group('intval'), filter_part))
        if actual_value is None:
            return m.group('none_inclusive')
        return op(actual_value, comparison_value)

    UNARY_OPERATORS = {
        '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
        '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        \s*$
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = UNARY_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false

    filter_str is split on '&' and every part must match.
    """

    return all(
        _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
def match_filter_func(filter_str):
    """Build a predicate that checks an info dict against filter_str.

    The returned function yields None when the dict passes the filter and
    a human-readable skip message otherwise.
    """
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        else:
            # Name the video by title, then id, then a generic placeholder
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP/TTML time expression into seconds (float).

    Accepts an offset such as '12.5' or '12.5s', or a clock value
    'H:MM:SS' with an optional fraction introduced by '.' or ':'.
    Returns None for empty input or an unrecognised expression.
    """
    if not time_expr:
        return

    mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if mobj:
        return float(mobj.group('time_offset'))

    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if mobj:
        return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
def srt_subtitles_timecode(seconds):
    """Format a duration in seconds as an SRT timecode 'HH:MM:SS,mmm'.

    The value is rounded to whole milliseconds before being split into
    fields. Truncating each field separately lost a millisecond to float
    error (e.g. 1.001 came out as ',000').
    """
    total_ms = int(round(seconds * 1000))
    total_seconds, millis = divmod(total_ms, 1000)
    total_minutes, secs = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
def dfxp2srt(dfxp_data):
    '''
    Convert a DFXP/TTML subtitle document to SRT.

    @param dfxp_data A bytes-like object containing DFXP data
    @returns A unicode object containing converted SRT data
    @raises ValueError if the document contains no <p> elements
    '''
    # Legacy namespace URIs are rewritten to the current ones before parsing
    LEGACY_NAMESPACES = (
        (b'http://www.w3.org/ns/ttml', [
            b'http://www.w3.org/2004/11/ttaf1',
            b'http://www.w3.org/2006/04/ttaf1',
            b'http://www.w3.org/2006/10/ttaf1',
        ]),
        (b'http://www.w3.org/ns/ttml#styling', [
            b'http://www.w3.org/ns/ttml#style',
        ]),
    )

    SUPPORTED_STYLING = [
        'color',
        'fontFamily',
        'fontSize',
        'fontStyle',
        'fontWeight',
        'textDecoration'
    ]

    _x = functools.partial(xpath_with_ns, ns_map={
        'xml': 'http://www.w3.org/XML/1998/namespace',
        'ttml': 'http://www.w3.org/ns/ttml',
        'tts': 'http://www.w3.org/ns/ttml#styling',
    })

    styles = {}
    default_style = {}

    class TTMLPElementParser(object):
        """XMLParser target that renders one <p> element as SRT markup."""

        def __init__(self):
            # Per-instance state: class-level lists would be shared by
            # every parser created during the conversion
            self._out = ''
            self._unclosed_elements = []
            self._applied_styles = []

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), 'br'):
                self._out += '\n'
            else:
                unclosed_elements = []
                style = {}
                element_style_id = attrib.get('style')
                if default_style:
                    style.update(default_style)
                if element_style_id:
                    style.update(styles.get(element_style_id, {}))
                for prop in SUPPORTED_STYLING:
                    prop_val = attrib.get(_x('tts:' + prop))
                    if prop_val:
                        style[prop] = prop_val
                if style:
                    font = ''
                    for k, v in sorted(style.items()):
                        # Skip properties already in effect from an ancestor
                        if self._applied_styles and self._applied_styles[-1].get(k) == v:
                            continue
                        if k == 'color':
                            font += ' color="%s"' % v
                        elif k == 'fontSize':
                            font += ' size="%s"' % v
                        elif k == 'fontFamily':
                            font += ' face="%s"' % v
                        elif k == 'fontWeight' and v == 'bold':
                            self._out += '<b>'
                            unclosed_elements.append('b')
                        elif k == 'fontStyle' and v == 'italic':
                            self._out += '<i>'
                            unclosed_elements.append('i')
                        elif k == 'textDecoration' and v == 'underline':
                            self._out += '<u>'
                            unclosed_elements.append('u')
                    if font:
                        self._out += '<font' + font + '>'
                        unclosed_elements.append('font')
                    applied_style = {}
                    if self._applied_styles:
                        applied_style.update(self._applied_styles[-1])
                    applied_style.update(style)
                    self._applied_styles.append(applied_style)
                self._unclosed_elements.append(unclosed_elements)

        def end(self, tag):
            if tag not in (_x('ttml:br'), 'br'):
                unclosed_elements = self._unclosed_elements.pop()
                for element in reversed(unclosed_elements):
                    self._out += '</%s>' % element
                if unclosed_elements and self._applied_styles:
                    self._applied_styles.pop()

        def data(self, data):
            self._out += data

        def close(self):
            return self._out.strip()

    def parse_node(node):
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    for k, v in LEGACY_NAMESPACES:
        for ns in v:
            dfxp_data = dfxp_data.replace(ns, k)

    dfxp = compat_etree_fromstring(dfxp_data)
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    style_elements = dfxp.findall(_x('.//ttml:style'))

    def resolve_styles(ignore_missing_parents):
        # One pass over every <style>; returns True when at least one style
        # was deferred because its parent style is not known yet
        deferred = False
        for style in style_elements:
            style_id = style.get('id') or style.get(_x('xml:id'))
            if not style_id:
                continue
            parent_style_id = style.get('style')
            if parent_style_id:
                if parent_style_id in styles:
                    styles[style_id] = styles[parent_style_id].copy()
                elif not ignore_missing_parents:
                    deferred = True
                    continue
            for prop in SUPPORTED_STYLING:
                prop_val = style.get(_x('tts:' + prop))
                if prop_val:
                    styles.setdefault(style_id, {})[prop] = prop_val
        return deferred

    # Parents may be declared after their children, so repeat until nothing
    # is deferred. A pass that defers styles without resolving any new one
    # can never make progress (undefined or cyclic parent): finish with one
    # pass that ignores the missing parents instead of looping forever.
    while True:
        known_styles = len(styles)
        if not resolve_styles(False):
            break
        if len(styles) == known_styles:
            resolve_styles(True)
            break

    # A style set on <body>/<div> becomes the default for every paragraph
    for p in ('body', 'div'):
        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
        if ele is None:
            continue
        style = styles.get(ele.get('style'))
        if not style:
            continue
        default_style.update(style)

    for index, para in enumerate(paras, 1):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
def cli_option(params, command_option, param):
    """Build the argv fragment for an option that takes a value.

    Returns [command_option, value-as-text] when params[param] is set and
    [] when it is missing or None. Every non-None value is converted to
    text, including falsy ones such as 0.
    """
    param = params.get(param)
    if param is None:
        return []
    return [command_option, compat_str(param)]
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Build the argv fragment for a boolean option.

    Returns [] when params[param] is missing or None. Otherwise the value
    must be a bool. It is rendered as true_value/false_value, either as a
    separate argument or joined to command_option with separator.
    """
    param = params.get(param)
    if param is None:
        return []
    assert isinstance(param, bool)
    if separator:
        return [command_option + separator + (true_value if param else false_value)]
    return [command_option, true_value if param else false_value]
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return [command_option] when params[param] equals expected_value, else []."""
    param = params.get(param)
    return [command_option] if param == expected_value else []
def cli_configuration_args(params, param, default=None):
    """Return the list of extra arguments stored under params[param].

    When the entry is missing or None, `default` is returned. If no default
    is given, a fresh empty list is returned, so callers may mutate the
    result without affecting later calls.
    """
    ex_args = params.get(param)
    if ex_args is None:
        return [] if default is None else default
    assert isinstance(ex_args, list)
    return ex_args
2911 class ISO639Utils(object):
2912 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
2971 'iw': 'heb', # Replaced by he in 1989 revision
2981 'in': 'ind', # Replaced by id in 1989 revision
3096 'ji': 'yid', # Replaced by yi in 1989 revision
def short2long(cls, code):
    """Convert language code from ISO 639-1 to ISO 639-2/T

    Only the first two characters of code are looked up in cls._lang_map.
    Returns None when they are not a known key.
    """
    return cls._lang_map.get(code[:2])
def long2short(cls, code):
    """Convert language code from ISO 639-2/T to ISO 639-1

    Scans cls._lang_map for a value equal to code and returns its key.
    Returns None when no entry matches.
    """
    for short_name, long_name in cls._lang_map.items():
        if long_name == code:
            return short_name
    return None
3116 class ISO3166Utils(object):
3117 # From http://data.okfn.org/data/core/country-list
3119 'AF': 'Afghanistan',
3120 'AX': 'Åland Islands',
3123 'AS': 'American Samoa',
3128 'AG': 'Antigua and Barbuda',
3145 'BO': 'Bolivia, Plurinational State of',
3146 'BQ': 'Bonaire, Sint Eustatius and Saba',
3147 'BA': 'Bosnia and Herzegovina',
3149 'BV': 'Bouvet Island',
3151 'IO': 'British Indian Ocean Territory',
3152 'BN': 'Brunei Darussalam',
3154 'BF': 'Burkina Faso',
3160 'KY': 'Cayman Islands',
3161 'CF': 'Central African Republic',
3165 'CX': 'Christmas Island',
3166 'CC': 'Cocos (Keeling) Islands',
3170 'CD': 'Congo, the Democratic Republic of the',
3171 'CK': 'Cook Islands',
3173 'CI': 'Côte d\'Ivoire',
3178 'CZ': 'Czech Republic',
3182 'DO': 'Dominican Republic',
3185 'SV': 'El Salvador',
3186 'GQ': 'Equatorial Guinea',
3190 'FK': 'Falkland Islands (Malvinas)',
3191 'FO': 'Faroe Islands',
3195 'GF': 'French Guiana',
3196 'PF': 'French Polynesia',
3197 'TF': 'French Southern Territories',
3212 'GW': 'Guinea-Bissau',
3215 'HM': 'Heard Island and McDonald Islands',
3216 'VA': 'Holy See (Vatican City State)',
3223 'IR': 'Iran, Islamic Republic of',
3226 'IM': 'Isle of Man',
3236 'KP': 'Korea, Democratic People\'s Republic of',
3237 'KR': 'Korea, Republic of',
3240 'LA': 'Lao People\'s Democratic Republic',
3246 'LI': 'Liechtenstein',
3250 'MK': 'Macedonia, the Former Yugoslav Republic of',
3257 'MH': 'Marshall Islands',
3263 'FM': 'Micronesia, Federated States of',
3264 'MD': 'Moldova, Republic of',
3275 'NL': 'Netherlands',
3276 'NC': 'New Caledonia',
3277 'NZ': 'New Zealand',
3282 'NF': 'Norfolk Island',
3283 'MP': 'Northern Mariana Islands',
3288 'PS': 'Palestine, State of',
3290 'PG': 'Papua New Guinea',
3293 'PH': 'Philippines',
3297 'PR': 'Puerto Rico',
3301 'RU': 'Russian Federation',
3303 'BL': 'Saint Barthélemy',
3304 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
3305 'KN': 'Saint Kitts and Nevis',
3306 'LC': 'Saint Lucia',
3307 'MF': 'Saint Martin (French part)',
3308 'PM': 'Saint Pierre and Miquelon',
3309 'VC': 'Saint Vincent and the Grenadines',
3312 'ST': 'Sao Tome and Principe',
3313 'SA': 'Saudi Arabia',
3317 'SL': 'Sierra Leone',
3319 'SX': 'Sint Maarten (Dutch part)',
3322 'SB': 'Solomon Islands',
3324 'ZA': 'South Africa',
3325 'GS': 'South Georgia and the South Sandwich Islands',
3326 'SS': 'South Sudan',
3331 'SJ': 'Svalbard and Jan Mayen',
3334 'CH': 'Switzerland',
3335 'SY': 'Syrian Arab Republic',
3336 'TW': 'Taiwan, Province of China',
3338 'TZ': 'Tanzania, United Republic of',
3340 'TL': 'Timor-Leste',
3344 'TT': 'Trinidad and Tobago',
3347 'TM': 'Turkmenistan',
3348 'TC': 'Turks and Caicos Islands',
3352 'AE': 'United Arab Emirates',
3353 'GB': 'United Kingdom',
3354 'US': 'United States',
3355 'UM': 'United States Minor Outlying Islands',
3359 'VE': 'Venezuela, Bolivarian Republic of',
3361 'VG': 'Virgin Islands, British',
3362 'VI': 'Virgin Islands, U.S.',
3363 'WF': 'Wallis and Futuna',
3364 'EH': 'Western Sahara',
def short2full(cls, code):
    """Convert an ISO 3166-1 alpha-2 country code to the corresponding full name

    The lookup in cls._country_map is case-insensitive. Returns None for
    an unknown code.
    """
    return cls._country_map.get(code.upper())
3376 class GeoUtils(object):
3377 # Major IPv4 address blocks per country
3379 'AD': '85.94.160.0/19',
3380 'AE': '94.200.0.0/13',
3381 'AF': '149.54.0.0/17',
3382 'AG': '209.59.64.0/18',
3383 'AI': '204.14.248.0/21',
3384 'AL': '46.99.0.0/16',
3385 'AM': '46.70.0.0/15',
3386 'AO': '105.168.0.0/13',
3387 'AP': '159.117.192.0/21',
3388 'AR': '181.0.0.0/12',
3389 'AS': '202.70.112.0/20',
3390 'AT': '84.112.0.0/13',
3391 'AU': '1.128.0.0/11',
3392 'AW': '181.41.0.0/18',
3393 'AZ': '5.191.0.0/16',
3394 'BA': '31.176.128.0/17',
3395 'BB': '65.48.128.0/17',
3396 'BD': '114.130.0.0/16',
3398 'BF': '129.45.128.0/17',
3399 'BG': '95.42.0.0/15',
3400 'BH': '37.131.0.0/17',
3401 'BI': '154.117.192.0/18',
3402 'BJ': '137.255.0.0/16',
3403 'BL': '192.131.134.0/24',
3404 'BM': '196.12.64.0/18',
3405 'BN': '156.31.0.0/16',
3406 'BO': '161.56.0.0/16',
3407 'BQ': '161.0.80.0/20',
3408 'BR': '152.240.0.0/12',
3409 'BS': '24.51.64.0/18',
3410 'BT': '119.2.96.0/19',
3411 'BW': '168.167.0.0/16',
3412 'BY': '178.120.0.0/13',
3413 'BZ': '179.42.192.0/18',
3414 'CA': '99.224.0.0/11',
3415 'CD': '41.243.0.0/16',
3416 'CF': '196.32.200.0/21',
3417 'CG': '197.214.128.0/17',
3418 'CH': '85.0.0.0/13',
3419 'CI': '154.232.0.0/14',
3420 'CK': '202.65.32.0/19',
3421 'CL': '152.172.0.0/14',
3422 'CM': '165.210.0.0/15',
3423 'CN': '36.128.0.0/10',
3424 'CO': '181.240.0.0/12',
3425 'CR': '201.192.0.0/12',
3426 'CU': '152.206.0.0/15',
3427 'CV': '165.90.96.0/19',
3428 'CW': '190.88.128.0/17',
3429 'CY': '46.198.0.0/15',
3430 'CZ': '88.100.0.0/14',
3432 'DJ': '197.241.0.0/17',
3433 'DK': '87.48.0.0/12',
3434 'DM': '192.243.48.0/20',
3435 'DO': '152.166.0.0/15',
3436 'DZ': '41.96.0.0/12',
3437 'EC': '186.68.0.0/15',
3438 'EE': '90.190.0.0/15',
3439 'EG': '156.160.0.0/11',
3440 'ER': '196.200.96.0/20',
3441 'ES': '88.0.0.0/11',
3442 'ET': '196.188.0.0/14',
3443 'EU': '2.16.0.0/13',
3444 'FI': '91.152.0.0/13',
3445 'FJ': '144.120.0.0/16',
3446 'FM': '119.252.112.0/20',
3447 'FO': '88.85.32.0/19',
3449 'GA': '41.158.0.0/15',
3451 'GD': '74.122.88.0/21',
3452 'GE': '31.146.0.0/16',
3453 'GF': '161.22.64.0/18',
3454 'GG': '62.68.160.0/19',
3455 'GH': '45.208.0.0/14',
3456 'GI': '85.115.128.0/19',
3457 'GL': '88.83.0.0/19',
3458 'GM': '160.182.0.0/15',
3459 'GN': '197.149.192.0/18',
3460 'GP': '104.250.0.0/19',
3461 'GQ': '105.235.224.0/20',
3462 'GR': '94.64.0.0/13',
3463 'GT': '168.234.0.0/16',
3464 'GU': '168.123.0.0/16',
3465 'GW': '197.214.80.0/20',
3466 'GY': '181.41.64.0/18',
3467 'HK': '113.252.0.0/14',
3468 'HN': '181.210.0.0/16',
3469 'HR': '93.136.0.0/13',
3470 'HT': '148.102.128.0/17',
3471 'HU': '84.0.0.0/14',
3472 'ID': '39.192.0.0/10',
3473 'IE': '87.32.0.0/12',
3474 'IL': '79.176.0.0/13',
3475 'IM': '5.62.80.0/20',
3476 'IN': '117.192.0.0/10',
3477 'IO': '203.83.48.0/21',
3478 'IQ': '37.236.0.0/14',
3479 'IR': '2.176.0.0/12',
3480 'IS': '82.221.0.0/16',
3481 'IT': '79.0.0.0/10',
3482 'JE': '87.244.64.0/18',
3483 'JM': '72.27.0.0/17',
3484 'JO': '176.29.0.0/16',
3485 'JP': '126.0.0.0/8',
3486 'KE': '105.48.0.0/12',
3487 'KG': '158.181.128.0/17',
3488 'KH': '36.37.128.0/17',
3489 'KI': '103.25.140.0/22',
3490 'KM': '197.255.224.0/20',
3491 'KN': '198.32.32.0/19',
3492 'KP': '175.45.176.0/22',
3493 'KR': '175.192.0.0/10',
3494 'KW': '37.36.0.0/14',
3495 'KY': '64.96.0.0/15',
3496 'KZ': '2.72.0.0/13',
3497 'LA': '115.84.64.0/18',
3498 'LB': '178.135.0.0/16',
3499 'LC': '192.147.231.0/24',
3500 'LI': '82.117.0.0/19',
3501 'LK': '112.134.0.0/15',
3502 'LR': '41.86.0.0/19',
3503 'LS': '129.232.0.0/17',
3504 'LT': '78.56.0.0/13',
3505 'LU': '188.42.0.0/16',
3506 'LV': '46.109.0.0/16',
3507 'LY': '41.252.0.0/14',
3508 'MA': '105.128.0.0/11',
3509 'MC': '88.209.64.0/18',
3510 'MD': '37.246.0.0/16',
3511 'ME': '178.175.0.0/17',
3512 'MF': '74.112.232.0/21',
3513 'MG': '154.126.0.0/17',
3514 'MH': '117.103.88.0/21',
3515 'MK': '77.28.0.0/15',
3516 'ML': '154.118.128.0/18',
3517 'MM': '37.111.0.0/17',
3518 'MN': '49.0.128.0/17',
3519 'MO': '60.246.0.0/16',
3520 'MP': '202.88.64.0/20',
3521 'MQ': '109.203.224.0/19',
3522 'MR': '41.188.64.0/18',
3523 'MS': '208.90.112.0/22',
3524 'MT': '46.11.0.0/16',
3525 'MU': '105.16.0.0/12',
3526 'MV': '27.114.128.0/18',
3527 'MW': '105.234.0.0/16',
3528 'MX': '187.192.0.0/11',
3529 'MY': '175.136.0.0/13',
3530 'MZ': '197.218.0.0/15',
3531 'NA': '41.182.0.0/16',
3532 'NC': '101.101.0.0/18',
3533 'NE': '197.214.0.0/18',
3534 'NF': '203.17.240.0/22',
3535 'NG': '105.112.0.0/12',
3536 'NI': '186.76.0.0/15',
3537 'NL': '145.96.0.0/11',
3538 'NO': '84.208.0.0/13',
3539 'NP': '36.252.0.0/15',
3540 'NR': '203.98.224.0/19',
3541 'NU': '49.156.48.0/22',
3542 'NZ': '49.224.0.0/14',
3543 'OM': '5.36.0.0/15',
3544 'PA': '186.72.0.0/15',
3545 'PE': '186.160.0.0/14',
3546 'PF': '123.50.64.0/18',
3547 'PG': '124.240.192.0/19',
3548 'PH': '49.144.0.0/13',
3549 'PK': '39.32.0.0/11',
3550 'PL': '83.0.0.0/11',
3551 'PM': '70.36.0.0/20',
3552 'PR': '66.50.0.0/16',
3553 'PS': '188.161.0.0/16',
3554 'PT': '85.240.0.0/13',
3555 'PW': '202.124.224.0/20',
3556 'PY': '181.120.0.0/14',
3557 'QA': '37.210.0.0/15',
3558 'RE': '139.26.0.0/16',
3559 'RO': '79.112.0.0/13',
3560 'RS': '178.220.0.0/14',
3561 'RU': '5.136.0.0/13',
3562 'RW': '105.178.0.0/15',
3563 'SA': '188.48.0.0/13',
3564 'SB': '202.1.160.0/19',
3565 'SC': '154.192.0.0/11',
3566 'SD': '154.96.0.0/13',
3567 'SE': '78.64.0.0/12',
3568 'SG': '152.56.0.0/14',
3569 'SI': '188.196.0.0/14',
3570 'SK': '78.98.0.0/15',
3571 'SL': '197.215.0.0/17',
3572 'SM': '89.186.32.0/19',
3573 'SN': '41.82.0.0/15',
3574 'SO': '197.220.64.0/19',
3575 'SR': '186.179.128.0/17',
3576 'SS': '105.235.208.0/21',
3577 'ST': '197.159.160.0/19',
3578 'SV': '168.243.0.0/16',
3579 'SX': '190.102.0.0/20',
3581 'SZ': '41.84.224.0/19',
3582 'TC': '65.255.48.0/20',
3583 'TD': '154.68.128.0/19',
3584 'TG': '196.168.0.0/14',
3585 'TH': '171.96.0.0/13',
3586 'TJ': '85.9.128.0/18',
3587 'TK': '27.96.24.0/21',
3588 'TL': '180.189.160.0/20',
3589 'TM': '95.85.96.0/19',
3590 'TN': '197.0.0.0/11',
3591 'TO': '175.176.144.0/21',
3592 'TR': '78.160.0.0/11',
3593 'TT': '186.44.0.0/15',
3594 'TV': '202.2.96.0/19',
3595 'TW': '120.96.0.0/11',
3596 'TZ': '156.156.0.0/14',
3597 'UA': '93.72.0.0/13',
3598 'UG': '154.224.0.0/13',
3600 'UY': '167.56.0.0/13',
3601 'UZ': '82.215.64.0/18',
3602 'VA': '212.77.0.0/19',
3603 'VC': '24.92.144.0/20',
3604 'VE': '186.88.0.0/13',
3605 'VG': '172.103.64.0/18',
3606 'VI': '146.226.0.0/16',
3607 'VN': '14.160.0.0/11',
3608 'VU': '202.80.32.0/20',
3609 'WF': '117.20.32.0/21',
3610 'WS': '202.4.32.0/19',
3611 'YE': '134.35.0.0/16',
3612 'YT': '41.242.116.0/22',
3613 'ZA': '41.0.0.0/11',
3614 'ZM': '165.56.0.0/13',
3615 'ZW': '41.85.192.0/19',
def random_ipv4(cls, code_or_block):
    """Return a random IPv4 address (dotted-quad text) from an address block.

    code_or_block is either a two-character country code, looked up
    case-insensitively in cls._country_ip_map, or a CIDR block such as
    '10.0.0.0/8'. Returns None for a country code with no known block.
    """
    if len(code_or_block) == 2:
        block = cls._country_ip_map.get(code_or_block.upper())
        if not block:
            return None
    else:
        block = code_or_block
    addr, preflen = block.split('/')
    addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
    # Setting every host bit gives the highest address of the block
    addr_max = addr_min | (0xffffffff >> int(preflen))
    return compat_str(socket.inet_ntoa(
        compat_struct_pack('!L', random.randint(addr_min, addr_max))))
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler whose proxy can be overridden per request.

    A request carrying a 'Ytdl-request-proxy' header uses that proxy
    instead of the handler-wide one. The header is removed before the
    request goes out.
    """

    def __init__(self, proxies=None):
        # Set default handlers
        for scheme in ('http', 'https'):
            setattr(self, '%s_open' % scheme,
                    lambda r, proxy='__noproxy__', type=scheme, meth=self.proxy_open:
                        meth(r, proxy, type))
        compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # youtube-dl's http/https handlers wrap the socket with socks
            return None
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
3658 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
3659 # released into Public Domain
3660 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a byte string.

    If optional blocksize is given and greater than zero, pad the front of the
    byte string with binary zeros so that the length is a multiple of
    blocksize.
    """
    # after much testing, this algorithm was deemed to be the fastest
    s = b''
    n = int(n)
    while n > 0:
        s = compat_struct_pack('>I', n & 0xffffffff) + s
        n = n >> 32
    # strip off leading zeros; nothing is left only when n == 0
    s = s.lstrip(b'\000') or b'\000'
    # add back some pad bytes.  this could be done more efficiently w.r.t. the
    # de-padding being done above, but sigh...
    if blocksize > 0 and len(s) % blocksize:
        s = (blocksize - len(s) % blocksize) * b'\000' + s
    return s
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a byte string to a long integer.

    This is (essentially) the inverse of long_to_bytes().
    """
    acc = 0
    length = len(s)
    if length % 4:
        # Left-pad with zeros to a whole number of 32-bit words
        extra = (4 - length % 4)
        s = b'\000' * extra + s
        length = length + extra
    for i in range(0, length, 4):
        acc = (acc << 32) + compat_struct_unpack('>I', s[i:i + 4])[0]
    return acc
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''

    # The payload is the input bytes in reversed order, read as one integer
    payload = int(binascii.hexlify(data[::-1]), 16)
    encrypted = pow(payload, exponent, modulus)
    return '%x' % encrypted
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    @param {int[]} data        input data
    @param {int}   length      target length
    @returns {int[]}           padded data
    @raises ValueError         if data is longer than length - 11
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # Padding octets must be nonzero: the first zero octet after the
    # 00 02 header marks where the payload begins
    pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
    return [0, 2] + pseudo_random + [0] + list(data)
def encode_base_n(num, n, table=None):
    """Encode the integer num in base n.

    table supplies the digit characters and defaults to the first n
    characters of 0-9a-zA-Z. Negative numbers are encoded as '-' followed
    by the encoding of their absolute value.
    Raises ValueError when n exceeds the table length, or when a nonzero
    num is requested in a base below 2.
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    if n < 2:
        # Repeated division by 1 would never reach zero
        raise ValueError('base %d is not supported' % n)

    sign = ''
    if num < 0:
        # Floor division of a negative number gets stuck at -1
        sign = '-'
        num = -num

    ret = ''
    while num:
        ret = table[num % n] + ret
        num = num // n
    return sign + ret
def decode_packed_codes(code):
    """Unpack JavaScript matched by PACKED_CODES_RE.

    The four captured groups are the obfuscated source, the numeric base,
    the symbol count and the '|'-separated symbol list. Each word of the
    obfuscated source is replaced by the symbol whose index it encodes in
    that base. A word with no table entry is kept unchanged.
    Raises ValueError when code contains no packed section.
    """
    mobj = re.search(PACKED_CODES_RE, code)
    if mobj is None:
        raise ValueError('No packed code found')
    obfuscated_code, base, count, symbols = mobj.groups()
    base = int(base)
    count = int(count)
    symbols = symbols.split('|')
    symbol_table = {}

    while count:
        count -= 1
        base_n_count = encode_base_n(count, base)
        # An empty symbol means the word stands for itself
        symbol_table[base_n_count] = symbols[count] or base_n_count

    return re.sub(
        r'\b(\w+)\b',
        lambda word: symbol_table.get(word.group(0), word.group(0)),
        obfuscated_code)
def parse_m3u8_attributes(attrib):
    """Parse an m3u8 attribute list ('KEY=value,KEY="quoted, value"') into a dict.

    Surrounding double quotes are stripped from quoted values. An empty
    quoted value (KEY="") is kept as an empty string.
    """
    info = {}
    for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]*"|[^",]+)(?:,|$)', attrib):
        if val.startswith('"'):
            val = val[1:-1]
        info[key] = val
    return info
def urshift(val, n):
    """Unsigned right shift of val by n bits.

    A negative val is first offset by 2**32, so it is shifted as its
    32-bit two's-complement bit pattern.
    """
    return val >> n if val >= 0 else (val + 0x100000000) >> n
3790 # Based on png2str() written by @gdkchan and improved by @yokrysty
3791 # Originally posted at https://github.com/rg3/youtube-dl/issues/9706
def decode_png(png_data):
    """Decode a PNG image into rows of colour-channel bytes.

    Returns (width, height, pixels), where pixels holds one list per
    scanline with width * 3 integer values each.
    Raises IOError when the signature or IHDR chunk is missing, or when
    the file has no IDAT data.

    NOTE(review): the stride is hard-coded to width * 3, so this looks
    like it only supports non-interlaced 8-bit RGB images; the IHDR
    bit depth and colour type are not checked - confirm against callers.
    """
    # Reference: https://www.w3.org/TR/PNG/
    header = png_data[8:]

    if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
        raise IOError('Not a valid PNG file.')

    int_map = {1: '>B', 2: '>H', 4: '>I'}

    def unpack_integer(x):
        return compat_struct_unpack(int_map[len(x)], x)[0]

    chunks = []

    # Walk the chunk list by offset instead of re-slicing the remaining
    # buffer after every field
    pos = 8
    total = len(png_data)
    while pos < total:
        length = unpack_integer(png_data[pos:pos + 4])
        chunk_type = png_data[pos + 4:pos + 8]
        chunk_data = png_data[pos + 8:pos + 8 + length]
        pos += 8 + length + 4  # length + type, payload, CRC (not verified)

        chunks.append({
            'type': chunk_type,
            'length': length,
            'data': chunk_data
        })

    ihdr = chunks[0]['data']

    width = unpack_integer(ihdr[:4])
    height = unpack_integer(ihdr[4:8])

    idat = b''.join(
        chunk['data'] for chunk in chunks if chunk['type'] == b'IDAT')

    if not idat:
        raise IOError('Unable to read PNG data.')

    decompressed_data = bytearray(zlib.decompress(idat))

    stride = width * 3
    pixels = []

    for y in range(height):
        # Every scanline is prefixed with one filter-type byte
        base_pos = y * (1 + stride)
        filter_type = decompressed_data[base_pos]

        current_row = []
        previous_row = pixels[y - 1] if y > 0 else None

        pixels.append(current_row)

        for x in range(stride):
            color = decompressed_data[1 + base_pos + x]

            # Neighbours used by the filters; 0 where they fall off the image
            left = current_row[x - 3] if x > 2 else 0
            up = previous_row[x] if y > 0 else 0

            if filter_type == 1:  # Sub
                color = (color + left) & 0xff
            elif filter_type == 2:  # Up
                color = (color + up) & 0xff
            elif filter_type == 3:  # Average
                color = (color + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                a = left
                b = up
                c = previous_row[x - 3] if x > 2 and y > 0 else 0

                p = a + b - c

                pa = abs(p - a)
                pb = abs(p - b)
                pc = abs(p - c)

                if pa <= pb and pa <= pc:
                    color = (color + a) & 0xff
                elif pb <= pc:
                    color = (color + b) & 0xff
                else:
                    color = (color + c) & 0xff

            current_row.append(color)

    return width, height, pixels
def write_xattr(path, key, value):
    """Set the extended attribute key to value on the file at path.

    Back-ends are tried in this order: the Python `xattr` module (either
    the pyxattr or the xattr flavour), NTFS Alternate Data Streams when
    compat_os_name is 'nt', then the `setfattr` or `xattr` command-line
    tools.

    Raises XAttrMetadataError when the chosen back-end fails to write and
    XAttrUnavailableError when no usable back-end exists (including a
    pyxattr older than 0.5.0).
    """
    # This mess below finds the best xattr tool for the job
    try:
        # try the pyxattr module...
        import xattr

        if hasattr(xattr, 'set'):  # pyxattr
            # Unicode arguments are not supported in python-pyxattr until
            # version 0.5.0
            # See https://github.com/rg3/youtube-dl/issues/5498
            pyxattr_required_version = '0.5.0'
            if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
                # TODO: fallback to CLI tools
                raise XAttrUnavailableError(
                    'python-pyxattr is detected but is too old. '
                    'youtube-dl requires %s or above while your version is %s. '
                    'Falling back to other xattr implementations' % (
                        pyxattr_required_version, xattr.__version__))

            setxattr = xattr.set
        else:  # xattr
            setxattr = xattr.setxattr

        try:
            setxattr(path, key, value)
        except EnvironmentError as e:
            raise XAttrMetadataError(e.errno, e.strerror)

    except ImportError:
        if compat_os_name == 'nt':
            # Write xattrs to NTFS Alternate Data Streams:
            # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
            assert ':' not in key
            assert os.path.exists(path)

            ads_fn = path + ':' + key
            try:
                with open(ads_fn, 'wb') as f:
                    f.write(value)
            except EnvironmentError as e:
                raise XAttrMetadataError(e.errno, e.strerror)
        else:
            user_has_setfattr = check_executable('setfattr', ['--version'])
            user_has_xattr = check_executable('xattr', ['-h'])

            if user_has_setfattr or user_has_xattr:

                # The command-line tools take the value as text
                value = value.decode('utf-8')
                if user_has_setfattr:
                    executable = 'setfattr'
                    opts = ['-n', key, '-v', value]
                elif user_has_xattr:
                    executable = 'xattr'
                    opts = ['-w', key, value]

                cmd = ([encodeFilename(executable, True)] +
                       [encodeArgument(o) for o in opts] +
                       [encodeFilename(path, True)])

                try:
                    p = subprocess.Popen(
                        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
                except EnvironmentError as e:
                    raise XAttrMetadataError(e.errno, e.strerror)
                stdout, stderr = p.communicate()
                stderr = stderr.decode('utf-8', 'replace')
                if p.returncode != 0:
                    raise XAttrMetadataError(p.returncode, stderr)

            else:
                # On Unix, and can't find pyxattr, setfattr, or xattr.
                if sys.platform.startswith('linux'):
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'pyxattr' or 'xattr' "
                        "modules, or the GNU 'attr' package "
                        "(which contains the 'setfattr' tool).")
                else:
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'xattr' module, "
                        "or the 'xattr' binary.")
def random_birthday(year_field, month_field, day_field):
    """Return a random date between 1950-01-01 and 1995-12-31 as form fields.

    The result maps the three given field names to the year, month and
    day of the chosen date, each as an unpadded decimal string.
    """
    start_date = datetime.date(1950, 1, 1)
    end_date = datetime.date(1995, 12, 31)
    offset = random.randint(0, (end_date - start_date).days)
    random_date = start_date + datetime.timedelta(offset)
    return {
        year_field: str(random_date.year),
        month_field: str(random_date.month),
        day_field: str(random_date.day),
    }