2 # -*- coding: utf-8 -*-
4 from __future__ import unicode_literals
33 import xml.etree.ElementTree
40 compat_etree_fromstring,
45 compat_socket_create_connection,
49 compat_urllib_parse_urlencode,
50 compat_urllib_parse_urlparse,
51 compat_urllib_request,
def register_socks_protocols():
    """Register the SOCKS URL schemes with urlparse.

    Adds each scheme to compat_urlparse.uses_netloc so that
    socks*:// proxy URLs get their host/port split out correctly.
    Idempotent: schemes already present are left untouched.
    """
    known_schemes = compat_urlparse.uses_netloc
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in known_schemes:
            known_schemes.append(scheme)
# Type object for compiled regular-expression patterns, obtained by compiling
# a trivial pattern; the re module exposes no stable public name for it across
# the Python versions this file supports.
# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))
75 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/44.0 (Chrome)',
76 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
77 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
78 'Accept-Encoding': 'gzip, deflate',
79 'Accept-Language': 'en-us,en;q=0.5',
# Full English month names, January-first; presumably consumed by the
# date-parsing helpers elsewhere in this module — confirm at call sites.
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']
90 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
91 'flv', 'f4v', 'f4a', 'f4b',
92 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
102 'f4f', 'f4m', 'm3u8', 'smil')
# needed for sanitizing filenames in restricted mode
# Maps each accented Latin-1 letter to a plain-ASCII replacement; a few
# entries expand to two letters ('Æ' -> 'AE', 'ß' -> 'ss', 'æ' -> 'ae').
ACCENT_CHARS = {
    accented: plain
    for accented, plain in zip(
        'ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ',
        itertools.chain(
            'AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOUUUUYP', ['ss'],
            'aaaaaa', ['ae'], 'ceeeeiiiionoooooouuuuypy'))
}
110 def preferredencoding():
111 """Get preferred encoding.
113 Returns the best encoding scheme for the system, based on
114 locale.getpreferredencoding() and some further tweaks.
117 pref = locale.getpreferredencoding()
125 def write_json_file(obj, fn):
126 """ Encode obj as JSON and write it to fn, atomically if possible """
128 fn = encodeFilename(fn)
129 if sys.version_info < (3, 0) and sys.platform != 'win32':
130 encoding = get_filesystem_encoding()
131 # os.path.basename returns a bytes object, but NamedTemporaryFile
132 # will fail if the filename contains non ascii characters unless we
133 # use a unicode object
134 path_basename = lambda f: os.path.basename(fn).decode(encoding)
135 # the same for os.path.dirname
136 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
138 path_basename = os.path.basename
139 path_dirname = os.path.dirname
143 'prefix': path_basename(fn) + '.',
144 'dir': path_dirname(fn),
148 # In Python 2.x, json.dump expects a bytestream.
149 # In Python 3.x, it writes to a character stream
150 if sys.version_info < (3, 0):
158 tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
163 if sys.platform == 'win32':
164 # Need to remove existing file on Windows, else os.rename raises
165 # WindowsError or FileExistsError.
170 os.rename(tf.name, fn)
179 if sys.version_info >= (2, 7):
180 def find_xpath_attr(node, xpath, key, val=None):
181 """ Find the xpath xpath[@key=val] """
182 assert re.match(r'^[a-zA-Z_-]+$', key)
183 expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
184 return node.find(expr)
186 def find_xpath_attr(node, xpath, key, val=None):
187 for f in node.findall(compat_xpath(xpath)):
188 if key not in f.attrib:
190 if val is None or f.attrib.get(key) == val:
194 # On python2.6 the xml.etree.ElementTree.Element methods don't support
195 # the namespace parameter
198 def xpath_with_ns(path, ns_map):
199 components = [c.split(':') for c in path.split('/')]
203 replaced.append(c[0])
206 replaced.append('{%s}%s' % (ns_map[ns], tag))
207 return '/'.join(replaced)
210 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
211 def _find_xpath(xpath):
212 return node.find(compat_xpath(xpath))
214 if isinstance(xpath, (str, compat_str)):
215 n = _find_xpath(xpath)
223 if default is not NO_DEFAULT:
226 name = xpath if name is None else name
227 raise ExtractorError('Could not find XML element %s' % name)
233 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
234 n = xpath_element(node, xpath, name, fatal=fatal, default=default)
235 if n is None or n == default:
238 if default is not NO_DEFAULT:
241 name = xpath if name is None else name
242 raise ExtractorError('Could not find XML element\'s text %s' % name)
248 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
249 n = find_xpath_attr(node, xpath, key)
251 if default is not NO_DEFAULT:
254 name = '%s[@%s]' % (xpath, key) if name is None else name
255 raise ExtractorError('Could not find XML attribute %s' % name)
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # Thin wrapper: delegates to get_element_by_attribute with attribute 'id'.
    # NOTE: the parameter name 'id' shadows the builtin but is kept for
    # interface compatibility with existing callers.
    return get_element_by_attribute('id', id, html)
266 def get_element_by_attribute(attribute, value, html):
267 """Return the content of the tag with the specified attribute in the passed HTML document"""
269 m = re.search(r'''(?xs)
271 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
273 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
277 ''' % (re.escape(attribute), re.escape(value)), html)
281 res = m.group('content')
283 if res.startswith('"') or res.startswith("'"):
286 return unescapeHTML(res)
289 class HTMLAttributeParser(compat_HTMLParser):
290 """Trivial HTML parser to gather the attributes for a single element"""
293 compat_HTMLParser.__init__(self)
295 def handle_starttag(self, tag, attrs):
296 self.attrs = dict(attrs)
299 def extract_attributes(html_element):
300 """Given a string for an HTML element such as
302 a="foo" B="bar" c="&98;az" d=boz
303 empty= noval entity="&"
306 Decode and return a dictionary of attributes.
308 'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
309 'empty': '', 'noval': None, 'entity': '&',
310 'sq': '"', 'dq': '\''
312 NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
313 but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
315 parser = HTMLAttributeParser()
316 parser.feed(html_element)
321 def clean_html(html):
322 """Clean an HTML snippet into a readable string"""
324 if html is None: # Convenience for sanitizing descriptions etc.
328 html = html.replace('\n', ' ')
329 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
330 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
332 html = re.sub('<.*?>', '', html)
333 # Replace html entities
334 html = unescapeHTML(html)
338 def sanitize_open(filename, open_mode):
339 """Try to open the given filename, and slightly tweak it if this fails.
341 Attempts to open the given filename. If this fails, it tries to change
342 the filename slightly, step by step, until it's either able to open it
343 or it fails and raises a final exception, like the standard open()
346 It returns the tuple (stream, definitive_file_name).
350 if sys.platform == 'win32':
352 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
353 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
354 stream = open(encodeFilename(filename), open_mode)
355 return (stream, filename)
356 except (IOError, OSError) as err:
357 if err.errno in (errno.EACCES,):
360 # In case of error, try to remove win32 forbidden chars
361 alt_filename = sanitize_path(filename)
362 if alt_filename == filename:
365 # An exception here should be caught in the caller
366 stream = open(encodeFilename(alt_filename), open_mode)
367 return (stream, alt_filename)
370 def timeconvert(timestr):
371 """Convert RFC 2822 defined time string into system timestamp"""
373 timetuple = email.utils.parsedate_tz(timestr)
374 if timetuple is not None:
375 timestamp = email.utils.mktime_tz(timetuple)
379 def sanitize_filename(s, restricted=False, is_id=False):
380 """Sanitizes a string so it could be used as part of a filename.
381 If restricted is set, use a stricter subset of allowed characters.
382 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
384 def replace_insane(char):
385 if restricted and char in ACCENT_CHARS:
386 return ACCENT_CHARS[char]
387 if char == '?' or ord(char) < 32 or ord(char) == 127:
390 return '' if restricted else '\''
392 return '_-' if restricted else ' -'
393 elif char in '\\/|*<>':
395 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
397 if restricted and ord(char) > 127:
402 s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
403 result = ''.join(map(replace_insane, s))
405 while '__' in result:
406 result = result.replace('__', '_')
407 result = result.strip('_')
408 # Common case of "Foreign band name - English song title"
409 if restricted and result.startswith('-_'):
411 if result.startswith('-'):
412 result = '_' + result[len('-'):]
413 result = result.lstrip('.')
419 def sanitize_path(s):
420 """Sanitizes and normalizes path on Windows"""
421 if sys.platform != 'win32':
423 drive_or_unc, _ = os.path.splitdrive(s)
424 if sys.version_info < (2, 7) and not drive_or_unc:
425 drive_or_unc, _ = os.path.splitunc(s)
426 norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
430 path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part)
431 for path_part in norm_path]
433 sanitized_path.insert(0, drive_or_unc + os.path.sep)
434 return os.path.join(*sanitized_path)
def sanitize_url(url):
    """Prefix scheme-relative URLs with an explicit 'http:' scheme.

    Prepending 'http:' to URLs that start with '//' mitigates failures
    caused by a missing protocol; every other URL is returned unchanged.
    """
    if url.startswith('//'):
        return 'http:%s' % url
    return url
def sanitized_Request(url, *args, **kwargs):
    # Build a urllib Request whose URL has first been passed through
    # sanitize_url(); remaining arguments are forwarded untouched.
    return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs)
447 def orderedSet(iterable):
448 """ Remove all duplicates from the input iterable """
456 def _htmlentity_transform(entity):
457 """Transforms an HTML entity to a character."""
458 # Known non-numeric HTML entity
459 if entity in compat_html_entities.name2codepoint:
460 return compat_chr(compat_html_entities.name2codepoint[entity])
462 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
464 numstr = mobj.group(1)
465 if numstr.startswith('x'):
467 numstr = '0%s' % numstr
470 # See https://github.com/rg3/youtube-dl/issues/7518
472 return compat_chr(int(numstr, base))
476 # Unknown entity in name, return its literal representation
477 return '&%s;' % entity
483 assert type(s) == compat_str
486 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
489 def get_subprocess_encoding():
490 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
491 # For subprocess calls, encode with locale encoding
492 # Refer to http://stackoverflow.com/a/9951851/35070
493 encoding = preferredencoding()
495 encoding = sys.getfilesystemencoding()
501 def encodeFilename(s, for_subprocess=False):
503 @param s The name of the file
506 assert type(s) == compat_str
508 # Python 3 has a Unicode API
509 if sys.version_info >= (3, 0):
512 # Pass '' directly to use Unicode APIs on Windows 2000 and up
513 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
514 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
515 if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
518 # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
519 if sys.platform.startswith('java'):
522 return s.encode(get_subprocess_encoding(), 'ignore')
525 def decodeFilename(b, for_subprocess=False):
527 if sys.version_info >= (3, 0):
530 if not isinstance(b, bytes):
533 return b.decode(get_subprocess_encoding(), 'ignore')
def encodeArgument(s):
    """Encode a command-line argument for handing to a subprocess.

    Byte strings from legacy call sites are first decoded as ASCII; the
    result is then passed through encodeFilename with for_subprocess=True.
    """
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, True)
def decodeArgument(b):
    # Decode a subprocess argument back to text by delegating to
    # decodeFilename with for_subprocess=True.
    return decodeFilename(b, True)
549 def decodeOption(optval):
552 if isinstance(optval, bytes):
553 optval = optval.decode(preferredencoding())
555 assert isinstance(optval, compat_str)
559 def formatSeconds(secs):
561 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
563 return '%d:%02d' % (secs // 60, secs % 60)
568 def make_HTTPS_handler(params, **kwargs):
569 opts_no_check_certificate = params.get('nocheckcertificate', False)
570 if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9
571 context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
572 if opts_no_check_certificate:
573 context.check_hostname = False
574 context.verify_mode = ssl.CERT_NONE
576 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
579 # (create_default_context present but HTTPSHandler has no context=)
582 if sys.version_info < (3, 2):
583 return YoutubeDLHTTPSHandler(params, **kwargs)
585 context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
586 context.verify_mode = (ssl.CERT_NONE
587 if opts_no_check_certificate
588 else ssl.CERT_REQUIRED)
589 context.set_default_verify_paths()
590 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
593 def bug_reports_message():
594 if ytdl_is_updateable():
595 update_cmd = 'type youtube-dl -U to update'
597 update_cmd = 'see https://yt-dl.org/update on how to update'
598 msg = '; please report this issue on https://yt-dl.org/bug .'
599 msg += ' Make sure you are using the latest version; %s.' % update_cmd
600 msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
604 class ExtractorError(Exception):
605 """Error during info extraction."""
607 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
608 """ tb, if given, is the original traceback (so that it can be printed out).
609 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
612 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
614 if video_id is not None:
615 msg = video_id + ': ' + msg
617 msg += ' (caused by %r)' % cause
619 msg += bug_reports_message()
620 super(ExtractorError, self).__init__(msg)
623 self.exc_info = sys.exc_info() # preserve original exception
625 self.video_id = video_id
627 def format_traceback(self):
628 if self.traceback is None:
630 return ''.join(traceback.format_tb(self.traceback))
class UnsupportedError(ExtractorError):
    """Raised when no extractor can handle the given URL."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        # expected=True marks this as a normal error rather than a
        # youtube-dl bug (see ExtractorError's docstring).
        super(UnsupportedError, self).__init__(message, expected=True)
640 class RegexNotFoundError(ExtractorError):
641 """Error when a regex didn't match"""
645 class DownloadError(Exception):
646 """Download Error exception.
648 This exception may be thrown by FileDownloader objects if they are not
649 configured to continue on errors. They will contain the appropriate
653 def __init__(self, msg, exc_info=None):
654 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
655 super(DownloadError, self).__init__(msg)
656 self.exc_info = exc_info
659 class SameFileError(Exception):
660 """Same File exception.
662 This exception will be thrown by FileDownloader objects if they detect
663 multiple files would have to be downloaded to the same file on disk.
668 class PostProcessingError(Exception):
669 """Post Processing exception.
671 This exception may be raised by PostProcessor's .run() method to
672 indicate an error in the postprocessing task.
675 def __init__(self, msg):
679 class MaxDownloadsReached(Exception):
680 """ --max-downloads limit has been reached. """
684 class UnavailableVideoError(Exception):
685 """Unavailable Format exception.
687 This exception will be thrown when a video is requested
688 in a format that is not available for that video.
693 class ContentTooShortError(Exception):
694 """Content Too Short exception.
696 This exception may be raised by FileDownloader objects when a file they
697 download is too small for what the server announced first, indicating
698 the connection was probably interrupted.
701 def __init__(self, downloaded, expected):
703 self.downloaded = downloaded
704 self.expected = expected
707 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
708 # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
709 # expected HTTP responses to meet HTTP/1.0 or later (see also
710 # https://github.com/rg3/youtube-dl/issues/6727)
711 if sys.version_info < (3, 0):
712 kwargs[b'strict'] = True
713 hc = http_class(*args, **kwargs)
714 source_address = ydl_handler._params.get('source_address')
715 if source_address is not None:
716 sa = (source_address, 0)
717 if hasattr(hc, 'source_address'): # Python 2.7+
718 hc.source_address = sa
720 def _hc_connect(self, *args, **kwargs):
721 sock = compat_socket_create_connection(
722 (self.host, self.port), self.timeout, sa)
724 self.sock = ssl.wrap_socket(
725 sock, self.key_file, self.cert_file,
726 ssl_version=ssl.PROTOCOL_TLSv1)
729 hc.connect = functools.partial(_hc_connect, hc)
def handle_youtubedl_headers(headers):
    """Apply and strip the internal 'Youtubedl-no-compression' pseudo-header.

    When the pseudo-header is present, any Accept-Encoding header is
    dropped (matched case-insensitively) and the pseudo-header itself is
    removed from the returned copy. The input mapping is never mutated;
    when the pseudo-header is absent the original object is returned as-is.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers
    filtered = dict(
        (name, value) for name, value in headers.items()
        if name.lower() != 'accept-encoding')
    del filtered['Youtubedl-no-compression']
    return filtered
744 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
745 """Handler for HTTP requests and responses.
747 This class, when installed with an OpenerDirector, automatically adds
748 the standard headers to every HTTP request and handles gzipped and
749 deflated responses from web servers. If compression is to be avoided in
750 a particular request, the original request in the program code only has
751 to include the HTTP header "Youtubedl-no-compression", which will be
752 removed before making the real request.
754 Part of this code was copied from:
756 http://techknack.net/python-urllib2-handlers/
758 Andrew Rowls, the author of that code, agreed to release it to the
762 def __init__(self, params, *args, **kwargs):
763 compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
764 self._params = params
766 def http_open(self, req):
767 conn_class = compat_http_client.HTTPConnection
769 socks_proxy = req.headers.get('Ytdl-socks-proxy')
771 conn_class = make_socks_conn_class(conn_class, socks_proxy)
772 del req.headers['Ytdl-socks-proxy']
774 return self.do_open(functools.partial(
775 _create_http_connection, self, conn_class, False),
781 return zlib.decompress(data, -zlib.MAX_WBITS)
783 return zlib.decompress(data)
786 def addinfourl_wrapper(stream, headers, url, code):
787 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
788 return compat_urllib_request.addinfourl(stream, headers, url, code)
789 ret = compat_urllib_request.addinfourl(stream, headers, url)
793 def http_request(self, req):
794 # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
795 # always respected by websites, some tend to give out URLs with non percent-encoded
796 # non-ASCII characters (see telemb.py, ard.py [#3412])
797 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
798 # To work around aforementioned issue we will replace request's original URL with
799 # percent-encoded one
800 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
801 # the code of this workaround has been moved here from YoutubeDL.urlopen()
802 url = req.get_full_url()
803 url_escaped = escape_url(url)
805 # Substitute URL if any change after escaping
806 if url != url_escaped:
807 req = update_Request(req, url=url_escaped)
809 for h, v in std_headers.items():
810 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
811 # The dict keys are capitalized because of this bug by urllib
812 if h.capitalize() not in req.headers:
815 req.headers = handle_youtubedl_headers(req.headers)
817 if sys.version_info < (2, 7) and '#' in req.get_full_url():
818 # Python 2.6 is brain-dead when it comes to fragments
819 req._Request__original = req._Request__original.partition('#')[0]
820 req._Request__r_type = req._Request__r_type.partition('#')[0]
824 def http_response(self, req, resp):
827 if resp.headers.get('Content-encoding', '') == 'gzip':
828 content = resp.read()
829 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
831 uncompressed = io.BytesIO(gz.read())
832 except IOError as original_ioerror:
833 # There may be junk add the end of the file
834 # See http://stackoverflow.com/q/4928560/35070 for details
835 for i in range(1, 1024):
837 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
838 uncompressed = io.BytesIO(gz.read())
843 raise original_ioerror
844 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
845 resp.msg = old_resp.msg
846 del resp.headers['Content-encoding']
848 if resp.headers.get('Content-encoding', '') == 'deflate':
849 gz = io.BytesIO(self.deflate(resp.read()))
850 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
851 resp.msg = old_resp.msg
852 del resp.headers['Content-encoding']
853 # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
854 # https://github.com/rg3/youtube-dl/issues/6457).
855 if 300 <= resp.code < 400:
856 location = resp.headers.get('Location')
858 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
859 if sys.version_info >= (3, 0):
860 location = location.encode('iso-8859-1').decode('utf-8')
861 location_escaped = escape_url(location)
862 if location != location_escaped:
863 del resp.headers['Location']
864 resp.headers['Location'] = location_escaped
867 https_request = http_request
868 https_response = http_response
871 def make_socks_conn_class(base_class, socks_proxy):
872 assert issubclass(base_class, (
873 compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))
875 url_components = compat_urlparse.urlparse(socks_proxy)
876 if url_components.scheme.lower() == 'socks5':
877 socks_type = ProxyType.SOCKS5
878 elif url_components.scheme.lower() in ('socks', 'socks4'):
879 socks_type = ProxyType.SOCKS4
880 elif url_components.scheme.lower() == 'socks4a':
881 socks_type = ProxyType.SOCKS4A
885 url_components.hostname, url_components.port or 1080,
887 url_components.username, url_components.password
890 class SocksConnection(base_class):
892 self.sock = sockssocket()
893 self.sock.setproxy(*proxy_args)
894 if type(self.timeout) in (int, float):
895 self.sock.settimeout(self.timeout)
896 self.sock.connect((self.host, self.port))
898 if isinstance(self, compat_http_client.HTTPSConnection):
899 if hasattr(self, '_context'): # Python > 2.6
900 self.sock = self._context.wrap_socket(
901 self.sock, server_hostname=self.host)
903 self.sock = ssl.wrap_socket(self.sock)
905 return SocksConnection
908 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
909 def __init__(self, params, https_conn_class=None, *args, **kwargs):
910 compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
911 self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
912 self._params = params
914 def https_open(self, req):
916 conn_class = self._https_conn_class
918 if hasattr(self, '_context'): # python > 2.6
919 kwargs['context'] = self._context
920 if hasattr(self, '_check_hostname'): # python 3.x
921 kwargs['check_hostname'] = self._check_hostname
923 socks_proxy = req.headers.get('Ytdl-socks-proxy')
925 conn_class = make_socks_conn_class(conn_class, socks_proxy)
926 del req.headers['Ytdl-socks-proxy']
928 return self.do_open(functools.partial(
929 _create_http_connection, self, conn_class, True),
933 class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
934 def __init__(self, cookiejar=None):
935 compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)
937 def http_response(self, request, response):
938 # Python 2 will choke on next HTTP request in row if there are non-ASCII
939 # characters in Set-Cookie HTTP header of last response (see
940 # https://github.com/rg3/youtube-dl/issues/6769).
941 # In order to at least prevent crashing we will percent encode Set-Cookie
942 # header before HTTPCookieProcessor starts processing it.
943 # if sys.version_info < (3, 0) and response.headers:
944 # for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
945 # set_cookie = response.headers.get(set_cookie_header)
947 # set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
948 # if set_cookie != set_cookie_escaped:
949 # del response.headers[set_cookie_header]
950 # response.headers[set_cookie_header] = set_cookie_escaped
951 return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)
953 https_request = compat_urllib_request.HTTPCookieProcessor.http_request
954 https_response = http_response
957 def parse_iso8601(date_str, delimiter='T', timezone=None):
958 """ Return a UNIX timestamp from the given date """
963 date_str = re.sub(r'\.[0-9]+', '', date_str)
967 r'(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
970 timezone = datetime.timedelta()
972 date_str = date_str[:-len(m.group(0))]
973 if not m.group('sign'):
974 timezone = datetime.timedelta()
976 sign = 1 if m.group('sign') == '+' else -1
977 timezone = datetime.timedelta(
978 hours=sign * int(m.group('hours')),
979 minutes=sign * int(m.group('minutes')))
981 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
982 dt = datetime.datetime.strptime(date_str, date_format) - timezone
983 return calendar.timegm(dt.timetuple())
988 def unified_strdate(date_str, day_first=True):
989 """Return a string with the date in the format YYYYMMDD"""
995 date_str = date_str.replace(',', ' ')
996 # %z (UTC offset) is only supported in python>=3.2
997 if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
998 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
999 # Remove AM/PM + timezone
1000 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1002 format_expressions = [
1013 '%Y/%m/%d %H:%M:%S',
1014 '%Y-%m-%d %H:%M:%S',
1015 '%Y-%m-%d %H:%M:%S.%f',
1018 '%Y-%m-%dT%H:%M:%SZ',
1019 '%Y-%m-%dT%H:%M:%S.%fZ',
1020 '%Y-%m-%dT%H:%M:%S.%f0Z',
1021 '%Y-%m-%dT%H:%M:%S',
1022 '%Y-%m-%dT%H:%M:%S.%f',
1026 format_expressions.extend([
1031 '%d/%m/%Y %H:%M:%S',
1034 format_expressions.extend([
1039 '%m/%d/%Y %H:%M:%S',
1041 for expression in format_expressions:
1043 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
1046 if upload_date is None:
1047 timetuple = email.utils.parsedate_tz(date_str)
1049 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
1050 if upload_date is not None:
1051 return compat_str(upload_date)
1054 def determine_ext(url, default_ext='unknown_video'):
1057 guess = url.partition('?')[0].rpartition('.')[2]
1058 if re.match(r'^[A-Za-z0-9]+$', guess):
1060 # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
1061 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
1062 return guess.rstrip('/')
def subtitles_filename(filename, sub_lang, sub_format):
    """Derive a subtitle file name: '<base>.<sub_lang>.<sub_format>'.

    Only the last extension of *filename* is stripped; a name without
    any dot is used as the base unchanged.
    """
    base = filename.rsplit('.', 1)[0]
    return '.'.join((base, sub_lang, sub_format))
1071 def date_from_str(date_str):
1073 Return a datetime object from a string in the format YYYYMMDD or
1074 (now|today)[+-][0-9](day|week|month|year)(s)?"""
1075 today = datetime.date.today()
1076 if date_str in ('now', 'today'):
1078 if date_str == 'yesterday':
1079 return today - datetime.timedelta(days=1)
1080 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
1081 if match is not None:
1082 sign = match.group('sign')
1083 time = int(match.group('time'))
1086 unit = match.group('unit')
1087 # A bad approximation?
1091 elif unit == 'year':
1095 delta = datetime.timedelta(**{unit: time})
1096 return today + delta
1097 return datetime.datetime.strptime(date_str, '%Y%m%d').date()
1100 def hyphenate_date(date_str):
1102 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1103 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1104 if match is not None:
1105 return '-'.join(match.groups())
1110 class DateRange(object):
1111 """Represents a time interval between two dates"""
1113 def __init__(self, start=None, end=None):
1114 """start and end must be strings in the format accepted by date"""
1115 if start is not None:
1116 self.start = date_from_str(start)
1118 self.start = datetime.datetime.min.date()
1120 self.end = date_from_str(end)
1122 self.end = datetime.datetime.max.date()
1123 if self.start > self.end:
1124 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
1128 """Returns a range that only contains the given day"""
1129 return cls(day, day)
1131 def __contains__(self, date):
1132 """Check if the date is in the range"""
1133 if not isinstance(date, datetime.date):
1134 date = date_from_str(date)
1135 return self.start <= date <= self.end
1138 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
1141 def platform_name():
1142 """ Returns the platform name as a compat_str """
1143 res = platform.platform()
1144 if isinstance(res, bytes):
1145 res = res.decode(preferredencoding())
1147 assert isinstance(res, compat_str)
1151 def _windows_write_string(s, out):
1152 """ Returns True if the string was written using special methods,
1153 False if it has yet to be written out."""
1154 # Adapted from http://stackoverflow.com/a/3259271/35070
1157 import ctypes.wintypes
1165 fileno = out.fileno()
1166 except AttributeError:
1167 # If the output stream doesn't have a fileno, it's virtual
1169 except io.UnsupportedOperation:
1170 # Some strange Windows pseudo files?
1172 if fileno not in WIN_OUTPUT_IDS:
1175 GetStdHandle = ctypes.WINFUNCTYPE(
1176 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
1177 (b'GetStdHandle', ctypes.windll.kernel32))
1178 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
1180 WriteConsoleW = ctypes.WINFUNCTYPE(
1181 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
1182 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
1183 ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
1184 written = ctypes.wintypes.DWORD(0)
1186 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
1187 FILE_TYPE_CHAR = 0x0002
1188 FILE_TYPE_REMOTE = 0x8000
1189 GetConsoleMode = ctypes.WINFUNCTYPE(
1190 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
1191 ctypes.POINTER(ctypes.wintypes.DWORD))(
1192 (b'GetConsoleMode', ctypes.windll.kernel32))
1193 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
1195 def not_a_console(handle):
1196 if handle == INVALID_HANDLE_VALUE or handle is None:
1198 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
1199 GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
1201 if not_a_console(h):
1204 def next_nonbmp_pos(s):
1206 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
1207 except StopIteration:
1211 count = min(next_nonbmp_pos(s), 1024)
1213 ret = WriteConsoleW(
1214 h, s, count if count else 2, ctypes.byref(written), None)
1216 raise OSError('Failed to write string')
1217 if not count: # We just wrote a non-BMP character
1218 assert written.value == 2
1221 assert written.value > 0
1222 s = s[written.value:]
def write_string(s, out=None, encoding=None):
    # Write the unicode string `s` to the file-like object `out`, handling
    # Windows console quirks and Python 2/3 text-vs-bytes differences.
    assert type(s) == compat_str
    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        # Try the native Windows console writer first (handles non-ASCII).
        if _windows_write_string(s, out):
    if ('b' in getattr(out, 'mode', '') or
            sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        # Binary stream (or Python 2): encode explicitly before writing.
        byt = s.encode(encoding or preferredencoding(), 'ignore')
    elif hasattr(out, 'buffer'):
        # Python 3 text stream: write through the underlying binary buffer.
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        byt = s.encode(enc, 'ignore')
        out.buffer.write(byt)
def bytes_to_intlist(bs):
    """Convert a byte string into a list of integer byte values.

    Works on both Python 3 ``bytes`` (indexing yields ints) and Python 2
    byte strings (indexing yields 1-char strings).
    """
    if not bs:
        # Guard the bs[0] probe below against empty input.
        return []
    if isinstance(bs[0], int):  # Python 3
        return list(bs)
    else:
        return [ord(c) for c in bs]
def intlist_to_bytes(xs):
    """Pack a list of integer byte values (0-255) into a byte string.

    Inverse of bytes_to_intlist.
    """
    if not xs:
        # Avoid calling struct_pack with a zero-item format.
        return b''
    return struct_pack('%dB' % len(xs), *xs)
# Cross-platform file locking
if sys.platform == 'win32':
    import ctypes.wintypes

    # OVERLAPPED structure required by LockFileEx/UnlockFileEx (WinBase.h).
    class OVERLAPPED(ctypes.Structure):
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Lock the entire file: low/high 32-bit halves of the byte range length.
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        # 0x2 == LOCKFILE_EXCLUSIVE_LOCK; 0x0 requests a shared lock.
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Keep the pointer alive on the file object for the later unlock.
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        # Must only be called after a successful _lock_file on the same file.
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

# Some platforms, such as Jython, is missing fcntl
    # POSIX implementation using flock(2).
    def _lock_file(f, exclusive):
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        fcntl.flock(f, fcntl.LOCK_UN)

    UNSUPPORTED_MSG = 'file locking is not supported on this platform'

    # Fallback stubs when no locking primitive is available.
    def _lock_file(f, exclusive):
        raise IOError(UNSUPPORTED_MSG)

    def _unlock_file(f):
        raise IOError(UNSUPPORTED_MSG)
class locked_file(object):
    """File wrapper that holds an advisory lock while used as a context
    manager (exclusive for write/append modes, shared for read)."""

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)

    def __enter__(self):
        # NOTE(review): self.mode is read here; presumably set in __init__ —
        # confirm against the full class definition.
        exclusive = self.mode != 'r'
        _lock_file(self.f, exclusive)

    def __exit__(self, etype, value, traceback):
        _unlock_file(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
def get_filesystem_encoding():
    """Return the filesystem encoding, falling back to utf-8 when unknown."""
    enc = sys.getfilesystemencoding()
    if enc is None:
        return 'utf-8'
    return enc
def shell_quote(args):
    """Quote a list of arguments for safe display as a shell command line."""
    quoted_args = []
    encoding = get_filesystem_encoding()
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(pipes.quote(a))
    return ' '.join(quoted_args)
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """

    # The payload travels in the fragment, so servers never see it.
    sdata = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return '%s#%s' % (url, sdata)
def unsmuggle_url(smug_url, default=None):
    """Extract data previously embedded by smuggle_url.

    Returns (url, data); data is `default` when nothing was smuggled.
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    data = json.loads(jsond)
    return url, data
def format_bytes(bytes):
    """Format a byte count as a human-readable string (e.g. '1.00KiB').

    Accepts None (returns 'N/A'), numeric strings, and numbers.
    """
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    if bytes == 0.0:
        # math.log(0) is undefined; zero bytes is simply 'B'.
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
    suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
def lookup_unit_table(unit_table, s):
    """Parse a number-with-unit string using `unit_table` (unit -> multiplier).

    Returns the value in base units as an int, or None when `s` does not match.
    """
    units_re = '|'.join(re.escape(u) for u in unit_table)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
    if not m:
        return None
    # Accept ',' as a decimal separator as well.
    num_str = m.group('num').replace(',', '.')
    mult = unit_table[m.group('unit')]
    return int(float(num_str) * mult)
def parse_filesize(s):
    """Parse a human-readable file size (e.g. '5.5 MiB') into bytes."""
    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    return lookup_unit_table(_UNIT_TABLE, s)
1479 if re.match(r'^[\d,.]+$', s):
1480 return str_to_int(s)
1491 return lookup_unit_table(_UNIT_TABLE, s)
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """
    try:
        return ENGLISH_MONTH_NAMES.index(name) + 1
    except ValueError:
        # Unknown month name.
        return None
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations (e.g. 'Jan' -> 1) """
    try:
        return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
    except ValueError:
        return None
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML, leaving existing entities alone."""
    # Negative lookahead skips already-escaped entities and numeric refs.
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
        '&amp;',
        xml_str)
def setproctitle(title):
    # Set the process name shown by tools like ps (Linux, via prctl(2)).
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
        # 15 == PR_SET_NAME (see prctl(2)).
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
def remove_start(s, start):
    """Return `s` with the prefix `start` removed; `s` unchanged otherwise.

    Adds the missing fall-through return (the visible code returned None
    when the prefix was absent).
    """
    if s.startswith(start):
        return s[len(start):]
    return s
def remove_end(s, end):
    """Return `s` with the suffix `end` removed; `s` unchanged otherwise.

    The `end and` guard avoids s[:-0] (== '') truncating everything when
    the suffix is empty.
    """
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
def remove_quotes(s):
    """Strip one matching pair of surrounding single or double quotes."""
    if s is None or len(s) < 2:
        # Nothing to strip (None or too short to hold a quote pair).
        return s
    for quote in ('"', "'", ):
        if s[0] == quote and s[-1] == quote:
            return s[1:-1]
    return s
def url_basename(url):
    """Return the last path component of `url` (query/fragment excluded)."""
    parsed_path = compat_urlparse.urlparse(url).path
    segments = parsed_path.strip('/').split('/')
    return segments[-1]
class HEADRequest(compat_urllib_request.Request):
    """Request subclass that issues an HTTP HEAD instead of GET/POST."""
    def get_method(self):
        # urllib derives the HTTP verb from this method.
        return 'HEAD'
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert `v` to int, returning `default` on failure.

    get_attr: optionally read that attribute from `v` first.
    The result is multiplied by `invscale`, then floor-divided by `scale`.
    """
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError):
        # Covers None, '' and other non-numeric values.
        return default
def str_or_none(v, default=None):
    """Stringify `v`, or return `default` when it is None."""
    if v is None:
        return default
    return compat_str(v)
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if int_str is None:
        return None
    # Drop thousands separators and stray '+' signs, e.g. '1,234' -> 1234.
    int_str = re.sub(r'[,\.\+]', '', int_str)
    return int(int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert `v` to float (scaled by invscale/scale); `default` on failure."""
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        return default
def parse_duration(s):
    # Parse a duration string ('1:23:45', '3 days 2 h', '5.5 s', ...) into
    # a number of seconds.
    if not isinstance(s, compat_basestring):

    days, hours, mins, secs, ms = [None] * 5
    # Colon-separated form: [[[DD:]HH:]MM:]SS[.ms]
    m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?$', s)
        days, hours, mins, secs, ms = m.groups()
            # Verbose form: '1 day 2 hours 3 mins 4.5 secs' (each part optional).
            (?P<days>[0-9]+)\s*d(?:ays?)?\s*
            (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
            (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
            (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
        days, hours, mins, secs, ms = m.groups()
        # Fractional single-unit form: '2.5 hours' / '90 min'.
        m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)$', s)
            hours, mins = m.groups()

        # Accumulate whichever components matched.
        duration += float(secs)
        duration += float(mins) * 60
        duration += float(hours) * 60 * 60
        duration += float(days) * 24 * 60 * 60
        duration += float(ms)
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert `ext` before the filename's real extension.

    When `expected_real_ext` is given and doesn't match, `ext` is appended
    after the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    return (
        '{0}.{1}{2}'.format(name, ext, real_ext)
        if not expected_real_ext or real_ext[1:] == expected_real_ext
        else '{0}.{1}'.format(filename, ext))
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the filename's extension with `ext`.

    When `expected_real_ext` is given and doesn't match, `ext` is appended
    after the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    return '{0}.{1}'.format(
        name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
        ext)
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    try:
        subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        # Binary not found / not executable.
        return False
    return exe
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    # stderr is merged into stdout: some tools print their version there.
    out, _ = subprocess.Popen(
        [encodeArgument(exe)] + args,
        stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from a tool's `output`.

    Returns the first group of `version_re`, or `unrecognized` when no
    match is found.
    """
    assert isinstance(output, compat_str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
    if m:
        return m.group(1)
    return unrecognized
class PagedList(object):
    # Abstract base for paginated result lists; subclasses provide
    # getslice(start, end).
        # This is only useful for tests
        return len(self.getslice())
class OnDemandPagedList(PagedList):
    # Fetches pages lazily through `pagefunc` and optionally caches them.
    def __init__(self, pagefunc, pagesize, use_cache=False):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache

    def getslice(self, start=0, end=None):
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
            # Reuse a cached page when available; otherwise fetch and cache.
            page_results = self._cache.get(pagenum)
            if page_results is None:
                page_results = list(self._pagefunc(pagenum))
                self._cache[pagenum] = page_results
                # Offset of `start` within this page (only on the first page).
                start % self._pagesize
                if firstid <= start < nextfirstid
                # Offset of `end` within this page (only on the last page).
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
class InAdvancePagedList(PagedList):
    # Variant of PagedList where the total page count is known up front.
    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        start_page = start // self._pagesize
            self._pagecount if end is None else (end // self._pagesize + 1))
        # Elements of the first page that precede `start`.
        skip_elems = start - start_page * self._pagesize
        # Remaining number of elements still wanted (None == unbounded).
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
                page = page[skip_elems:]
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                    page = page[:only_more]
def uppercase_escape(s):
    """Decode literal '\\UXXXXXXXX' escape sequences found in `s`."""
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: unicode_escape(m.group(0))[0],
        s)
def lowercase_escape(s):
    """Decode literal '\\uXXXX' escape sequences found in `s`."""
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\u[0-9a-fA-F]{4}',
        lambda m: unicode_escape(m.group(0))[0],
        s)
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # Python 2: quote() needs a byte string, so encode unicode input first.
    if sys.version_info[0] < 3 and isinstance(s, compat_str):
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    url_parsed = compat_urllib_parse_urlparse(url)
    # IDNA-encode the host, percent-escape every other component, and
    # reassemble the URL (the visible code was missing the .geturl() call).
    return url_parsed._replace(
        netloc=url_parsed.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(url_parsed.path),
        params=escape_rfc3986(url_parsed.params),
        query=escape_rfc3986(url_parsed.query),
        fragment=escape_rfc3986(url_parsed.fragment)
    ).geturl()
def read_batch_urls(batch_fd):
    # Read one URL per line from `batch_fd`, stripping BOMs and skipping
    # comment lines; closes the descriptor when done.
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = '\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        # Lines starting with '#', ';' or ']' are treated as comments.
        if url.startswith(('#', ';', ']')):

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes."""
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
def update_url_query(url, query):
    """Return `url` with the key/value pairs from `query` merged into its
    query string (existing keys are overwritten)."""
    parsed_url = compat_urlparse.urlparse(url)
    qs = compat_parse_qs(parsed_url.query)
    # Merge/overwrite with the caller-supplied parameters.
    qs.update(query)
    return compat_urlparse.urlunparse(parsed_url._replace(
        query=compat_urllib_parse_urlencode(qs, True)))
def update_Request(req, url=None, data=None, headers={}, query={}):
    """Clone `req` with optionally replaced url/data and merged headers/query.

    Preserves the request class (HEADRequest vs plain Request) and timeout.
    """
    req_headers = req.headers.copy()
    req_headers.update(headers)
    req_data = data or req.data
    req_url = update_url_query(url or req.get_full_url(), query)
    req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
    new_req = req_type(
        req_url, data=req_data, headers=req_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Look up one key, or the first 'usable' key of several, in dict `d`.

    With skip_false_values (default), falsy values ('', 0, [], ...) are
    skipped as well as missing/None entries.
    """
    if isinstance(key_or_keys, (list, tuple)):
        for key in key_or_keys:
            if key not in d or d[key] is None or skip_false_values and not d[key]:
                continue
            return d[key]
        return default
    return d.get(key_or_keys, default)
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return `string` as a text string, decoding byte input if necessary."""
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
def parse_age_limit(s):
    """Parse an age limit like '18' or '18+' into an int; fall back to the
    US_RATINGS table for rating names; None for no/unknown input."""
    if s is None:
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    return int(m.group('age')) if m else US_RATINGS.get(s)
def strip_jsonp(code):
    """Strip a JSONP wrapper (callback(...)) and return the inner JSON."""
    return re.sub(
        r'(?s)^[a-zA-Z0-9_.]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
def js_to_json(code):
    # Convert JavaScript-style object literals into valid JSON text.
        if v in ('true', 'false', 'null'):
        if v.startswith('"'):
            # Inside double quotes, unescape \' (not valid JSON).
            v = re.sub(r"\\'", "'", v[1:-1])
        elif v.startswith("'"):
            # Single-quoted string: translate escapes to JSON equivalents.
            v = re.sub(r"\\\\|\\'|\"", lambda m: {
    # Match strings (with escapes) or bare identifiers and fix each one.
    res = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
        [a-zA-Z_][.a-zA-Z_0-9]*
    # Drop trailing commas before closing ] or }.
    res = re.sub(r',(\s*[\]}])', lambda m: m.group(1), res)
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        try:
            return quality_ids.index(qid)
        except ValueError:
            # Unknown quality: rank it below everything else.
            return -1
    return q
# Default output filename template: "<title>-<id>.<ext>".
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) > length:
        # Truncate so the total (text + ellipses) fits in `length`.
        return s[:length - len(ELLIPSES)] + ELLIPSES
    return s
def version_tuple(v):
    """Split a dotted/dashed version string into a tuple of ints."""
    parts = re.split(r'[-.]', v)
    return tuple(map(int, parts))
def is_outdated_version(version, limit, assume_new=True):
    """Compare dotted version strings; True when `version` < `limit`.

    On a missing `version` or unparsable input, fall back to
    `not assume_new`.
    """
    if not version:
        return not assume_new
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return not assume_new
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # Updatable when running from a zip bundle or a frozen executable.
    if isinstance(globals().get('__loader__'), zipimporter):
        return True
    return hasattr(sys, 'frozen')
def args_to_str(args):
    # Get a short string representation for a subprocess command
    return ' '.join(map(shlex_quote, args))
def error_to_compat_str(err):
    """Return the message of exception `err` as a text string."""
    err_str = str(err)
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    if sys.version_info[0] < 3:
        err_str = err_str.decode(preferredencoding())
    return err_str
1990 def mimetype2ext(mt):
2000 _, _, res = mt.rpartition('/')
2004 'smptett+xml': 'tt',
2010 'x-mp4-fragmented': 'mp4',
def urlhandle_detect_ext(url_handle):
    # Guess the file extension from a response's headers: try the
    # Content-Disposition filename first, then fall back to Content-Type.
        getheader = lambda h: url_handle.headers[h]
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader

    cd = getheader('Content-Disposition')
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
            e = determine_ext(m.group('filename'), default_ext=None)

    return mimetype2ext(getheader('Content-Type'))
def encode_data_uri(data, mime_type):
    """Encode raw bytes as an RFC 2397 data: URI with a base64 payload."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    if age_limit is None:  # No limit set
        return False
    if content_limit is None:
        return False  # Content available for everyone
    return age_limit < content_limit
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    # Known byte-order marks and the encodings they imply.
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            s = first_bytes[len(bom):].decode(enc, 'replace')
            break
    else:
        # No BOM: assume UTF-8 with replacement for invalid bytes.
        s = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', s)
def determine_protocol(info_dict):
    # Infer the download protocol from an explicit 'protocol' field, the
    # URL prefix, or (failing that) the URL scheme.
    protocol = info_dict.get('protocol')
    if protocol is not None:

    url = info_dict['url']
    if url.startswith('rtmp'):
    elif url.startswith('mms'):
    elif url.startswith('rtsp'):

    ext = determine_ext(url)

    return compat_urllib_parse_urlparse(url).scheme
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    rows = [header_row] + data
    # Widest cell per column determines the column width.
    widths = [max(len(compat_str(cell)) for cell in column) for column in zip(*rows)]
    fmt = ' '.join('%-' + compat_str(w + 1) + 's' for w in widths[:-1]) + '%s'
    return '\n'.join(fmt % tuple(row) for row in rows)
def _match_one(filter_part, dct):
    # Evaluate one comparison ('key >= 100') or unary ('!key') filter
    # expression against the dict `dct`.
    COMPARISON_OPERATORS = {
    operator_rex = re.compile(r'''(?x)\s*
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
        (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
        op = COMPARISON_OPERATORS[m.group('op')]
        if m.group('strval') is not None:
            # String comparisons only make sense for (in)equality.
            if m.group('op') not in ('=', '!='):
                    'Operator %s does not support string values!' % m.group('op'))
            comparison_value = m.group('strval')
                comparison_value = int(m.group('intval'))
                # Not a plain int: try parsing as a file size ('500KiB'),
                # then with an implicit 'B' suffix.
                comparison_value = parse_filesize(m.group('intval'))
                if comparison_value is None:
                    comparison_value = parse_filesize(m.group('intval') + 'B')
                    if comparison_value is None:
                        'Invalid integer value %r in filter part %r' % (
                            m.group('intval'), filter_part))
        actual_value = dct.get(m.group('key'))
        if actual_value is None:
            # '?' after the operator makes a missing key pass the filter.
            return m.group('none_inclusive')
        return op(actual_value, comparison_value)

        # Unary operators: '' tests presence, '!' tests absence.
        '': lambda v: v is not None,
        '!': lambda v: v is None,
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
        op = UNARY_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
    # All '&'-separated sub-expressions must match.
    return all(
        _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
def match_filter_func(filter_str):
    """Build a --match-filter callback: returns None when the video passes,
    or a human-readable skip message otherwise."""
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        else:
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP/TTML time expression ('12.3s' or 'HH:MM:SS[.f]') into
    seconds; None for empty/unrecognized input."""
    if not time_expr:
        return

    mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if mobj:
        return float(mobj.group('time_offset'))

    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if mobj:
        # Seconds may use ':' as the fraction separator in some files.
        return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
def srt_subtitles_timecode(seconds):
    """Format a (non-negative) seconds value as an SRT timecode HH:MM:SS,mmm."""
    hours, rem = divmod(seconds, 3600)
    mins = rem / 60
    secs = seconds % 60
    millis = (seconds % 1) * 1000
    return '%02d:%02d:%02d,%03d' % (hours, mins, secs, millis)
def dfxp2srt(dfxp_data):
    # Convert DFXP/TTML subtitle markup into SRT text.
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
        'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',

    class TTMLPElementParser(object):
        # Collects the text content of a <p> element, turning <br> into
        # newlines.
        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):

        def data(self, data):

            return self.out.strip()

    def parse_node(node):
        # Serialize the node and re-feed it through the text extractor.
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
    # Try each known TTML namespace before giving up.
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall(_x('.//ttaf1_0604:p')) or dfxp.findall('.//p')
        raise ValueError('Invalid dfxp/TTML subtitle')

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            # Derive the end from begin + dur when 'end' is absent.
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
def cli_option(params, command_option, param):
    """Return [command_option, value] when `param` is set in params, else []."""
    value = params.get(param)
    return [] if value is None else [command_option, value]
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Render a boolean param as CLI args: either two tokens
    ['--opt', 'true'] or, with a separator, one token ['--opt=true']."""
    param = params.get(param)
    assert isinstance(param, bool)
    if separator:
        return [command_option + separator + (true_value if param else false_value)]
    return [command_option, true_value if param else false_value]
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Emit the bare flag when params[param] equals expected_value, else []."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
def cli_configuration_args(params, param, default=[]):
    """Return the extra-args list stored under `param`, or `default`.

    Note: `default` is a mutable default argument, but it is only returned,
    never mutated, so sharing is harmless.
    """
    ex_args = params.get(param)
    if ex_args is None:
        return default
    assert isinstance(ex_args, list)
    return ex_args
class ISO639Utils(object):
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    # Converts between ISO 639-1 (2-letter) and ISO 639-2/T (3-letter)
    # language codes via the class-level _lang_map table.
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        return cls._lang_map.get(code[:2])

    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        # Reverse lookup over the map values.
        for short_name, long_name in cls._lang_map.items():
            if long_name == code:
class ISO3166Utils(object):
    # From http://data.okfn.org/data/core/country-list
    # Maps ISO 3166-1 alpha-2 country codes to full English names.
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AS': 'American Samoa',
        'AG': 'Antigua and Barbuda',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BV': 'Bouvet Island',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BF': 'Burkina Faso',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CI': 'Côte d\'Ivoire',
        'CZ': 'Czech Republic',
        'DO': 'Dominican Republic',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GW': 'Guinea-Bissau',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'IR': 'Iran, Islamic Republic of',
        'IM': 'Isle of Man',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'LA': 'Lao People\'s Democratic Republic',
        'LI': 'Liechtenstein',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MH': 'Marshall Islands',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'PS': 'Palestine, State of',
        'PG': 'Papua New Guinea',
        'PH': 'Philippines',
        'PR': 'Puerto Rico',
        'RU': 'Russian Federation',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SL': 'Sierra Leone',
        'SX': 'Sint Maarten (Dutch part)',
        'SB': 'Solomon Islands',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'SJ': 'Svalbard and Jan Mayen',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TZ': 'Tanzania, United Republic of',
        'TL': 'Timor-Leste',
        'TT': 'Trinidad and Tobago',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',

    def short2full(cls, code):
        """Convert an ISO 3166-2 country code to the corresponding full name"""
        # Case-insensitive lookup: codes are stored upper-case.
        return cls._country_map.get(code.upper())
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    # ProxyHandler variant honouring a per-request 'Ytdl-request-proxy'
    # header and routing SOCKS proxies via the 'Ytdl-socks-proxy' header.
    def __init__(self, proxies=None):
        # Set default handlers
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            # Consume the internal header so it never reaches the wire.
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # youtube-dl's http/https handlers do wrapping the socket with socks
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''

    # Interpret the (reversed, little-endian) bytes as one big integer.
    payload = int(binascii.hexlify(data[::-1]), 16)
    encrypted = pow(payload, exponent, modulus)
    return '%x' % encrypted
def encode_base_n(num, n, table=None):
    """Encode the non-negative integer `num` in base `n`.

    `table` supplies the digit characters; defaults to 0-9a-zA-Z truncated
    to the base.
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    ret = ''
    while num:
        ret = table[num % n] + ret
        num = num // n
    return ret
2793 def decode_packed_codes(code):
2795 r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)",
2797 obfucasted_code, base, count, symbols = mobj.groups()
2800 symbols = symbols.split('|')
2805 base_n_count = encode_base_n(count, base)
2806 symbol_table[base_n_count] = symbols[count] or base_n_count
2809 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],