2 # -*- coding: utf-8 -*-
4 from __future__ import unicode_literals
33 import xml.etree.ElementTree
40 compat_etree_fromstring,
46 compat_socket_create_connection,
51 compat_urllib_parse_urlencode,
52 compat_urllib_parse_urlparse,
53 compat_urllib_request,
def register_socks_protocols():
    """Teach urlparse that SOCKS proxy schemes carry a network location.

    In Python < 2.6.5, urlsplit() suffers from the bug
    https://bugs.python.org/issue7904: URLs whose scheme is not listed in
    urlparse.uses_netloc are not handled correctly, so each SOCKS scheme
    is appended to that list (once) before use.
    """
    for proto in ('socks', 'socks4', 'socks4a', 'socks5'):
        if proto not in compat_urlparse.uses_netloc:
            compat_urlparse.uses_netloc.append(proto)
73 # This is not clearly defined otherwise
74 compiled_regex_type = type(re.compile(''))
77 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/44.0 (Chrome)',
78 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
79 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
80 'Accept-Encoding': 'gzip, deflate',
81 'Accept-Language': 'en-us,en;q=0.5',
87 ENGLISH_MONTH_NAMES = [
88 'January', 'February', 'March', 'April', 'May', 'June',
89 'July', 'August', 'September', 'October', 'November', 'December']
92 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
93 'flv', 'f4v', 'f4a', 'f4b',
94 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
104 'f4f', 'f4m', 'm3u8', 'smil')
106 # needed for sanitizing filenames in restricted mode
107 ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ',
108 itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOUUUUYP', ['ss'],
109 'aaaaaa', ['ae'], 'ceeeeiiiionoooooouuuuypy')))
112 def preferredencoding():
113 """Get preferred encoding.
115 Returns the best encoding scheme for the system, based on
116 locale.getpreferredencoding() and some further tweaks.
119 pref = locale.getpreferredencoding()
127 def write_json_file(obj, fn):
128 """ Encode obj as JSON and write it to fn, atomically if possible """
130 fn = encodeFilename(fn)
131 if sys.version_info < (3, 0) and sys.platform != 'win32':
132 encoding = get_filesystem_encoding()
133 # os.path.basename returns a bytes object, but NamedTemporaryFile
134 # will fail if the filename contains non ascii characters unless we
135 # use a unicode object
136 path_basename = lambda f: os.path.basename(fn).decode(encoding)
137 # the same for os.path.dirname
138 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
140 path_basename = os.path.basename
141 path_dirname = os.path.dirname
145 'prefix': path_basename(fn) + '.',
146 'dir': path_dirname(fn),
150 # In Python 2.x, json.dump expects a bytestream.
151 # In Python 3.x, it writes to a character stream
152 if sys.version_info < (3, 0):
160 tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
165 if sys.platform == 'win32':
166 # Need to remove existing file on Windows, else os.rename raises
167 # WindowsError or FileExistsError.
172 os.rename(tf.name, fn)
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """Find the first element matching ``xpath`` that carries attribute
        ``key`` (restricted to the value ``val`` when one is given), i.e.
        xpath[@key] or xpath[@key='val']."""
        # The attribute name is interpolated into the XPath predicate, so
        # restrict it to characters that cannot alter the expression.
        assert re.match(r'^[a-zA-Z_-]+$', key)
        if val is None:
            predicate = '[@%s]' % key
        else:
            predicate = "[@%s='%s']" % (key, val)
        return node.find(xpath + predicate)
188 def find_xpath_attr(node, xpath, key, val=None):
189 for f in node.findall(compat_xpath(xpath)):
190 if key not in f.attrib:
192 if val is None or f.attrib.get(key) == val:
196 # On python2.6 the xml.etree.ElementTree.Element methods don't support
197 # the namespace parameter
200 def xpath_with_ns(path, ns_map):
201 components = [c.split(':') for c in path.split('/')]
205 replaced.append(c[0])
208 replaced.append('{%s}%s' % (ns_map[ns], tag))
209 return '/'.join(replaced)
212 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
213 def _find_xpath(xpath):
214 return node.find(compat_xpath(xpath))
216 if isinstance(xpath, (str, compat_str)):
217 n = _find_xpath(xpath)
225 if default is not NO_DEFAULT:
228 name = xpath if name is None else name
229 raise ExtractorError('Could not find XML element %s' % name)
235 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
236 n = xpath_element(node, xpath, name, fatal=fatal, default=default)
237 if n is None or n == default:
240 if default is not NO_DEFAULT:
243 name = xpath if name is None else name
244 raise ExtractorError('Could not find XML element\'s text %s' % name)
250 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
251 n = find_xpath_attr(node, xpath, key)
253 if default is not NO_DEFAULT:
256 name = '%s[@%s]' % (xpath, key) if name is None else name
257 raise ExtractorError('Could not find XML attribute %s' % name)
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed
    HTML document.

    Thin convenience wrapper over get_element_by_attribute() for the
    common case of looking up by the 'id' attribute.  NOTE: the parameter
    name `id` shadows the builtin but is kept for interface compatibility.
    """
    return get_element_by_attribute('id', id, html)
268 def get_element_by_attribute(attribute, value, html):
269 """Return the content of the tag with the specified attribute in the passed HTML document"""
271 m = re.search(r'''(?xs)
273 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
275 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
279 ''' % (re.escape(attribute), re.escape(value)), html)
283 res = m.group('content')
285 if res.startswith('"') or res.startswith("'"):
288 return unescapeHTML(res)
291 class HTMLAttributeParser(compat_HTMLParser):
292 """Trivial HTML parser to gather the attributes for a single element"""
295 compat_HTMLParser.__init__(self)
297 def handle_starttag(self, tag, attrs):
298 self.attrs = dict(attrs)
301 def extract_attributes(html_element):
302 """Given a string for an HTML element such as
304 a="foo" B="bar" c="&98;az" d=boz
305 empty= noval entity="&"
308 Decode and return a dictionary of attributes.
310 'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
311 'empty': '', 'noval': None, 'entity': '&',
312 'sq': '"', 'dq': '\''
314 NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
315 but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
317 parser = HTMLAttributeParser()
318 parser.feed(html_element)
323 def clean_html(html):
324 """Clean an HTML snippet into a readable string"""
326 if html is None: # Convenience for sanitizing descriptions etc.
330 html = html.replace('\n', ' ')
331 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
332 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
334 html = re.sub('<.*?>', '', html)
335 # Replace html entities
336 html = unescapeHTML(html)
340 def sanitize_open(filename, open_mode):
341 """Try to open the given filename, and slightly tweak it if this fails.
343 Attempts to open the given filename. If this fails, it tries to change
344 the filename slightly, step by step, until it's either able to open it
345 or it fails and raises a final exception, like the standard open()
348 It returns the tuple (stream, definitive_file_name).
352 if sys.platform == 'win32':
354 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
355 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
356 stream = open(encodeFilename(filename), open_mode)
357 return (stream, filename)
358 except (IOError, OSError) as err:
359 if err.errno in (errno.EACCES,):
362 # In case of error, try to remove win32 forbidden chars
363 alt_filename = sanitize_path(filename)
364 if alt_filename == filename:
367 # An exception here should be caught in the caller
368 stream = open(encodeFilename(alt_filename), open_mode)
369 return (stream, alt_filename)
372 def timeconvert(timestr):
373 """Convert RFC 2822 defined time string into system timestamp"""
375 timetuple = email.utils.parsedate_tz(timestr)
376 if timetuple is not None:
377 timestamp = email.utils.mktime_tz(timetuple)
381 def sanitize_filename(s, restricted=False, is_id=False):
382 """Sanitizes a string so it could be used as part of a filename.
383 If restricted is set, use a stricter subset of allowed characters.
384 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
386 def replace_insane(char):
387 if restricted and char in ACCENT_CHARS:
388 return ACCENT_CHARS[char]
389 if char == '?' or ord(char) < 32 or ord(char) == 127:
392 return '' if restricted else '\''
394 return '_-' if restricted else ' -'
395 elif char in '\\/|*<>':
397 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
399 if restricted and ord(char) > 127:
404 s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
405 result = ''.join(map(replace_insane, s))
407 while '__' in result:
408 result = result.replace('__', '_')
409 result = result.strip('_')
410 # Common case of "Foreign band name - English song title"
411 if restricted and result.startswith('-_'):
413 if result.startswith('-'):
414 result = '_' + result[len('-'):]
415 result = result.lstrip('.')
421 def sanitize_path(s):
422 """Sanitizes and normalizes path on Windows"""
423 if sys.platform != 'win32':
425 drive_or_unc, _ = os.path.splitdrive(s)
426 if sys.version_info < (2, 7) and not drive_or_unc:
427 drive_or_unc, _ = os.path.splitunc(s)
428 norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
432 path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part)
433 for path_part in norm_path]
435 sanitized_path.insert(0, drive_or_unc + os.path.sep)
436 return os.path.join(*sanitized_path)
# Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
# unwanted failures due to missing protocol
def sanitize_url(url):
    """Give scheme-relative URLs ('//host/path') an explicit http: scheme;
    return every other URL unchanged."""
    if url.startswith('//'):
        return 'http:%s' % url
    return url
def sanitized_Request(url, *args, **kwargs):
    """Build a urllib Request after normalizing scheme-relative URLs via
    sanitize_url(); all other arguments are forwarded unchanged."""
    clean_url = sanitize_url(url)
    return compat_urllib_request.Request(clean_url, *args, **kwargs)
449 def orderedSet(iterable):
450 """ Remove all duplicates from the input iterable """
458 def _htmlentity_transform(entity):
459 """Transforms an HTML entity to a character."""
460 # Known non-numeric HTML entity
461 if entity in compat_html_entities.name2codepoint:
462 return compat_chr(compat_html_entities.name2codepoint[entity])
464 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
466 numstr = mobj.group(1)
467 if numstr.startswith('x'):
469 numstr = '0%s' % numstr
472 # See https://github.com/rg3/youtube-dl/issues/7518
474 return compat_chr(int(numstr, base))
478 # Unknown entity in name, return its literal representation
479 return '&%s;' % entity
485 assert type(s) == compat_str
488 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
491 def get_subprocess_encoding():
492 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
493 # For subprocess calls, encode with locale encoding
494 # Refer to http://stackoverflow.com/a/9951851/35070
495 encoding = preferredencoding()
497 encoding = sys.getfilesystemencoding()
503 def encodeFilename(s, for_subprocess=False):
505 @param s The name of the file
508 assert type(s) == compat_str
510 # Python 3 has a Unicode API
511 if sys.version_info >= (3, 0):
514 # Pass '' directly to use Unicode APIs on Windows 2000 and up
515 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
516 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
517 if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
520 # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
521 if sys.platform.startswith('java'):
524 return s.encode(get_subprocess_encoding(), 'ignore')
527 def decodeFilename(b, for_subprocess=False):
529 if sys.version_info >= (3, 0):
532 if not isinstance(b, bytes):
535 return b.decode(get_subprocess_encoding(), 'ignore')
def encodeArgument(s):
    """Encode a command-line argument for passing to a subprocess."""
    if not isinstance(s, compat_str):
        # Legacy byte-string input from old call sites; promote it to text
        # first.  TODO: turn this into an assertion once all post
        # processors consistently pass compat_str.
        s = s.decode('ascii')
    return encodeFilename(s, True)
def decodeArgument(b):
    """Decode a subprocess command-line argument back into text, using the
    subprocess-specific filename decoding."""
    return decodeFilename(b, True)
551 def decodeOption(optval):
554 if isinstance(optval, bytes):
555 optval = optval.decode(preferredencoding())
557 assert isinstance(optval, compat_str)
561 def formatSeconds(secs):
563 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
565 return '%d:%02d' % (secs // 60, secs % 60)
570 def make_HTTPS_handler(params, **kwargs):
571 opts_no_check_certificate = params.get('nocheckcertificate', False)
572 if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9
573 context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
574 if opts_no_check_certificate:
575 context.check_hostname = False
576 context.verify_mode = ssl.CERT_NONE
578 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
581 # (create_default_context present but HTTPSHandler has no context=)
584 if sys.version_info < (3, 2):
585 return YoutubeDLHTTPSHandler(params, **kwargs)
587 context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
588 context.verify_mode = (ssl.CERT_NONE
589 if opts_no_check_certificate
590 else ssl.CERT_REQUIRED)
591 context.set_default_verify_paths()
592 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
595 def bug_reports_message():
596 if ytdl_is_updateable():
597 update_cmd = 'type youtube-dl -U to update'
599 update_cmd = 'see https://yt-dl.org/update on how to update'
600 msg = '; please report this issue on https://yt-dl.org/bug .'
601 msg += ' Make sure you are using the latest version; %s.' % update_cmd
602 msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
606 class ExtractorError(Exception):
607 """Error during info extraction."""
609 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
610 """ tb, if given, is the original traceback (so that it can be printed out).
611 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
614 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
616 if video_id is not None:
617 msg = video_id + ': ' + msg
619 msg += ' (caused by %r)' % cause
621 msg += bug_reports_message()
622 super(ExtractorError, self).__init__(msg)
625 self.exc_info = sys.exc_info() # preserve original exception
627 self.video_id = video_id
629 def format_traceback(self):
630 if self.traceback is None:
632 return ''.join(traceback.format_tb(self.traceback))
class UnsupportedError(ExtractorError):
    """Raised for URLs that no extractor is able to handle; always an
    'expected' error (not a youtube-dl bug)."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
642 class RegexNotFoundError(ExtractorError):
643 """Error when a regex didn't match"""
647 class DownloadError(Exception):
648 """Download Error exception.
650 This exception may be thrown by FileDownloader objects if they are not
651 configured to continue on errors. They will contain the appropriate
655 def __init__(self, msg, exc_info=None):
656 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
657 super(DownloadError, self).__init__(msg)
658 self.exc_info = exc_info
661 class SameFileError(Exception):
662 """Same File exception.
664 This exception will be thrown by FileDownloader objects if they detect
665 multiple files would have to be downloaded to the same file on disk.
670 class PostProcessingError(Exception):
671 """Post Processing exception.
673 This exception may be raised by PostProcessor's .run() method to
674 indicate an error in the postprocessing task.
677 def __init__(self, msg):
681 class MaxDownloadsReached(Exception):
682 """ --max-downloads limit has been reached. """
686 class UnavailableVideoError(Exception):
687 """Unavailable Format exception.
689 This exception will be thrown when a video is requested
690 in a format that is not available for that video.
695 class ContentTooShortError(Exception):
696 """Content Too Short exception.
698 This exception may be raised by FileDownloader objects when a file they
699 download is too small for what the server announced first, indicating
700 the connection was probably interrupted.
703 def __init__(self, downloaded, expected):
705 self.downloaded = downloaded
706 self.expected = expected
709 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
710 # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
711 # expected HTTP responses to meet HTTP/1.0 or later (see also
712 # https://github.com/rg3/youtube-dl/issues/6727)
713 if sys.version_info < (3, 0):
714 kwargs[b'strict'] = True
715 hc = http_class(*args, **kwargs)
716 source_address = ydl_handler._params.get('source_address')
717 if source_address is not None:
718 sa = (source_address, 0)
719 if hasattr(hc, 'source_address'): # Python 2.7+
720 hc.source_address = sa
722 def _hc_connect(self, *args, **kwargs):
723 sock = compat_socket_create_connection(
724 (self.host, self.port), self.timeout, sa)
726 self.sock = ssl.wrap_socket(
727 sock, self.key_file, self.cert_file,
728 ssl_version=ssl.PROTOCOL_TLSv1)
731 hc.connect = functools.partial(_hc_connect, hc)
def handle_youtubedl_headers(headers):
    """Apply youtube-dl's internal marker headers and strip them out.

    The 'Youtubedl-no-compression' marker asks us not to advertise
    compression support: when present, every 'Accept-Encoding' header is
    dropped along with the marker itself.  Without the marker, the input
    mapping is returned untouched.
    """
    result = headers
    if 'Youtubedl-no-compression' in result:
        stripped = {}
        for name, value in result.items():
            if name.lower() != 'accept-encoding':
                stripped[name] = value
        del stripped['Youtubedl-no-compression']
        result = stripped
    return result
746 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
747 """Handler for HTTP requests and responses.
749 This class, when installed with an OpenerDirector, automatically adds
750 the standard headers to every HTTP request and handles gzipped and
751 deflated responses from web servers. If compression is to be avoided in
752 a particular request, the original request in the program code only has
753 to include the HTTP header "Youtubedl-no-compression", which will be
754 removed before making the real request.
756 Part of this code was copied from:
758 http://techknack.net/python-urllib2-handlers/
760 Andrew Rowls, the author of that code, agreed to release it to the
764 def __init__(self, params, *args, **kwargs):
765 compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
766 self._params = params
768 def http_open(self, req):
769 conn_class = compat_http_client.HTTPConnection
771 socks_proxy = req.headers.get('Ytdl-socks-proxy')
773 conn_class = make_socks_conn_class(conn_class, socks_proxy)
774 del req.headers['Ytdl-socks-proxy']
776 return self.do_open(functools.partial(
777 _create_http_connection, self, conn_class, False),
783 return zlib.decompress(data, -zlib.MAX_WBITS)
785 return zlib.decompress(data)
788 def addinfourl_wrapper(stream, headers, url, code):
789 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
790 return compat_urllib_request.addinfourl(stream, headers, url, code)
791 ret = compat_urllib_request.addinfourl(stream, headers, url)
795 def http_request(self, req):
796 # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
797 # always respected by websites, some tend to give out URLs with non percent-encoded
798 # non-ASCII characters (see telemb.py, ard.py [#3412])
799 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
800 # To work around aforementioned issue we will replace request's original URL with
801 # percent-encoded one
802 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
803 # the code of this workaround has been moved here from YoutubeDL.urlopen()
804 url = req.get_full_url()
805 url_escaped = escape_url(url)
807 # Substitute URL if any change after escaping
808 if url != url_escaped:
809 req = update_Request(req, url=url_escaped)
811 for h, v in std_headers.items():
812 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
813 # The dict keys are capitalized because of this bug by urllib
814 if h.capitalize() not in req.headers:
817 req.headers = handle_youtubedl_headers(req.headers)
819 if sys.version_info < (2, 7) and '#' in req.get_full_url():
820 # Python 2.6 is brain-dead when it comes to fragments
821 req._Request__original = req._Request__original.partition('#')[0]
822 req._Request__r_type = req._Request__r_type.partition('#')[0]
826 def http_response(self, req, resp):
829 if resp.headers.get('Content-encoding', '') == 'gzip':
830 content = resp.read()
831 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
833 uncompressed = io.BytesIO(gz.read())
834 except IOError as original_ioerror:
835 # There may be junk add the end of the file
836 # See http://stackoverflow.com/q/4928560/35070 for details
837 for i in range(1, 1024):
839 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
840 uncompressed = io.BytesIO(gz.read())
845 raise original_ioerror
846 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
847 resp.msg = old_resp.msg
848 del resp.headers['Content-encoding']
850 if resp.headers.get('Content-encoding', '') == 'deflate':
851 gz = io.BytesIO(self.deflate(resp.read()))
852 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
853 resp.msg = old_resp.msg
854 del resp.headers['Content-encoding']
855 # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
856 # https://github.com/rg3/youtube-dl/issues/6457).
857 if 300 <= resp.code < 400:
858 location = resp.headers.get('Location')
860 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
861 if sys.version_info >= (3, 0):
862 location = location.encode('iso-8859-1').decode('utf-8')
863 location_escaped = escape_url(location)
864 if location != location_escaped:
865 del resp.headers['Location']
866 resp.headers['Location'] = location_escaped
869 https_request = http_request
870 https_response = http_response
873 def make_socks_conn_class(base_class, socks_proxy):
874 assert issubclass(base_class, (
875 compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))
877 url_components = compat_urlparse.urlparse(socks_proxy)
878 if url_components.scheme.lower() == 'socks5':
879 socks_type = ProxyType.SOCKS5
880 elif url_components.scheme.lower() in ('socks', 'socks4'):
881 socks_type = ProxyType.SOCKS4
882 elif url_components.scheme.lower() == 'socks4a':
883 socks_type = ProxyType.SOCKS4A
887 url_components.hostname, url_components.port or 1080,
889 url_components.username, url_components.password
892 class SocksConnection(base_class):
894 self.sock = sockssocket()
895 self.sock.setproxy(*proxy_args)
896 if type(self.timeout) in (int, float):
897 self.sock.settimeout(self.timeout)
898 self.sock.connect((self.host, self.port))
900 if isinstance(self, compat_http_client.HTTPSConnection):
901 if hasattr(self, '_context'): # Python > 2.6
902 self.sock = self._context.wrap_socket(
903 self.sock, server_hostname=self.host)
905 self.sock = ssl.wrap_socket(self.sock)
907 return SocksConnection
910 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
911 def __init__(self, params, https_conn_class=None, *args, **kwargs):
912 compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
913 self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
914 self._params = params
916 def https_open(self, req):
918 conn_class = self._https_conn_class
920 if hasattr(self, '_context'): # python > 2.6
921 kwargs['context'] = self._context
922 if hasattr(self, '_check_hostname'): # python 3.x
923 kwargs['check_hostname'] = self._check_hostname
925 socks_proxy = req.headers.get('Ytdl-socks-proxy')
927 conn_class = make_socks_conn_class(conn_class, socks_proxy)
928 del req.headers['Ytdl-socks-proxy']
930 return self.do_open(functools.partial(
931 _create_http_connection, self, conn_class, True),
935 class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
936 def __init__(self, cookiejar=None):
937 compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)
939 def http_response(self, request, response):
940 # Python 2 will choke on next HTTP request in row if there are non-ASCII
941 # characters in Set-Cookie HTTP header of last response (see
942 # https://github.com/rg3/youtube-dl/issues/6769).
943 # In order to at least prevent crashing we will percent encode Set-Cookie
944 # header before HTTPCookieProcessor starts processing it.
945 # if sys.version_info < (3, 0) and response.headers:
946 # for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
947 # set_cookie = response.headers.get(set_cookie_header)
949 # set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
950 # if set_cookie != set_cookie_escaped:
951 # del response.headers[set_cookie_header]
952 # response.headers[set_cookie_header] = set_cookie_escaped
953 return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)
955 https_request = compat_urllib_request.HTTPCookieProcessor.http_request
956 https_response = http_response
959 def parse_iso8601(date_str, delimiter='T', timezone=None):
960 """ Return a UNIX timestamp from the given date """
965 date_str = re.sub(r'\.[0-9]+', '', date_str)
969 r'(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
972 timezone = datetime.timedelta()
974 date_str = date_str[:-len(m.group(0))]
975 if not m.group('sign'):
976 timezone = datetime.timedelta()
978 sign = 1 if m.group('sign') == '+' else -1
979 timezone = datetime.timedelta(
980 hours=sign * int(m.group('hours')),
981 minutes=sign * int(m.group('minutes')))
983 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
984 dt = datetime.datetime.strptime(date_str, date_format) - timezone
985 return calendar.timegm(dt.timetuple())
990 def unified_strdate(date_str, day_first=True):
991 """Return a string with the date in the format YYYYMMDD"""
997 date_str = date_str.replace(',', ' ')
998 # %z (UTC offset) is only supported in python>=3.2
999 if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
1000 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
1001 # Remove AM/PM + timezone
1002 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1004 format_expressions = [
1015 '%Y/%m/%d %H:%M:%S',
1016 '%Y-%m-%d %H:%M:%S',
1017 '%Y-%m-%d %H:%M:%S.%f',
1020 '%Y-%m-%dT%H:%M:%SZ',
1021 '%Y-%m-%dT%H:%M:%S.%fZ',
1022 '%Y-%m-%dT%H:%M:%S.%f0Z',
1023 '%Y-%m-%dT%H:%M:%S',
1024 '%Y-%m-%dT%H:%M:%S.%f',
1028 format_expressions.extend([
1033 '%d/%m/%Y %H:%M:%S',
1036 format_expressions.extend([
1041 '%m/%d/%Y %H:%M:%S',
1043 for expression in format_expressions:
1045 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
1048 if upload_date is None:
1049 timetuple = email.utils.parsedate_tz(date_str)
1051 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
1052 if upload_date is not None:
1053 return compat_str(upload_date)
1056 def determine_ext(url, default_ext='unknown_video'):
1059 guess = url.partition('?')[0].rpartition('.')[2]
1060 if re.match(r'^[A-Za-z0-9]+$', guess):
1062 # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
1063 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
1064 return guess.rstrip('/')
def subtitles_filename(filename, sub_lang, sub_format):
    """Derive the subtitle file name for a media file: the media
    extension is replaced by '<language>.<subtitle format>'."""
    # rsplit (not os.path.splitext) so a name without any dot keeps its
    # full text as the base.
    base = filename.rsplit('.', 1)[0]
    return '%s.%s.%s' % (base, sub_lang, sub_format)
1073 def date_from_str(date_str):
1075 Return a datetime object from a string in the format YYYYMMDD or
1076 (now|today)[+-][0-9](day|week|month|year)(s)?"""
1077 today = datetime.date.today()
1078 if date_str in ('now', 'today'):
1080 if date_str == 'yesterday':
1081 return today - datetime.timedelta(days=1)
1082 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
1083 if match is not None:
1084 sign = match.group('sign')
1085 time = int(match.group('time'))
1088 unit = match.group('unit')
1089 # A bad approximation?
1093 elif unit == 'year':
1097 delta = datetime.timedelta(**{unit: time})
1098 return today + delta
1099 return datetime.datetime.strptime(date_str, '%Y%m%d').date()
1102 def hyphenate_date(date_str):
1104 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1105 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1106 if match is not None:
1107 return '-'.join(match.groups())
1112 class DateRange(object):
1113 """Represents a time interval between two dates"""
1115 def __init__(self, start=None, end=None):
1116 """start and end must be strings in the format accepted by date"""
1117 if start is not None:
1118 self.start = date_from_str(start)
1120 self.start = datetime.datetime.min.date()
1122 self.end = date_from_str(end)
1124 self.end = datetime.datetime.max.date()
1125 if self.start > self.end:
1126 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
1130 """Returns a range that only contains the given day"""
1131 return cls(day, day)
1133 def __contains__(self, date):
1134 """Check if the date is in the range"""
1135 if not isinstance(date, datetime.date):
1136 date = date_from_str(date)
1137 return self.start <= date <= self.end
1140 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
1143 def platform_name():
1144 """ Returns the platform name as a compat_str """
1145 res = platform.platform()
1146 if isinstance(res, bytes):
1147 res = res.decode(preferredencoding())
1149 assert isinstance(res, compat_str)
1153 def _windows_write_string(s, out):
1154 """ Returns True if the string was written using special methods,
1155 False if it has yet to be written out."""
1156 # Adapted from http://stackoverflow.com/a/3259271/35070
1159 import ctypes.wintypes
1167 fileno = out.fileno()
1168 except AttributeError:
1169 # If the output stream doesn't have a fileno, it's virtual
1171 except io.UnsupportedOperation:
1172 # Some strange Windows pseudo files?
1174 if fileno not in WIN_OUTPUT_IDS:
1177 GetStdHandle = ctypes.WINFUNCTYPE(
1178 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
1179 (b'GetStdHandle', ctypes.windll.kernel32))
1180 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
1182 WriteConsoleW = ctypes.WINFUNCTYPE(
1183 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
1184 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
1185 ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
1186 written = ctypes.wintypes.DWORD(0)
1188 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
1189 FILE_TYPE_CHAR = 0x0002
1190 FILE_TYPE_REMOTE = 0x8000
1191 GetConsoleMode = ctypes.WINFUNCTYPE(
1192 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
1193 ctypes.POINTER(ctypes.wintypes.DWORD))(
1194 (b'GetConsoleMode', ctypes.windll.kernel32))
1195 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
1197 def not_a_console(handle):
1198 if handle == INVALID_HANDLE_VALUE or handle is None:
1200 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
1201 GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
1203 if not_a_console(h):
1206 def next_nonbmp_pos(s):
1208 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
1209 except StopIteration:
1213 count = min(next_nonbmp_pos(s), 1024)
1215 ret = WriteConsoleW(
1216 h, s, count if count else 2, ctypes.byref(written), None)
1218 raise OSError('Failed to write string')
1219 if not count: # We just wrote a non-BMP character
1220 assert written.value == 2
1223 assert written.value > 0
1224 s = s[written.value:]
# Write the text *s* to stream *out*, choosing a console-aware path on
# Windows and an explicit-encoding path for byte streams / Python 2.
def write_string(s, out=None, encoding=None):
    assert type(s) == compat_str
    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        # NOTE(review): _windows_write_string returns truthy on success —
        # presumably this branch returns early then; confirm in full source.
        if _windows_write_string(s, out):
    if ('b' in getattr(out, 'mode', '') or
            sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        # Byte stream: encode ourselves, dropping unrepresentable characters.
        byt = s.encode(encoding or preferredencoding(), 'ignore')
    elif hasattr(out, 'buffer'):
        # Python 3 text stream: write encoded bytes to the raw buffer.
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        byt = s.encode(enc, 'ignore')
        out.buffer.write(byt)
# Convert a byte string into a list of integer byte values (portable
# across Python 2, where indexing bytes yields 1-char strings, and 3).
def bytes_to_intlist(bs):
    if isinstance(bs[0], int):  # Python 3
        # Python 2 path: elements are str, so ord() is needed.
        return [ord(c) for c in bs]
# Inverse of bytes_to_intlist: pack a list of ints (0-255) into bytes.
def intlist_to_bytes(xs):
    return compat_struct_pack('%dB' % len(xs), *xs)
# Cross-platform file locking
# Defines _lock_file/_unlock_file: Win32 LockFileEx/UnlockFileEx via ctypes,
# fcntl.flock on POSIX, and raising stubs where neither is available.
if sys.platform == 'win32':
    import ctypes.wintypes

    # Win32 OVERLAPPED structure required by LockFileEx/UnlockFileEx.
    class OVERLAPPED(ctypes.Structure):
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Lock the whole file: low/high DWORD halves of the byte-range length.
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Keep the pointer alive on the file object for the later unlock call.
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # 0x2 = LOCKFILE_EXCLUSIVE_LOCK; 0x0 requests a shared lock.
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
        # Some platforms, such as Jython, is missing fcntl

    # POSIX path: advisory whole-file locks via flock(2).
    def _lock_file(f, exclusive):
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        fcntl.flock(f, fcntl.LOCK_UN)

    UNSUPPORTED_MSG = 'file locking is not supported on this platform'

    # Fallback stubs: locking is a hard error rather than a silent no-op.
    def _lock_file(f, exclusive):
        raise IOError(UNSUPPORTED_MSG)

    def _unlock_file(f):
        raise IOError(UNSUPPORTED_MSG)
# File wrapper that takes a platform lock on __enter__ and releases it
# (and closes the file) on __exit__; delegates read/write to the real file.
class locked_file(object):
    def __init__(self, filename, mode, encoding=None):
        # Only plain read/append/write modes are supported.
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)

    def __enter__(self):
        # Writers need an exclusive lock; readers can share.
        exclusive = self.mode != 'r'
            _lock_file(self.f, exclusive)

    def __exit__(self, etype, value, traceback):
            _unlock_file(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
def get_filesystem_encoding():
    """Return the filesystem encoding, defaulting to UTF-8 when unknown."""
    reported = sys.getfilesystemencoding()
    if reported is None:
        return 'utf-8'
    return reported
# Join *args* into one shell-safe command-line string, decoding any
# encodeFilename-produced byte strings first.
def shell_quote(args):
    encoding = get_filesystem_encoding()
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(pipes.quote(a))
    return ' '.join(quoted_args)
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # The payload rides in the fragment so servers never see it;
    # unsmuggle_url() strips it back out.
    sdata = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return url + '#' + sdata
# Inverse of smuggle_url(): split the fragment off and JSON-decode the
# smuggled payload; returns (url, default) when nothing was smuggled.
def unsmuggle_url(smug_url, default=None):
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    data = json.loads(jsond)
# Human-readable byte count, e.g. 1536 -> '1.50KiB' (1024-based units).
def format_bytes(bytes):
    if type(bytes) is str:
        bytes = float(bytes)
        # log base 1024 picks the unit; e.g. exponent 1 -> KiB.
        exponent = int(math.log(bytes, 1024.0))
    suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
# Parse strings like '1.5MiB' against *unit_table* ({unit: multiplier})
# and return the integer value, accepting ',' as a decimal separator.
def lookup_unit_table(unit_table, s):
    units_re = '|'.join(re.escape(u) for u in unit_table)
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
    # European-style decimal comma -> dot before float().
    num_str = m.group('num').replace(',', '.')
    mult = unit_table[m.group('unit')]
    return int(float(num_str) * mult)
# Parse a human filesize string ('5.6mb', '120KiB', ...) into bytes.
def parse_filesize(s):
    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    return lookup_unit_table(_UNIT_TABLE, s)

    # NOTE(review): the following lines belong to the sibling count parser
    # (plain digits fall through to str_to_int, otherwise a unit table).
    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)
    return lookup_unit_table(_UNIT_TABLE, s)
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """
        # 1-based month number; a lookup miss is handled by the caller scope.
        return ENGLISH_MONTH_NAMES.index(name) + 1
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        # Compare against 3-letter prefixes of the full month names.
        return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
# Escape bare '&' characters in malformed XML without double-escaping
# already-valid entities (&amp;, &lt;, numeric references, ...).
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
# Best-effort: rename the current process via prctl(PR_SET_NAME) on Linux;
# silently does nothing when libc/prctl is unavailable.
def setproctitle(title):
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):

        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
        # 15 = PR_SET_NAME
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
# Strip *start* from the beginning of *s* when present.
def remove_start(s, start):
    if s.startswith(start):
        return s[len(start):]
# Strip *end* from the end of *s* when present.
def remove_end(s, end):
        return s[:-len(end)]
# Strip one matching pair of surrounding quotes (single or double).
def remove_quotes(s):
    if s is None or len(s) < 2:
    for quote in ('"', "'", ):
        # Only remove when the same quote character wraps both ends.
        if s[0] == quote and s[-1] == quote:
def url_basename(url):
    """Return the last path segment of *url* (empty string for a bare host)."""
    url_path = compat_urlparse.urlparse(url).path
    segments = url_path.strip('/').split('/')
    return segments[-1]
# Request subclass that issues HEAD instead of GET/POST.
class HEADRequest(compat_urllib_request.Request):
    def get_method(self):
# Lenient int conversion: optional attribute fetch, scaling, and a
# *default* fallback instead of raising on bad input.
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
            v = getattr(v, get_attr, None)
        # Integer math: multiply by invscale, then floor-divide by scale.
        return int(v) * invscale // scale
def str_or_none(v, default=None):
    """Stringify *v* with compat_str, or return *default* when *v* is None."""
    if v is None:
        return default
    return compat_str(v)
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    # Drop thousands separators and stray '+' before converting.
    int_str = re.sub(r'[,\.\+]', '', int_str)
# Lenient float conversion with scaling; falls back to *default* on error.
def float_or_none(v, scale=1, invscale=1, default=None):
        return float(v) * invscale / scale
# Parse a duration string into seconds (float). Tries, in order:
# colon-separated HH:MM:SS[.ms], verbose '1d 2h 3m 4.5s' forms, and
# decimal '1.5 hours' / '90 mins' forms.
def parse_duration(s):
    if not isinstance(s, compat_basestring):

    days, hours, mins, secs, ms = [None] * 5
    m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?$', s)
        days, hours, mins, secs, ms = m.groups()
                (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
                (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
            days, hours, mins, secs, ms = m.groups()
            m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)$', s)
                hours, mins = m.groups()
    # Accumulate whichever components matched into total seconds.
        duration += float(secs)
        duration += float(mins) * 60
        duration += float(hours) * 60 * 60
        duration += float(days) * 24 * 60 * 60
        duration += float(ms)
# Insert *ext* before the real extension ('a.mp4' -> 'a.temp.mp4'); when
# the real extension is not the expected one, append instead.
def prepend_extension(filename, ext, expected_real_ext=None):
    name, real_ext = os.path.splitext(filename)
        '{0}.{1}{2}'.format(name, ext, real_ext)
        if not expected_real_ext or real_ext[1:] == expected_real_ext
        else '{0}.{1}'.format(filename, ext))
# Swap the file extension for *ext*; when the current extension is not
# the expected one, keep the full name and just append the new extension.
def replace_extension(filename, ext, expected_real_ext=None):
    name, real_ext = os.path.splitext(filename)
    return '{0}.{1}'.format(
        name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
        # Probe by actually running it; output is discarded via PIPEs.
        subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
        # stderr merged into stdout: many tools print their version there.
        out, _ = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
# Extract a version string from tool output; *unrecognized* is returned
# when no match is found.
def detect_exe_version(output, version_re=None, unrecognized='present'):
    assert isinstance(output, compat_str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
# Abstract base for lazily-paged result lists; subclasses implement
# getslice(start, end).
class PagedList(object):
        # This is only useful for tests
        return len(self.getslice())
# PagedList that fetches pages on demand from *pagefunc*, with optional
# per-page caching, and stops as soon as the requested slice is complete.
class OnDemandPagedList(PagedList):
    def __init__(self, pagefunc, pagesize, use_cache=False):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache

    def getslice(self, start=0, end=None):
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            # Page entirely before the requested range: skip it.
            if start >= nextfirstid:

                page_results = self._cache.get(pagenum)
            if page_results is None:
                page_results = list(self._pagefunc(pagenum))
                self._cache[pagenum] = page_results

            # Offsets of the requested slice within this page (when the
            # slice boundary lands inside it).
                start % self._pagesize
                if firstid <= start < nextfirstid

                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
# PagedList variant where the total page count is known in advance, so the
# page range to fetch can be computed up front.
class InAdvancePagedList(PagedList):
    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        start_page = start // self._pagesize
            self._pagecount if end is None else (end // self._pagesize + 1))
        # Elements to drop from the first page / total still wanted.
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
                page = page[skip_elems:]
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                    page = page[:only_more]
# Decode \UXXXXXXXX escape sequences embedded in *s*.
def uppercase_escape(s):
    unicode_escape = codecs.getdecoder('unicode_escape')
        r'\\U[0-9a-fA-F]{8}',
        lambda m: unicode_escape(m.group(0))[0],
# Decode \uXXXX escape sequences embedded in *s*.
def lowercase_escape(s):
    unicode_escape = codecs.getdecoder('unicode_escape')
        r'\\u[0-9a-fA-F]{4}',
        lambda m: unicode_escape(m.group(0))[0],
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # Everything outside this safe set gets percent-encoded.
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    if sys.version_info < (3, 0) and isinstance(s, compat_str):
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, safe_chars)
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    url_parsed = compat_urllib_parse_urlparse(url)
    # Hostname is IDNA-encoded; the other components are percent-escaped.
    return url_parsed._replace(
        netloc=url_parsed.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(url_parsed.path),
        params=escape_rfc3986(url_parsed.params),
        query=escape_rfc3986(url_parsed.query),
        fragment=escape_rfc3986(url_parsed.fragment)
# Read a batch file of URLs, normalizing encoding/BOM and skipping
# comment lines; closes *batch_fd* when done.
def read_batch_urls(batch_fd):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = '\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        # Comment markers; ']' guards against PLS playlist headers.
        if url.startswith(('#', ';', ']')):

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode the given data and return ASCII bytes suitable for POSTing."""
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
# Merge *query* into the URL's existing query string and rebuild the URL.
def update_url_query(url, query):
    parsed_url = compat_urlparse.urlparse(url)
    qs = compat_parse_qs(parsed_url.query)
    return compat_urlparse.urlunparse(parsed_url._replace(
        query=compat_urllib_parse_urlencode(qs, True)))
# Clone a urllib Request, optionally overriding url/data/headers/query,
# preserving HEAD-ness and the optional timeout attribute.
# NOTE(review): the {} defaults are never mutated here, so they are safe.
def update_Request(req, url=None, data=None, headers={}, query={}):
    req_headers = req.headers.copy()
    req_headers.update(headers)
    req_data = data or req.data
    req_url = update_url_query(url or req.get_full_url(), query)
    # Keep HEAD requests HEAD after cloning.
    req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
        req_url, data=req_data, headers=req_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
# Fetch the first usable value for one key or a sequence of candidate keys;
# by default falsy values are skipped as well as missing/None ones.
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    if isinstance(key_or_keys, (list, tuple)):
        for key in key_or_keys:
            if key not in d or d[key] is None or skip_false_values and not d[key]:
    return d.get(key_or_keys, default)
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return *string* as compat_str, decoding byte strings with *encoding*."""
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
# Parse an age limit like '18' or '18+'; falls back to the US_RATINGS
# lookup table for strings such as 'PG-13'.
def parse_age_limit(s):
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    return int(m.group('age')) if m else US_RATINGS.get(s)
# Strip a JSONP wrapper callback(...) down to the bare JSON payload.
def strip_jsonp(code):
        r'(?s)^[a-zA-Z0-9_.]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
# Best-effort conversion of JavaScript object literals to valid JSON:
# normalizes quoting of strings/identifiers and drops trailing commas.
def js_to_json(code):
        if v in ('true', 'false', 'null'):
        if v.startswith('"'):
            v = re.sub(r"\\'", "'", v[1:-1])
        elif v.startswith("'"):
            # Translate single-quoted escapes into double-quote JSON escapes.
            v = re.sub(r"\\\\|\\'|\"", lambda m: {

    res = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
        [a-zA-Z_][.a-zA-Z_0-9]*
    # Remove trailing commas before closing brackets/braces.
    res = re.sub(r',(\s*[\]}])', lambda m: m.group(1), res)
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
            # Position in the list is the quality rank.
            return quality_ids.index(qid)
# Output filename template used when the user does not supply one.
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
def limit_length(s, length):
    """ Add ellipses to overly long strings """
        # Truncate so that the result including the ellipses fits *length*.
        return s[:length - len(ELLIPSES)] + ELLIPSES
def version_tuple(v):
    """Split a dotted/dashed version string into a tuple of ints."""
    return tuple(map(int, re.split(r'[-.]', v)))
# Compare two version strings; unparsable/missing input yields the
# *assume_new* default rather than raising.
def is_outdated_version(version, limit, assume_new=True):
        return not assume_new
        return version_tuple(version) < version_tuple(limit)
        return not assume_new
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter
    # Updatable only when running from a zip bundle or a frozen exe.
    return isinstance(globals().get('__loader__'), zipimporter) or hasattr(sys, 'frozen')
def args_to_str(args):
    # Get a short string representation for a subprocess command
    quoted = [compat_shlex_quote(arg) for arg in args]
    return ' '.join(quoted)
# Stringify an exception safely on both Python 2 and 3.
def error_to_compat_str(err):
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    if sys.version_info[0] < 3:
        err_str = err_str.decode(preferredencoding())
# Map a MIME type to a conventional file extension; unknown subtypes
# fall back to the bare subtype string.
def mimetype2ext(mt):
    _, _, res = mt.rpartition('/')
        'smptett+xml': 'tt',
        'x-mp4-fragmented': 'mp4',
# Guess the file extension of a response: Content-Disposition filename
# first, then the Content-Type MIME mapping.
def urlhandle_detect_ext(url_handle):
        getheader = lambda h: url_handle.headers[h]
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader

    cd = getheader('Content-Disposition')
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
            e = determine_ext(m.group('filename'), default_ext=None)

    return mimetype2ext(getheader('Content-Type'))
def encode_data_uri(data, mime_type):
    """Build an RFC 2397 data: URI with a base64-encoded payload."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    if age_limit is None:  # No limit set
    if content_limit is None:
        return False  # Content available for everyone
    # Block when the viewer's allowed age is below the content's limit.
    return age_limit < content_limit
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    # Known byte-order marks and the encodings they imply.
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            s = first_bytes[len(bom):].decode(enc, 'replace')
        # No BOM: assume UTF-8.
        s = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', s)
# Determine the download protocol for an info dict: explicit 'protocol'
# wins, then URL scheme prefixes, then extension, then the parsed scheme.
def determine_protocol(info_dict):
    protocol = info_dict.get('protocol')
    if protocol is not None:

    url = info_dict['url']
    if url.startswith('rtmp'):
    elif url.startswith('mms'):
    elif url.startswith('rtsp'):

    ext = determine_ext(url)

    return compat_urllib_parse_urlparse(url).scheme
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    rows = [header_row] + data
    # Widest cell in each column decides that column's width.
    widths = [max(len(compat_str(cell)) for cell in column) for column in zip(*rows)]
    parts = ['%-' + compat_str(width + 1) + 's' for width in widths[:-1]]
    row_format = ' '.join(parts) + '%s'
    return '\n'.join(row_format % tuple(row) for row in rows)
# Evaluate a single filter expression (e.g. 'duration > 600', 'id = x',
# '!is_live') against dict *dct*; raises ValueError for malformed parts.
def _match_one(filter_part, dct):
    COMPARISON_OPERATORS = {
    operator_rex = re.compile(r'''(?x)\s*
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
            (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
            (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
        op = COMPARISON_OPERATORS[m.group('op')]
        if m.group('strval') is not None:
            # Only equality tests make sense for strings.
            if m.group('op') not in ('=', '!='):
                    'Operator %s does not support string values!' % m.group('op'))
            comparison_value = m.group('strval')
                comparison_value = int(m.group('intval'))
                # Not a plain int: try '50k'-style suffixed sizes, with and
                # without a trailing 'B'.
                comparison_value = parse_filesize(m.group('intval'))
                if comparison_value is None:
                    comparison_value = parse_filesize(m.group('intval') + 'B')
                if comparison_value is None:
                        'Invalid integer value %r in filter part %r' % (
                            m.group('intval'), filter_part))
        actual_value = dct.get(m.group('key'))
        if actual_value is None:
            # '?' after the operator means a missing key passes the filter.
            return m.group('none_inclusive')
        return op(actual_value, comparison_value)

        '': lambda v: v is not None,
        '!': lambda v: v is None,
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
        op = UNARY_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
    # '&' joins sub-filters; all of them must pass.
        _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
# Build a match-filter callback: returns None on pass, or a human-readable
# skip message when the info dict fails *filter_str*.
def match_filter_func(filter_str):
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
# Parse a TTML/DFXP time expression ('12.5s' or 'HH:MM:SS[.f]') to seconds.
def parse_dfxp_time_expr(time_expr):
    mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
        return float(mobj.group('time_offset'))

    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
        # Frame-style ':NN' suffixes are treated as a decimal fraction.
        return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
def srt_subtitles_timecode(seconds):
    """Format a duration in seconds as an SRT timecode (HH:MM:SS,mmm)."""
    hrs = seconds / 3600
    mins = (seconds % 3600) / 60
    secs = seconds % 60
    msecs = (seconds % 1) * 1000
    # %d truncates each float component toward zero, as the original did.
    return '%02d:%02d:%02d,%03d' % (hrs, mins, secs, msecs)
# Convert DFXP/TTML subtitle XML into SRT text, tolerating the several
# TTML namespace variants.
def dfxp2srt(dfxp_data):
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
        'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',

    # Target parser that flattens a <p> node to plain text, turning <br/>
    # (in any supported namespace) into newlines.
    class TTMLPElementParser(object):
        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):

        def data(self, data):

            return self.out.strip()

    def parse_node(node):
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))

    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall(_x('.//ttaf1_0604:p')) or dfxp.findall('.//p')
        raise ValueError('Invalid dfxp/TTML subtitle')

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            # Derive the end from begin + dur when no explicit end is given.
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
def cli_option(params, command_option, param):
    """Return [command_option, value] when params[param] is set, else []."""
    value = params.get(param)
    if value is None:
        return []
    return [command_option, value]
# Render a boolean option either as two argv items or, with *separator*,
# as a single 'opt<sep>value' item.
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    param = params.get(param)
    assert isinstance(param, bool)
        return [command_option + separator + (true_value if param else false_value)]
    return [command_option, true_value if param else false_value]
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Emit the bare flag *command_option* when params[param] == expected_value."""
    value = params.get(param)
    if value == expected_value:
        return [command_option]
    return []
# Fetch a list-valued option from *params*, falling back to *default*.
def cli_configuration_args(params, param, default=[]):
    ex_args = params.get(param)
    assert isinstance(ex_args, list)
# Helpers for converting between ISO 639-1 (2-letter) and ISO 639-2/T
# (3-letter) language codes via the class-level _lang_map table.
class ISO639Utils(object):
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt

    # NOTE(review): takes cls — presumably decorated @classmethod in full source.
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        return cls._lang_map.get(code[:2])

    # NOTE(review): takes cls — presumably decorated @classmethod in full source.
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        # Reverse lookup: scan the map for the matching long code.
        for short_name, long_name in cls._lang_map.items():
            if long_name == code:
# Lookup of ISO 3166 alpha-2 country codes to full English country names.
class ISO3166Utils(object):
    # From http://data.okfn.org/data/core/country-list
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AS': 'American Samoa',
        'AG': 'Antigua and Barbuda',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BV': 'Bouvet Island',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BF': 'Burkina Faso',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CI': 'Côte d\'Ivoire',
        'CZ': 'Czech Republic',
        'DO': 'Dominican Republic',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GW': 'Guinea-Bissau',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'IR': 'Iran, Islamic Republic of',
        'IM': 'Isle of Man',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'LA': 'Lao People\'s Democratic Republic',
        'LI': 'Liechtenstein',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MH': 'Marshall Islands',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'PS': 'Palestine, State of',
        'PG': 'Papua New Guinea',
        'PH': 'Philippines',
        'PR': 'Puerto Rico',
        'RU': 'Russian Federation',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SL': 'Sierra Leone',
        'SX': 'Sint Maarten (Dutch part)',
        'SB': 'Solomon Islands',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'SJ': 'Svalbard and Jan Mayen',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TZ': 'Tanzania, United Republic of',
        'TL': 'Timor-Leste',
        'TT': 'Trinidad and Tobago',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',

    # NOTE(review): takes cls — presumably decorated @classmethod in full source.
    def short2full(cls, code):
        """Convert an ISO 3166-2 country code to the corresponding full name"""
        return cls._country_map.get(code.upper())
# ProxyHandler variant that lets each request override the proxy via the
# internal 'Ytdl-request-proxy' header; SOCKS proxies are delegated to the
# http/https handlers through the 'Ytdl-socks-proxy' header.
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    def __init__(self, proxies=None):
        # Set default handlers
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            # Consume the internal header so it never leaves the process.
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # youtube-dl's http/https handlers do wrapping the socket with socks
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
# Textbook RSA over a single block: little-endian payload, pow(m, e, N),
# result rendered as a lowercase hex string.
def ohdave_rsa_encrypt(data, exponent, modulus):
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    data: data to encrypt, bytes-like object
    exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only

    payload = int(binascii.hexlify(data[::-1]), 16)
    encrypted = pow(payload, exponent, modulus)
    return '%x' % encrypted
# Render *num* in base *n* using *table* as the digit alphabet (defaults
# to 0-9a-zA-Z, so n may be at most 62 without a custom table).
def encode_base_n(num, n, table=None):
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
        table = FULL_TABLE[:n]

        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

        # Build the representation least-significant digit first.
        ret = table[num % n] + ret
# Reverse Dean Edwards-style p,a,c,k,e,d JavaScript: rebuild the symbol
# table from the packed payload and substitute each word back in.
def decode_packed_codes(code):
        r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)",
    obfucasted_code, base, count, symbols = mobj.groups()
    symbols = symbols.split('|')

        # Key is *count* rendered in the pack's base; empty slots keep the key.
        base_n_count = encode_base_n(count, base)
        symbol_table[base_n_count] = symbols[count] or base_n_count

        r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],