2 # -*- coding: utf-8 -*-
4 from __future__ import unicode_literals
33 import xml.etree.ElementTree
40 compat_etree_fromstring,
46 compat_socket_create_connection,
51 compat_urllib_parse_urlencode,
52 compat_urllib_parse_urlparse,
53 compat_urllib_parse_unquote_plus,
54 compat_urllib_request,
def register_socks_protocols():
    """Add the SOCKS schemes to urlparse's netloc-aware scheme list.

    In Python < 2.6.5, urlsplit() suffers from bug
    https://bugs.python.org/issue7904: URLs whose protocols are not in
    urlparse.uses_netloc are not handled correctly, so the SOCKS schemes
    are appended here once at startup.
    """
    for proxy_scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if proxy_scheme not in compat_urlparse.uses_netloc:
            compat_urlparse.uses_netloc.append(proxy_scheme)
# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))

# NOTE(review): the following header entries belong to the default-HTTP-headers
# dict; its opening line is not visible in this chunk.
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/44.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',

# Month names used when parsing English textual dates.
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# NOTE(review): media extensions; the opening of the enclosing sequence
# literal is not visible in this chunk.
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØŒÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøœùúûüýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOO', ['OE'], 'UUUUYP', ['ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionoooooo', ['oe'], 'uuuuypy')))
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    # NOTE(review): the surrounding try/except fallback is elided in this view.
    pref = locale.getpreferredencoding()


def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """
    # NOTE(review): several lines of this function are elided in this view;
    # comments below describe only the visible lines.
    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        path_basename = lambda f: os.path.basename(fn).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
    # (else branch: Python 3 or win32 — plain os.path helpers)
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    # Keyword arguments for tempfile.NamedTemporaryFile; the enclosing dict
    # literal is not fully visible here.
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    if sys.platform == 'win32':
        # Need to remove existing file on Windows, else os.rename raises
        # WindowsError or FileExistsError.
    os.rename(tf.name, fn)
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] """
        # Only simple attribute names are accepted in the predicate.
        assert re.match(r'^[a-zA-Z_-]+$', key)
        expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
        return node.find(expr)
    # Pre-2.7 fallback variant (the `else:` introducing it and parts of its
    # body are elided in this view): scan matches and compare attributes.
    def find_xpath_attr(node, xpath, key, val=None):
        for f in node.findall(compat_xpath(xpath)):
            if key not in f.attrib:
            if val is None or f.attrib.get(key) == val:

# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter

def xpath_with_ns(path, ns_map):
    # Expand `ns:tag` path components via ns_map into `{uri}tag` form.
    # NOTE(review): some lines of this function are elided in this view.
    components = [c.split(':') for c in path.split('/')]
            replaced.append(c[0])
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    # NOTE(review): some lines of this function are elided in this view.
    def _find_xpath(xpath):
        return node.find(compat_xpath(xpath))

    # A single xpath or a collection of candidates may be given.
    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)

    # Missing element: prefer the caller-supplied default, raise otherwise.
    if default is not NO_DEFAULT:
    name = xpath if name is None else name
    raise ExtractorError('Could not find XML element %s' % name)


def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    # Like xpath_element but returns the element's text.
    # NOTE(review): some lines of this function are elided in this view.
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
    if default is not NO_DEFAULT:
    name = xpath if name is None else name
    raise ExtractorError('Could not find XML element\'s text %s' % name)


def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    # Like xpath_element but returns the value of attribute `key`.
    # NOTE(review): some lines of this function are elided in this view.
    n = find_xpath_attr(node, xpath, key)
    if default is not NO_DEFAULT:
    name = '%s[@%s]' % (xpath, key) if name is None else name
    raise ExtractorError('Could not find XML attribute %s' % name)
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # An id lookup is just an attribute lookup on the 'id' attribute.
    attribute_name = 'id'
    return get_element_by_attribute(attribute_name, id, html)
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    # NOTE(review): several lines (including parts of the regex) are elided
    # in this view.
    m = re.search(r'''(?xs)
        (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
        (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
        ''' % (re.escape(attribute), re.escape(value)), html)

    res = m.group('content')

    # Strip surrounding quotes before unescaping HTML entities.
    if res.startswith('"') or res.startswith("'"):

    return unescapeHTML(res)
class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""
    # NOTE(review): the __init__ def line is elided in this view.
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        # Record the attributes of the (single) element fed to the parser.
        self.attrs = dict(attrs)


def extract_attributes(html_element):
    """Given a string for an HTML element such as
    a="foo" B="bar" c="&98;az" d=boz
    empty= noval entity="&"
    Decode and return a dictionary of attributes:
    'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
    'empty': '', 'noval': None, 'entity': '&',
    'sq': '"', 'dq': '\''

    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    # NOTE(review): the return line is elided in this view.
    parser = HTMLAttributeParser()
    parser.feed(html_element)
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    # NOTE(review): some lines of this function are elided in this view.
    if html is None:  # Convenience for sanitizing descriptions etc.

    # Normalise line breaks: real newlines become spaces, <br> and
    # paragraph boundaries become newlines.
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip any remaining tags.
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    # NOTE(review): the try framing and '-' (stdout) handling are partly
    # elided in this view.
    if sys.platform == 'win32':
        msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
    stream = open(encodeFilename(filename), open_mode)
    return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
        # An exception here should be caught in the caller
        stream = open(encodeFilename(alt_filename), open_mode)
        return (stream, alt_filename)


def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    # NOTE(review): the surrounding None-init/return lines are elided.
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    # NOTE(review): several branches of this function are elided in this view.
    def replace_insane(char):
        # Transliterate accented characters in restricted mode.
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return '' if restricted else '\''
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
        if restricted and ord(char) > 127:

    # Keep timestamp-like digit groups readable: 12:34:56 -> 12_34_56.
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(map(replace_insane, s))
    # Collapse runs of underscores introduced by the replacements.
    while '__' in result:
        result = result.replace('__', '_')
    result = result.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
    if result.startswith('-'):
        result = '_' + result[len('-'):]
    result = result.lstrip('.')
def sanitize_path(s):
    """Sanitizes and normalizes path on Windows"""
    # NOTE(review): some lines of this function are elided in this view.
    if sys.platform != 'win32':
    drive_or_unc, _ = os.path.splitdrive(s)
    if sys.version_info < (2, 7) and not drive_or_unc:
        drive_or_unc, _ = os.path.splitunc(s)
    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    # Replace win32-forbidden characters (and a trailing dot/space) with '#'.
        path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    sanitized_path.insert(0, drive_or_unc + os.path.sep)
    return os.path.join(*sanitized_path)
440 # Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
441 # unwanted failures due to missing protocol
def sanitize_url(url):
    """Prepend 'http:' to protocol-relative URLs ('//host/...').

    Any other URL is returned unchanged.
    """
    if url.startswith('//'):
        return 'http:%s' % url
    return url
def sanitized_Request(url, *args, **kwargs):
    """Build a compat_urllib_request.Request with a scheme-fixed URL."""
    safe_url = sanitize_url(url)
    return compat_urllib_request.Request(safe_url, *args, **kwargs)
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    # NOTE(review): the body of this function is elided in this view.


def _htmlentity_transform(entity):
    """Transforms an HTML entity to a character."""
    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Numeric character references: &#123; or &#x1F4A9;
    # NOTE(review): some lines below are elided in this view.
    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            numstr = '0%s' % numstr
        # See https://github.com/rg3/youtube-dl/issues/7518
        return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity


# unescapeHTML body; its def line and the re.sub call opener are elided
# in this view.
    assert type(s) == compat_str
        r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
def get_subprocess_encoding():
    # NOTE(review): the else/return lines are elided in this view.
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
        encoding = sys.getfilesystemencoding()


def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    """
    # NOTE(review): several return lines are elided in this view.
    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):

    # Pass '' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:

    # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
    if sys.platform.startswith('java'):

    return s.encode(get_subprocess_encoding(), 'ignore')


def decodeFilename(b, for_subprocess=False):
    # Inverse of encodeFilename.
    # NOTE(review): several return lines are elided in this view.
    if sys.version_info >= (3, 0):
    if not isinstance(b, bytes):
    return b.decode(get_subprocess_encoding(), 'ignore')
def encodeArgument(s):
    """Encode a subprocess argument for the current platform.

    Byte-string input (legacy callers) is first decoded as ASCII, then the
    text is passed through encodeFilename with for_subprocess enabled.
    """
    if isinstance(s, compat_str):
        text = s
    else:
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        text = s.decode('ascii')
    return encodeFilename(text, True)
def decodeArgument(b):
    """Decode a subprocess argument (inverse of encodeArgument)."""
    return decodeFilename(b, for_subprocess=True)
def decodeOption(optval):
    # NOTE(review): guard/return lines are elided in this view.
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)


def formatSeconds(secs):
    # Render a duration as H:MM:SS or M:SS; the branching lines are
    # elided in this view.
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
        return '%d:%02d' % (secs // 60, secs % 60)
def make_HTTPS_handler(params, **kwargs):
    # NOTE(review): some lines of this function are elided in this view.
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            # Disable certificate verification on user request.
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)

    # (create_default_context present but HTTPSHandler has no context=)

    if sys.version_info < (3, 2):
        return YoutubeDLHTTPSHandler(params, **kwargs)
    # Python 3.2/3.3: build an explicit TLSv1 context.
    context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
    context.verify_mode = (ssl.CERT_NONE
                           if opts_no_check_certificate
                           else ssl.CERT_REQUIRED)
    context.set_default_verify_paths()
    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
def bug_reports_message():
    # Tailor the update hint to how this copy was installed.
    # NOTE(review): the `else:` and return lines are elided in this view.
    if ytdl_is_updateable():
        update_cmd = 'type youtube-dl -U to update'
        update_cmd = 'see https://yt-dl.org/update on how to update'
    msg = '; please report this issue on https://yt-dl.org/bug .'
    msg += ' Make sure you are using the latest version; %s.' % update_cmd
    msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """
        # NOTE(review): several assignments of this constructor are elided
        # in this view.
        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
        if video_id is not None:
            msg = video_id + ': ' + msg
            msg += ' (caused by %r)' % cause
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)

        self.exc_info = sys.exc_info()  # preserve original exception
        self.video_id = video_id

    def format_traceback(self):
        # Return the stored traceback rendered as text.
        if self.traceback is None:
        return ''.join(traceback.format_tb(self.traceback))
class UnsupportedError(ExtractorError):
    """Raised when no extractor supports the given URL."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""


class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info


class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """


class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        # NOTE(review): the constructor body is elided in this view.


class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """


class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """


class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Both counts appear to be byte sizes — TODO confirm at call sites.
        self.downloaded = downloaded
        self.expected = expected
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/rg3/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs[b'strict'] = True
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')
    if source_address is not None:
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        # NOTE(review): the pre-2.7 fallback wiring below is partially
        # elided in this view.
        def _hc_connect(self, *args, **kwargs):
            sock = compat_socket_create_connection(
                (self.host, self.port), self.timeout, sa)
                self.sock = ssl.wrap_socket(
                    sock, self.key_file, self.cert_file,
                    ssl_version=ssl.PROTOCOL_TLSv1)
        hc.connect = functools.partial(_hc_connect, hc)
def handle_youtubedl_headers(headers):
    """Translate the internal 'Youtubedl-no-compression' pseudo-header.

    When the marker is present, every 'Accept-Encoding' header (any case)
    is dropped together with the marker itself and a new dict is returned;
    otherwise the original mapping is returned unchanged.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers
    filtered = dict(
        (key, value) for key, value in headers.items()
        if key.lower() != 'accept-encoding')
    del filtered['Youtubedl-no-compression']
    return filtered
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """
    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        # Options dict, consulted later by _create_http_connection.
        self._params = params

    def http_open(self, req):
        # NOTE(review): some lines of this method are elided in this view.
        conn_class = compat_http_client.HTTPConnection

        # Route through a SOCKS proxy when the internal pseudo-header is set.
        socks_proxy = req.headers.get('Ytdl-socks-proxy')
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),

    # zlib helpers — the enclosing def lines are elided in this view:
    # raw-deflate first, then zlib-wrapped deflate as fallback.
        return zlib.decompress(data, -zlib.MAX_WBITS)
        return zlib.decompress(data)

    # Compatibility shim: newer addinfourl accepts `code` directly.
    def addinfourl_wrapper(stream, headers, url, code):
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

    def http_response(self, req, resp):
        # NOTE(review): some lines of this method are elided in this view.
        # gzip-encoded body: decompress transparently.
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk add the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate-encoded body.
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/rg3/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped

    https_request = http_request
    https_response = http_response
def make_socks_conn_class(base_class, socks_proxy):
    # Build a connection subclass that tunnels through the given SOCKS proxy.
    # NOTE(review): some lines of this function are elided in this view.
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    # Map the proxy URL scheme onto the SOCKS protocol version.
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        return compat_urllib_parse_unquote_plus(s)

    # Proxy connection parameters; the tuple constructor line is elided.
        url_components.hostname, url_components.port or 1080,
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),

    class SocksConnection(base_class):
        # connect() — its def line is elided in this view.
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS counterpart of YoutubeDLHandler with optional SOCKS support."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        # NOTE(review): some lines of this method are elided in this view.
        conn_class = self._https_conn_class

        # Propagate SSL context / hostname-check settings when available.
        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, True),
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor; see http_response for a disabled Py2 workaround."""

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #         if set_cookie != set_cookie_escaped:
        #             del response.headers[set_cookie_header]
        #             response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """
    # NOTE(review): some lines of this function are elided in this view.

    # Drop fractional seconds before strptime.
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    # Timezone suffix: 'Z' or an explicit +HH:MM / -HHMM offset.
        r'(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        timezone = datetime.timedelta()
        date_str = date_str[:-len(m.group(0))]
        if not m.group('sign'):
            timezone = datetime.timedelta()
            sign = 1 if m.group('sign') == '+' else -1
            timezone = datetime.timedelta(
                hours=sign * int(m.group('hours')),
                minutes=sign * int(m.group('minutes')))

    date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    dt = datetime.datetime.strptime(date_str, date_format) - timezone
    return calendar.timegm(dt.timetuple())
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""
    # NOTE(review): some lines of this function are elided in this view.
    if date_str is None:

    # Commas are treated as plain separators.
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
        date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    format_expressions = [
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S.%f',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
    # day_first-dependent expressions (branch lines elided in this view).
    format_expressions.extend([
        '%d/%m/%Y %H:%M:%S',
    format_expressions.extend([
        '%m/%d/%Y %H:%M:%S',
    for expression in format_expressions:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
    if upload_date is None:
        # Last resort: RFC 2822-style parsing.
        timetuple = email.utils.parsedate_tz(date_str)
            upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    if upload_date is not None:
        return compat_str(upload_date)
def determine_ext(url, default_ext='unknown_video'):
    """Guess a file extension from a URL; fall back to default_ext."""
    # NOTE(review): guard/return lines are elided in this view.
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    elif guess.rstrip('/') in KNOWN_EXTENSIONS:
        return guess.rstrip('/')
def subtitles_filename(filename, sub_lang, sub_format):
    """Derive a subtitle filename: media extension replaced by '<lang>.<format>'."""
    base = filename.rsplit('.', 1)[0]
    return '%s.%s.%s' % (base, sub_lang, sub_format)
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    # NOTE(review): some branches are elided in this view.
    today = datetime.date.today()
    if date_str in ('now', 'today'):
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        unit = match.group('unit')
        # A bad approximation?
        elif unit == 'year':
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, '%Y%m%d').date()


def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    # NOTE(review): the fallthrough return is elided in this view.
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is not None:
        return '-'.join(match.groups())
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        # NOTE(review): the else-branch lines are elided in this view;
        # None defaults apparently fall back to datetime min/max dates.
        if start is not None:
            self.start = date_from_str(start)
            self.start = datetime.datetime.min.date()
            self.end = date_from_str(end)
            self.end = datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    # `day` classmethod body — its def/decorator lines are elided in this view.
        """Returns a range that only contains the given day"""
        return cls(day, day)
1140 def __contains__(self, date):
1141 """Check if the date is in the range"""
1142 if not isinstance(date, datetime.date):
1143 date = date_from_str(date)
1144 return self.start <= date <= self.end
        # __str__ body — its def line is elided in this view.
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())


def platform_name():
    """ Returns the platform name as a compat_str """
    # NOTE(review): the final return line is elided in this view.
    res = platform.platform()
    if isinstance(res, bytes):
        res = res.decode(preferredencoding())

    assert isinstance(res, compat_str)
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070
    # NOTE(review): several lines of this function are elided in this view.

    import ctypes.wintypes

        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
    if fileno not in WIN_OUTPUT_IDS:

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b'GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b'GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        if handle == INVALID_HANDLE_VALUE or handle is None:
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):

    def next_nonbmp_pos(s):
        # Index of the first character outside the Basic Multilingual Plane.
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:

    # Write in chunks, stopping before any non-BMP character.
    count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            assert written.value > 0
            s = s[written.value:]


def write_string(s, out=None, encoding=None):
    # NOTE(review): some lines of this function are elided in this view.
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):

    if ('b' in getattr(out, 'mode', '') or
            sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        byt = s.encode(encoding or preferredencoding(), 'ignore')
    elif hasattr(out, 'buffer'):
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        byt = s.encode(enc, 'ignore')
        out.buffer.write(byt)
1257 def bytes_to_intlist(bs):
1260 if isinstance(bs[0], int): # Python 3
1263 return [ord(c) for c in bs]
1266 def intlist_to_bytes(xs):
1269 return compat_struct_pack('%dB' % len(xs), *xs)
1272 # Cross-platform file locking
1273 if sys.platform == 'win32':
1274 import ctypes.wintypes
1277 class OVERLAPPED(ctypes.Structure):
1279 ('Internal', ctypes.wintypes.LPVOID),
1280 ('InternalHigh', ctypes.wintypes.LPVOID),
1281 ('Offset', ctypes.wintypes.DWORD),
1282 ('OffsetHigh', ctypes.wintypes.DWORD),
1283 ('hEvent', ctypes.wintypes.HANDLE),
1286 kernel32 = ctypes.windll.kernel32
1287 LockFileEx = kernel32.LockFileEx
1288 LockFileEx.argtypes = [
1289 ctypes.wintypes.HANDLE, # hFile
1290 ctypes.wintypes.DWORD, # dwFlags
1291 ctypes.wintypes.DWORD, # dwReserved
1292 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1293 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1294 ctypes.POINTER(OVERLAPPED) # Overlapped
1296 LockFileEx.restype = ctypes.wintypes.BOOL
1297 UnlockFileEx = kernel32.UnlockFileEx
1298 UnlockFileEx.argtypes = [
1299 ctypes.wintypes.HANDLE, # hFile
1300 ctypes.wintypes.DWORD, # dwReserved
1301 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1302 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1303 ctypes.POINTER(OVERLAPPED) # Overlapped
1305 UnlockFileEx.restype = ctypes.wintypes.BOOL
1306 whole_low = 0xffffffff
1307 whole_high = 0x7fffffff
1309 def _lock_file(f, exclusive):
1310 overlapped = OVERLAPPED()
1311 overlapped.Offset = 0
1312 overlapped.OffsetHigh = 0
1313 overlapped.hEvent = 0
1314 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1315 handle = msvcrt.get_osfhandle(f.fileno())
1316 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
1317 whole_low, whole_high, f._lock_file_overlapped_p):
1318 raise OSError('Locking file failed: %r' % ctypes.FormatError())
1320 def _unlock_file(f):
1321 assert f._lock_file_overlapped_p
1322 handle = msvcrt.get_osfhandle(f.fileno())
1323 if not UnlockFileEx(handle, 0,
1324 whole_low, whole_high, f._lock_file_overlapped_p):
1325 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1328 # Some platforms, such as Jython, is missing fcntl
1332 def _lock_file(f, exclusive):
1333 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1335 def _unlock_file(f):
1336 fcntl.flock(f, fcntl.LOCK_UN)
1338 UNSUPPORTED_MSG = 'file locking is not supported on this platform'
1340 def _lock_file(f, exclusive):
1341 raise IOError(UNSUPPORTED_MSG)
1343 def _unlock_file(f):
1344 raise IOError(UNSUPPORTED_MSG)
1347 class locked_file(object):
1348 def __init__(self, filename, mode, encoding=None):
1349 assert mode in ['r', 'a', 'w']
1350 self.f = io.open(filename, mode, encoding=encoding)
1353 def __enter__(self):
1354 exclusive = self.mode != 'r'
1356 _lock_file(self.f, exclusive)
1362 def __exit__(self, etype, value, traceback):
1364 _unlock_file(self.f)
1371 def write(self, *args):
1372 return self.f.write(*args)
1374 def read(self, *args):
1375 return self.f.read(*args)
def get_filesystem_encoding():
    """Return the filesystem encoding, falling back to 'utf-8' when the
    platform reports none."""
    enc = sys.getfilesystemencoding()
    if enc is None:
        return 'utf-8'
    return enc
1383 def shell_quote(args):
1385 encoding = get_filesystem_encoding()
1387 if isinstance(a, bytes):
1388 # We may get a filename encoded with 'encodeFilename'
1389 a = a.decode(encoding)
1390 quoted_args.append(pipes.quote(a))
1391 return ' '.join(quoted_args)
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """

    # Serialize the payload as JSON and hide it in the URL fragment,
    # urlencoded so it survives as a single query-style parameter.
    fragment = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return url + '#' + fragment
def unsmuggle_url(smug_url, default=None):
    # Inverse of smuggle_url(): extract the JSON payload hidden in the
    # URL fragment; returns (url, default) when nothing was smuggled.
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    # The fragment is urlencoded; parse it and decode the JSON payload.
    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    data = json.loads(jsond)
def format_bytes(bytes):
    # Render a byte count human-readably, e.g. 1536 -> '1.50KiB'.
    # NOTE: the parameter shadows the 'bytes' builtin; kept for compatibility.
    if type(bytes) is str:
        bytes = float(bytes)
    # Pick the largest binary (1024-based) unit that fits the value.
    exponent = int(math.log(bytes, 1024.0))
    suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
def lookup_unit_table(unit_table, s):
    # Parse a string like '1.2MiB' against *unit_table* (unit -> multiplier)
    # and return the value in base units as an int.
    units_re = '|'.join(re.escape(u) for u in unit_table)
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
    # Accept ',' as a decimal separator in addition to '.'.
    num_str = m.group('num').replace(',', '.')
    mult = unit_table[m.group('unit')]
    return int(float(num_str) * mult)
1436 def parse_filesize(s):
1440 # The lower-case forms are of course incorrect and unofficial,
1441 # but we support those too
1479 return lookup_unit_table(_UNIT_TABLE, s)
1488 if re.match(r'^[\d,.]+$', s):
1489 return str_to_int(s)
1500 return lookup_unit_table(_UNIT_TABLE, s)
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """
    # .index() raises ValueError for unknown month names — presumably the
    # caller handles (or a surrounding try/except does); TODO confirm.
    return ENGLISH_MONTH_NAMES.index(name) + 1
1512 def month_by_abbreviation(abbrev):
1513 """ Return the number of a month by (locale-independently) English
1517 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
1522 def fix_xml_ampersands(xml_str):
1523 """Replace all the '&' by '&' in XML"""
1525 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1530 def setproctitle(title):
1531 assert isinstance(title, compat_str)
1533 # ctypes in Jython is not complete
1534 # http://bugs.jython.org/issue2148
1535 if sys.platform.startswith('java'):
1539 libc = ctypes.cdll.LoadLibrary('libc.so.6')
1542 title_bytes = title.encode('utf-8')
1543 buf = ctypes.create_string_buffer(len(title_bytes))
1544 buf.value = title_bytes
1546 libc.prctl(15, buf, 0, 0, 0)
1547 except AttributeError:
1548 return # Strange libc, just skip this
def remove_start(s, start):
    """Return *s* without the prefix *start* if present; None passes through."""
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
def remove_end(s, end):
    """Return *s* without the suffix *end* if present; None passes through."""
    if s is None or not s.endswith(end):
        return s
    # Slice with len(s) - len(end) rather than a negative index: the old
    # s[:-len(end)] wrongly returned '' when *end* was the empty string,
    # because s[:-0] == s[:0].
    return s[:len(s) - len(end)]
def remove_quotes(s):
    # Strip one matching pair of surrounding quotes (single or double).
    if s is None or len(s) < 2:
    for quote in ('"', "'", ):
        if s[0] == quote and s[-1] == quote:
def url_basename(url):
    """Return the last path component of *url*, ignoring query and fragment."""
    parsed_path = compat_urlparse.urlparse(url).path
    segments = parsed_path.strip('/').split('/')
    return segments[-1]
1573 class HEADRequest(compat_urllib_request.Request):
1574 def get_method(self):
1578 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
1581 v = getattr(v, get_attr, None)
1587 return int(v) * invscale // scale
def str_or_none(v, default=None):
    """Coerce *v* to text, or return *default* when *v* is None."""
    if v is None:
        return default
    return compat_str(v)
1596 def str_to_int(int_str):
1597 """ A more relaxed version of int_or_none """
1600 int_str = re.sub(r'[,\.\+]', '', int_str)
1604 def float_or_none(v, scale=1, invscale=1, default=None):
1608 return float(v) * invscale / scale
1613 def parse_duration(s):
1614 if not isinstance(s, compat_basestring):
1619 days, hours, mins, secs, ms = [None] * 5
1620 m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?$', s)
1622 days, hours, mins, secs, ms = m.groups()
1627 (?P<days>[0-9]+)\s*d(?:ays?)?\s*
1630 (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
1633 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
1636 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
1639 days, hours, mins, secs, ms = m.groups()
1641 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)$', s)
1643 hours, mins = m.groups()
1649 duration += float(secs)
1651 duration += float(mins) * 60
1653 duration += float(hours) * 60 * 60
1655 duration += float(days) * 24 * 60 * 60
1657 duration += float(ms)
1661 def prepend_extension(filename, ext, expected_real_ext=None):
1662 name, real_ext = os.path.splitext(filename)
1664 '{0}.{1}{2}'.format(name, ext, real_ext)
1665 if not expected_real_ext or real_ext[1:] == expected_real_ext
1666 else '{0}.{1}'.format(filename, ext))
1669 def replace_extension(filename, ext, expected_real_ext=None):
1670 name, real_ext = os.path.splitext(filename)
1671 return '{0}.{1}'.format(
1672 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
1676 def check_executable(exe, args=[]):
1677 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1678 args can be a list of arguments for a short output (like -version) """
1680 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1686 def get_exe_version(exe, args=['--version'],
1687 version_re=None, unrecognized='present'):
1688 """ Returns the version of the specified executable,
1689 or False if the executable is not present """
1691 out, _ = subprocess.Popen(
1692 [encodeArgument(exe)] + args,
1693 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
1696 if isinstance(out, bytes): # Python 2.x
1697 out = out.decode('ascii', 'ignore')
1698 return detect_exe_version(out, version_re, unrecognized)
1701 def detect_exe_version(output, version_re=None, unrecognized='present'):
1702 assert isinstance(output, compat_str)
1703 if version_re is None:
1704 version_re = r'version\s+([-0-9._a-zA-Z]+)'
1705 m = re.search(version_re, output)
1712 class PagedList(object):
1714 # This is only useful for tests
1715 return len(self.getslice())
1718 class OnDemandPagedList(PagedList):
1719 def __init__(self, pagefunc, pagesize, use_cache=False):
1720 self._pagefunc = pagefunc
1721 self._pagesize = pagesize
1722 self._use_cache = use_cache
1726 def getslice(self, start=0, end=None):
1728 for pagenum in itertools.count(start // self._pagesize):
1729 firstid = pagenum * self._pagesize
1730 nextfirstid = pagenum * self._pagesize + self._pagesize
1731 if start >= nextfirstid:
1736 page_results = self._cache.get(pagenum)
1737 if page_results is None:
1738 page_results = list(self._pagefunc(pagenum))
1740 self._cache[pagenum] = page_results
1743 start % self._pagesize
1744 if firstid <= start < nextfirstid
1748 ((end - 1) % self._pagesize) + 1
1749 if (end is not None and firstid <= end <= nextfirstid)
1752 if startv != 0 or endv is not None:
1753 page_results = page_results[startv:endv]
1754 res.extend(page_results)
1756 # A little optimization - if current page is not "full", ie. does
1757 # not contain page_size videos then we can assume that this page
1758 # is the last one - there are no more ids on further pages -
1759 # i.e. no need to query again.
1760 if len(page_results) + startv < self._pagesize:
1763 # If we got the whole page, but the next page is not interesting,
1764 # break out early as well
1765 if end == nextfirstid:
1770 class InAdvancePagedList(PagedList):
1771 def __init__(self, pagefunc, pagecount, pagesize):
1772 self._pagefunc = pagefunc
1773 self._pagecount = pagecount
1774 self._pagesize = pagesize
1776 def getslice(self, start=0, end=None):
1778 start_page = start // self._pagesize
1780 self._pagecount if end is None else (end // self._pagesize + 1))
1781 skip_elems = start - start_page * self._pagesize
1782 only_more = None if end is None else end - start
1783 for pagenum in range(start_page, end_page):
1784 page = list(self._pagefunc(pagenum))
1786 page = page[skip_elems:]
1788 if only_more is not None:
1789 if len(page) < only_more:
1790 only_more -= len(page)
1792 page = page[:only_more]
1799 def uppercase_escape(s):
1800 unicode_escape = codecs.getdecoder('unicode_escape')
1802 r'\\U[0-9a-fA-F]{8}',
1803 lambda m: unicode_escape(m.group(0))[0],
1807 def lowercase_escape(s):
1808 unicode_escape = codecs.getdecoder('unicode_escape')
1810 r'\\u[0-9a-fA-F]{4}',
1811 lambda m: unicode_escape(m.group(0))[0],
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # Characters that must survive quoting untouched.
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    # On Python 2, quote() needs a byte string.
    if sys.version_info < (3, 0) and isinstance(s, compat_str):
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, safe_chars)
1822 def escape_url(url):
1823 """Escape URL as suggested by RFC 3986"""
1824 url_parsed = compat_urllib_parse_urlparse(url)
1825 return url_parsed._replace(
1826 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
1827 path=escape_rfc3986(url_parsed.path),
1828 params=escape_rfc3986(url_parsed.params),
1829 query=escape_rfc3986(url_parsed.query),
1830 fragment=escape_rfc3986(url_parsed.fragment)
1834 def read_batch_urls(batch_fd):
1836 if not isinstance(url, compat_str):
1837 url = url.decode('utf-8', 'replace')
1838 BOM_UTF8 = '\xef\xbb\xbf'
1839 if url.startswith(BOM_UTF8):
1840 url = url[len(BOM_UTF8):]
1842 if url.startswith(('#', ';', ']')):
1846 with contextlib.closing(batch_fd) as fd:
1847 return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes."""
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
1854 def update_url_query(url, query):
1857 parsed_url = compat_urlparse.urlparse(url)
1858 qs = compat_parse_qs(parsed_url.query)
1860 return compat_urlparse.urlunparse(parsed_url._replace(
1861 query=compat_urllib_parse_urlencode(qs, True)))
1864 def update_Request(req, url=None, data=None, headers={}, query={}):
1865 req_headers = req.headers.copy()
1866 req_headers.update(headers)
1867 req_data = data or req.data
1868 req_url = update_url_query(url or req.get_full_url(), query)
1869 req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
1871 req_url, data=req_data, headers=req_headers,
1872 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
1873 if hasattr(req, 'timeout'):
1874 new_req.timeout = req.timeout
1878 def dict_get(d, key_or_keys, default=None, skip_false_values=True):
1879 if isinstance(key_or_keys, (list, tuple)):
1880 for key in key_or_keys:
1881 if key not in d or d[key] is None or skip_false_values and not d[key]:
1885 return d.get(key_or_keys, default)
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Decode *string* into a compat_str unless it already is one.

    NOTE: the default *encoding* is evaluated once, at definition time.
    """
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
1901 def parse_age_limit(s):
1904 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
1905 return int(m.group('age')) if m else US_RATINGS.get(s)
1908 def strip_jsonp(code):
1910 r'(?s)^[a-zA-Z0-9_.]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
1913 def js_to_json(code):
1916 if v in ('true', 'false', 'null'):
1918 elif v.startswith('/*') or v == ',':
1921 if v[0] in ("'", '"'):
1922 v = re.sub(r'(?s)\\.|"', lambda m: {
1927 }.get(m.group(0), m.group(0)), v[1:-1])
1930 (r'^0[xX][0-9a-fA-F]+', 16),
1934 for regex, base in INTEGER_TABLE:
1935 im = re.match(regex, v)
1937 i = int(im.group(0), base)
1938 return '"%d":' % i if v.endswith(':') else '%d' % i
1942 return re.sub(r'''(?sx)
1943 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
1944 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
1945 /\*.*?\*/|,(?=\s*[\]}])|
1946 [a-zA-Z_][.a-zA-Z_0-9]*|
1947 (?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:\s*:)?|
1952 def qualities(quality_ids):
1953 """ Get a numeric quality value out of a list of possible values """
1956 return quality_ids.index(qid)
1962 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1965 def limit_length(s, length):
1966 """ Add ellipses to overly long strings """
1971 return s[:length - len(ELLIPSES)] + ELLIPSES
def version_tuple(v):
    """Split a version string on '.' and '-' into a tuple of ints."""
    parts = re.split(r'[-.]', v)
    return tuple(map(int, parts))
1979 def is_outdated_version(version, limit, assume_new=True):
1981 return not assume_new
1983 return version_tuple(version) < version_tuple(limit)
1985 return not assume_new
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # Updatable when running from a zip bundle or a frozen executable.
    running_from_zip = isinstance(globals().get('__loader__'), zipimporter)
    return running_from_zip or hasattr(sys, 'frozen')
def args_to_str(args):
    # Get a short string representation for a subprocess command
    quoted = (compat_shlex_quote(a) for a in args)
    return ' '.join(quoted)
2000 def error_to_compat_str(err):
2002 # On python 2 error byte string must be decoded with proper
2003 # encoding rather than ascii
2004 if sys.version_info[0] < 3:
2005 err_str = err_str.decode(preferredencoding())
2009 def mimetype2ext(mt):
2019 _, _, res = mt.rpartition('/')
2023 'smptett+xml': 'tt',
2029 'x-mp4-fragmented': 'mp4',
2034 def urlhandle_detect_ext(url_handle):
2035 getheader = url_handle.headers.get
2037 cd = getheader('Content-Disposition')
2039 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
2041 e = determine_ext(m.group('filename'), default_ext=None)
2045 return mimetype2ext(getheader('Content-Type'))
def encode_data_uri(data, mime_type):
    """Build an RFC 2397 data: URI from raw bytes and a MIME type."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    if age_limit is None:  # No limit set
    if content_limit is None:
        return False  # Content available for everyone
    # Block when the viewer's limit is below the content's required age.
    return age_limit < content_limit
2062 def is_html(first_bytes):
2063 """ Detect whether a file contains HTML by examining its first bytes. """
2066 (b'\xef\xbb\xbf', 'utf-8'),
2067 (b'\x00\x00\xfe\xff', 'utf-32-be'),
2068 (b'\xff\xfe\x00\x00', 'utf-32-le'),
2069 (b'\xff\xfe', 'utf-16-le'),
2070 (b'\xfe\xff', 'utf-16-be'),
2072 for bom, enc in BOMS:
2073 if first_bytes.startswith(bom):
2074 s = first_bytes[len(bom):].decode(enc, 'replace')
2077 s = first_bytes.decode('utf-8', 'replace')
2079 return re.match(r'^\s*<', s)
2082 def determine_protocol(info_dict):
2083 protocol = info_dict.get('protocol')
2084 if protocol is not None:
2087 url = info_dict['url']
2088 if url.startswith('rtmp'):
2090 elif url.startswith('mms'):
2092 elif url.startswith('rtsp'):
2095 ext = determine_ext(url)
2101 return compat_urllib_parse_urlparse(url).scheme
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    table = [header_row] + data
    # Width of each column is the longest stringified cell in it.
    widths = []
    for col in zip(*table):
        widths.append(max(len(compat_str(cell)) for cell in col))
    # Left-align and pad every column except the last.
    fmt = ' '.join('%-' + compat_str(w + 1) + 's' for w in widths[:-1]) + '%s'
    return '\n'.join(fmt % tuple(row) for row in table)
2112 def _match_one(filter_part, dct):
2113 COMPARISON_OPERATORS = {
2121 operator_rex = re.compile(r'''(?x)\s*
2123 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
2125 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
2126 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
2129 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
2130 m = operator_rex.search(filter_part)
2132 op = COMPARISON_OPERATORS[m.group('op')]
2133 if m.group('strval') is not None:
2134 if m.group('op') not in ('=', '!='):
2136 'Operator %s does not support string values!' % m.group('op'))
2137 comparison_value = m.group('strval')
2140 comparison_value = int(m.group('intval'))
2142 comparison_value = parse_filesize(m.group('intval'))
2143 if comparison_value is None:
2144 comparison_value = parse_filesize(m.group('intval') + 'B')
2145 if comparison_value is None:
2147 'Invalid integer value %r in filter part %r' % (
2148 m.group('intval'), filter_part))
2149 actual_value = dct.get(m.group('key'))
2150 if actual_value is None:
2151 return m.group('none_inclusive')
2152 return op(actual_value, comparison_value)
2155 '': lambda v: v is not None,
2156 '!': lambda v: v is None,
2158 operator_rex = re.compile(r'''(?x)\s*
2159 (?P<op>%s)\s*(?P<key>[a-z_]+)
2161 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
2162 m = operator_rex.search(filter_part)
2164 op = UNARY_OPERATORS[m.group('op')]
2165 actual_value = dct.get(m.group('key'))
2166 return op(actual_value)
2168 raise ValueError('Invalid filter part %r' % filter_part)
2171 def match_str(filter_str, dct):
2172 """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
2175 _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
2178 def match_filter_func(filter_str):
2179 def _match_func(info_dict):
2180 if match_str(filter_str, info_dict):
2183 video_title = info_dict.get('title', info_dict.get('id', 'video'))
2184 return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
2188 def parse_dfxp_time_expr(time_expr):
2192 mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
2194 return float(mobj.group('time_offset'))
2196 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
2198 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
def srt_subtitles_timecode(seconds):
    """Format a duration in seconds as an SRT timecode: HH:MM:SS,mmm."""
    hours = seconds / 3600
    minutes = (seconds % 3600) / 60
    secs = seconds % 60
    millis = (seconds % 1) * 1000
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
2205 def dfxp2srt(dfxp_data):
2206 _x = functools.partial(xpath_with_ns, ns_map={
2207 'ttml': 'http://www.w3.org/ns/ttml',
2208 'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
2209 'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',
2212 class TTMLPElementParser(object):
2215 def start(self, tag, attrib):
2216 if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
2222 def data(self, data):
2226 return self.out.strip()
2228 def parse_node(node):
2229 target = TTMLPElementParser()
2230 parser = xml.etree.ElementTree.XMLParser(target=target)
2231 parser.feed(xml.etree.ElementTree.tostring(node))
2232 return parser.close()
2234 dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
2236 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall(_x('.//ttaf1_0604:p')) or dfxp.findall('.//p')
2239 raise ValueError('Invalid dfxp/TTML subtitle')
2241 for para, index in zip(paras, itertools.count(1)):
2242 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
2243 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
2244 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
2245 if begin_time is None:
2250 end_time = begin_time + dur
2251 out.append('%d\n%s --> %s\n%s\n\n' % (
2253 srt_subtitles_timecode(begin_time),
2254 srt_subtitles_timecode(end_time),
def cli_option(params, command_option, param):
    """Return [command_option, value] when params[param] is set, else []."""
    value = params.get(param)
    if value is None:
        return []
    return [command_option, value]
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    # Map a boolean param onto a CLI flag with an explicit true/false value.
    param = params.get(param)
    assert isinstance(param, bool)
    # With a separator, the option and its value form one argv entry
    # ('--opt=true'); otherwise they are two separate entries.
    return [command_option + separator + (true_value if param else false_value)]
    return [command_option, true_value if param else false_value]
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return [command_option] when params[param] equals expected_value, else []."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
2278 def cli_configuration_args(params, param, default=[]):
2279 ex_args = params.get(param)
2282 assert isinstance(ex_args, list)
2286 class ISO639Utils(object):
2287 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
2476 def short2long(cls, code):
2477 """Convert language code from ISO 639-1 to ISO 639-2/T"""
2478 return cls._lang_map.get(code[:2])
2481 def long2short(cls, code):
2482 """Convert language code from ISO 639-2/T to ISO 639-1"""
2483 for short_name, long_name in cls._lang_map.items():
2484 if long_name == code:
2488 class ISO3166Utils(object):
2489 # From http://data.okfn.org/data/core/country-list
2491 'AF': 'Afghanistan',
2492 'AX': 'Åland Islands',
2495 'AS': 'American Samoa',
2500 'AG': 'Antigua and Barbuda',
2517 'BO': 'Bolivia, Plurinational State of',
2518 'BQ': 'Bonaire, Sint Eustatius and Saba',
2519 'BA': 'Bosnia and Herzegovina',
2521 'BV': 'Bouvet Island',
2523 'IO': 'British Indian Ocean Territory',
2524 'BN': 'Brunei Darussalam',
2526 'BF': 'Burkina Faso',
2532 'KY': 'Cayman Islands',
2533 'CF': 'Central African Republic',
2537 'CX': 'Christmas Island',
2538 'CC': 'Cocos (Keeling) Islands',
2542 'CD': 'Congo, the Democratic Republic of the',
2543 'CK': 'Cook Islands',
2545 'CI': 'Côte d\'Ivoire',
2550 'CZ': 'Czech Republic',
2554 'DO': 'Dominican Republic',
2557 'SV': 'El Salvador',
2558 'GQ': 'Equatorial Guinea',
2562 'FK': 'Falkland Islands (Malvinas)',
2563 'FO': 'Faroe Islands',
2567 'GF': 'French Guiana',
2568 'PF': 'French Polynesia',
2569 'TF': 'French Southern Territories',
2584 'GW': 'Guinea-Bissau',
2587 'HM': 'Heard Island and McDonald Islands',
2588 'VA': 'Holy See (Vatican City State)',
2595 'IR': 'Iran, Islamic Republic of',
2598 'IM': 'Isle of Man',
2608 'KP': 'Korea, Democratic People\'s Republic of',
2609 'KR': 'Korea, Republic of',
2612 'LA': 'Lao People\'s Democratic Republic',
2618 'LI': 'Liechtenstein',
2622 'MK': 'Macedonia, the Former Yugoslav Republic of',
2629 'MH': 'Marshall Islands',
2635 'FM': 'Micronesia, Federated States of',
2636 'MD': 'Moldova, Republic of',
2647 'NL': 'Netherlands',
2648 'NC': 'New Caledonia',
2649 'NZ': 'New Zealand',
2654 'NF': 'Norfolk Island',
2655 'MP': 'Northern Mariana Islands',
2660 'PS': 'Palestine, State of',
2662 'PG': 'Papua New Guinea',
2665 'PH': 'Philippines',
2669 'PR': 'Puerto Rico',
2673 'RU': 'Russian Federation',
2675 'BL': 'Saint Barthélemy',
2676 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
2677 'KN': 'Saint Kitts and Nevis',
2678 'LC': 'Saint Lucia',
2679 'MF': 'Saint Martin (French part)',
2680 'PM': 'Saint Pierre and Miquelon',
2681 'VC': 'Saint Vincent and the Grenadines',
2684 'ST': 'Sao Tome and Principe',
2685 'SA': 'Saudi Arabia',
2689 'SL': 'Sierra Leone',
2691 'SX': 'Sint Maarten (Dutch part)',
2694 'SB': 'Solomon Islands',
2696 'ZA': 'South Africa',
2697 'GS': 'South Georgia and the South Sandwich Islands',
2698 'SS': 'South Sudan',
2703 'SJ': 'Svalbard and Jan Mayen',
2706 'CH': 'Switzerland',
2707 'SY': 'Syrian Arab Republic',
2708 'TW': 'Taiwan, Province of China',
2710 'TZ': 'Tanzania, United Republic of',
2712 'TL': 'Timor-Leste',
2716 'TT': 'Trinidad and Tobago',
2719 'TM': 'Turkmenistan',
2720 'TC': 'Turks and Caicos Islands',
2724 'AE': 'United Arab Emirates',
2725 'GB': 'United Kingdom',
2726 'US': 'United States',
2727 'UM': 'United States Minor Outlying Islands',
2731 'VE': 'Venezuela, Bolivarian Republic of',
2733 'VG': 'Virgin Islands, British',
2734 'VI': 'Virgin Islands, U.S.',
2735 'WF': 'Wallis and Futuna',
2736 'EH': 'Western Sahara',
2743 def short2full(cls, code):
2744 """Convert an ISO 3166-2 country code to the corresponding full name"""
2745 return cls._country_map.get(code.upper())
2748 class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
2749 def __init__(self, proxies=None):
2750 # Set default handlers
2751 for type in ('http', 'https'):
2752 setattr(self, '%s_open' % type,
2753 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
2754 meth(r, proxy, type))
2755 return compat_urllib_request.ProxyHandler.__init__(self, proxies)
2757 def proxy_open(self, req, proxy, type):
2758 req_proxy = req.headers.get('Ytdl-request-proxy')
2759 if req_proxy is not None:
2761 del req.headers['Ytdl-request-proxy']
2763 if proxy == '__noproxy__':
2764 return None # No Proxy
2765 if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
2766 req.add_header('Ytdl-socks-proxy', proxy)
2767 # youtube-dl's http/https handlers do wrapping the socket with socks
2769 return compat_urllib_request.ProxyHandler.proxy_open(
2770 self, req, proxy, type)
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''
    # The payload is interpreted little-endian, hence the byte reversal.
    reversed_hex = binascii.hexlify(data[::-1])
    payload = int(reversed_hex, 16)
    return '%x' % pow(payload, exponent, modulus)
2790 def encode_base_n(num, n, table=None):
2791 FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
2793 table = FULL_TABLE[:n]
2796 raise ValueError('base %d exceeds table length %d' % (n, len(table)))
2803 ret = table[num % n] + ret
2808 def decode_packed_codes(code):
2810 r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)",
2812 obfucasted_code, base, count, symbols = mobj.groups()
2815 symbols = symbols.split('|')
2820 base_n_count = encode_base_n(count, base)
2821 symbol_table[base_n_count] = symbols[count] or base_n_count
2824 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],