2 # -*- coding: utf-8 -*-
4 from __future__ import unicode_literals
33 import xml.etree.ElementTree
40 compat_etree_fromstring,
46 compat_socket_create_connection,
51 compat_urllib_parse_urlencode,
52 compat_urllib_parse_urlparse,
53 compat_urllib_parse_unquote_plus,
54 compat_urllib_request,
def register_socks_protocols():
    """Teach urlparse that SOCKS URLs carry a network location.

    Workaround for https://bugs.python.org/issue7904 (Python < 2.6.5):
    urlsplit() mishandles URLs whose scheme is not listed in
    urlparse.uses_netloc, so each SOCKS scheme is appended on demand.
    """
    for proto in ('socks', 'socks4', 'socks4a', 'socks5'):
        if proto not in compat_urlparse.uses_netloc:
            compat_urlparse.uses_netloc.append(proto)
74 # This is not clearly defined otherwise
75 compiled_regex_type = type(re.compile(''))
78 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/44.0 (Chrome)',
79 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
80 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
81 'Accept-Encoding': 'gzip, deflate',
82 'Accept-Language': 'en-us,en;q=0.5',
88 ENGLISH_MONTH_NAMES = [
89 'January', 'February', 'March', 'April', 'May', 'June',
90 'July', 'August', 'September', 'October', 'November', 'December']
93 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
94 'flv', 'f4v', 'f4a', 'f4b',
95 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
105 'f4f', 'f4m', 'm3u8', 'smil')
107 # needed for sanitizing filenames in restricted mode
108 ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØŒÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøœùúûüýþÿ',
109 itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOO', ['OE'], 'UUUUYP', ['ss'],
110 'aaaaaa', ['ae'], 'ceeeeiiiionoooooo', ['oe'], 'uuuuypy')))
113 def preferredencoding():
114 """Get preferred encoding.
116 Returns the best encoding scheme for the system, based on
117 locale.getpreferredencoding() and some further tweaks.
120 pref = locale.getpreferredencoding()
128 def write_json_file(obj, fn):
129 """ Encode obj as JSON and write it to fn, atomically if possible """
131 fn = encodeFilename(fn)
132 if sys.version_info < (3, 0) and sys.platform != 'win32':
133 encoding = get_filesystem_encoding()
134 # os.path.basename returns a bytes object, but NamedTemporaryFile
135 # will fail if the filename contains non ascii characters unless we
136 # use a unicode object
137 path_basename = lambda f: os.path.basename(fn).decode(encoding)
138 # the same for os.path.dirname
139 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
141 path_basename = os.path.basename
142 path_dirname = os.path.dirname
146 'prefix': path_basename(fn) + '.',
147 'dir': path_dirname(fn),
151 # In Python 2.x, json.dump expects a bytestream.
152 # In Python 3.x, it writes to a character stream
153 if sys.version_info < (3, 0):
161 tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
166 if sys.platform == 'win32':
167 # Need to remove existing file on Windows, else os.rename raises
168 # WindowsError or FileExistsError.
173 os.rename(tf.name, fn)
182 if sys.version_info >= (2, 7):
183 def find_xpath_attr(node, xpath, key, val=None):
184 """ Find the xpath xpath[@key=val] """
185 assert re.match(r'^[a-zA-Z_-]+$', key)
186 expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
187 return node.find(expr)
189 def find_xpath_attr(node, xpath, key, val=None):
190 for f in node.findall(compat_xpath(xpath)):
191 if key not in f.attrib:
193 if val is None or f.attrib.get(key) == val:
197 # On python2.6 the xml.etree.ElementTree.Element methods don't support
198 # the namespace parameter
201 def xpath_with_ns(path, ns_map):
202 components = [c.split(':') for c in path.split('/')]
206 replaced.append(c[0])
209 replaced.append('{%s}%s' % (ns_map[ns], tag))
210 return '/'.join(replaced)
213 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
214 def _find_xpath(xpath):
215 return node.find(compat_xpath(xpath))
217 if isinstance(xpath, (str, compat_str)):
218 n = _find_xpath(xpath)
226 if default is not NO_DEFAULT:
229 name = xpath if name is None else name
230 raise ExtractorError('Could not find XML element %s' % name)
236 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
237 n = xpath_element(node, xpath, name, fatal=fatal, default=default)
238 if n is None or n == default:
241 if default is not NO_DEFAULT:
244 name = xpath if name is None else name
245 raise ExtractorError('Could not find XML element\'s text %s' % name)
251 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
252 n = find_xpath_attr(node, xpath, key)
254 if default is not NO_DEFAULT:
257 name = '%s[@%s]' % (xpath, key) if name is None else name
258 raise ExtractorError('Could not find XML attribute %s' % name)
def get_element_by_id(id, html):
    """Return the inner content of the tag whose id attribute equals *id*."""
    # An id lookup is just an attribute lookup specialised to 'id'.
    return get_element_by_attribute('id', id, html)
269 def get_element_by_attribute(attribute, value, html):
270 """Return the content of the tag with the specified attribute in the passed HTML document"""
272 m = re.search(r'''(?xs)
274 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
276 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
280 ''' % (re.escape(attribute), re.escape(value)), html)
284 res = m.group('content')
286 if res.startswith('"') or res.startswith("'"):
289 return unescapeHTML(res)
292 class HTMLAttributeParser(compat_HTMLParser):
293 """Trivial HTML parser to gather the attributes for a single element"""
296 compat_HTMLParser.__init__(self)
298 def handle_starttag(self, tag, attrs):
299 self.attrs = dict(attrs)
302 def extract_attributes(html_element):
303 """Given a string for an HTML element such as
305 a="foo" B="bar" c="&98;az" d=boz
306 empty= noval entity="&"
309 Decode and return a dictionary of attributes.
311 'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
312 'empty': '', 'noval': None, 'entity': '&',
313 'sq': '"', 'dq': '\''
315 NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
316 but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
318 parser = HTMLAttributeParser()
319 parser.feed(html_element)
324 def clean_html(html):
325 """Clean an HTML snippet into a readable string"""
327 if html is None: # Convenience for sanitizing descriptions etc.
331 html = html.replace('\n', ' ')
332 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
333 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
335 html = re.sub('<.*?>', '', html)
336 # Replace html entities
337 html = unescapeHTML(html)
341 def sanitize_open(filename, open_mode):
342 """Try to open the given filename, and slightly tweak it if this fails.
344 Attempts to open the given filename. If this fails, it tries to change
345 the filename slightly, step by step, until it's either able to open it
346 or it fails and raises a final exception, like the standard open()
349 It returns the tuple (stream, definitive_file_name).
353 if sys.platform == 'win32':
355 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
356 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
357 stream = open(encodeFilename(filename), open_mode)
358 return (stream, filename)
359 except (IOError, OSError) as err:
360 if err.errno in (errno.EACCES,):
363 # In case of error, try to remove win32 forbidden chars
364 alt_filename = sanitize_path(filename)
365 if alt_filename == filename:
368 # An exception here should be caught in the caller
369 stream = open(encodeFilename(alt_filename), open_mode)
370 return (stream, alt_filename)
373 def timeconvert(timestr):
374 """Convert RFC 2822 defined time string into system timestamp"""
376 timetuple = email.utils.parsedate_tz(timestr)
377 if timetuple is not None:
378 timestamp = email.utils.mktime_tz(timetuple)
382 def sanitize_filename(s, restricted=False, is_id=False):
383 """Sanitizes a string so it could be used as part of a filename.
384 If restricted is set, use a stricter subset of allowed characters.
385 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
387 def replace_insane(char):
388 if restricted and char in ACCENT_CHARS:
389 return ACCENT_CHARS[char]
390 if char == '?' or ord(char) < 32 or ord(char) == 127:
393 return '' if restricted else '\''
395 return '_-' if restricted else ' -'
396 elif char in '\\/|*<>':
398 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
400 if restricted and ord(char) > 127:
405 s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
406 result = ''.join(map(replace_insane, s))
408 while '__' in result:
409 result = result.replace('__', '_')
410 result = result.strip('_')
411 # Common case of "Foreign band name - English song title"
412 if restricted and result.startswith('-_'):
414 if result.startswith('-'):
415 result = '_' + result[len('-'):]
416 result = result.lstrip('.')
422 def sanitize_path(s):
423 """Sanitizes and normalizes path on Windows"""
424 if sys.platform != 'win32':
426 drive_or_unc, _ = os.path.splitdrive(s)
427 if sys.version_info < (2, 7) and not drive_or_unc:
428 drive_or_unc, _ = os.path.splitunc(s)
429 norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
433 path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part)
434 for path_part in norm_path]
436 sanitized_path.insert(0, drive_or_unc + os.path.sep)
437 return os.path.join(*sanitized_path)
440 # Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
441 # unwanted failures due to missing protocol
def sanitize_url(url):
    """Prefix scheme-relative URLs ('//host/...') with 'http:'.

    Mitigates failures on protocol-less URLs; any other URL is returned
    unchanged.
    """
    if url.startswith('//'):
        return 'http:%s' % url
    return url
def sanitized_Request(url, *args, **kwargs):
    """Build a compat_urllib_request.Request, sanitizing *url* first."""
    # Delegate every other argument untouched; only the URL is rewritten.
    return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs)
450 def orderedSet(iterable):
451 """ Remove all duplicates from the input iterable """
459 def _htmlentity_transform(entity):
460 """Transforms an HTML entity to a character."""
461 # Known non-numeric HTML entity
462 if entity in compat_html_entities.name2codepoint:
463 return compat_chr(compat_html_entities.name2codepoint[entity])
465 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
467 numstr = mobj.group(1)
468 if numstr.startswith('x'):
470 numstr = '0%s' % numstr
473 # See https://github.com/rg3/youtube-dl/issues/7518
475 return compat_chr(int(numstr, base))
479 # Unknown entity in name, return its literal representation
480 return '&%s;' % entity
486 assert type(s) == compat_str
489 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
492 def get_subprocess_encoding():
493 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
494 # For subprocess calls, encode with locale encoding
495 # Refer to http://stackoverflow.com/a/9951851/35070
496 encoding = preferredencoding()
498 encoding = sys.getfilesystemencoding()
504 def encodeFilename(s, for_subprocess=False):
506 @param s The name of the file
509 assert type(s) == compat_str
511 # Python 3 has a Unicode API
512 if sys.version_info >= (3, 0):
515 # Pass '' directly to use Unicode APIs on Windows 2000 and up
516 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
517 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
518 if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
521 # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
522 if sys.platform.startswith('java'):
525 return s.encode(get_subprocess_encoding(), 'ignore')
528 def decodeFilename(b, for_subprocess=False):
530 if sys.version_info >= (3, 0):
533 if not isinstance(b, bytes):
536 return b.decode(get_subprocess_encoding(), 'ignore')
539 def encodeArgument(s):
540 if not isinstance(s, compat_str):
541 # Legacy code that uses byte strings
542 # Uncomment the following line after fixing all post processors
543 # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
544 s = s.decode('ascii')
545 return encodeFilename(s, True)
548 def decodeArgument(b):
549 return decodeFilename(b, True)
552 def decodeOption(optval):
555 if isinstance(optval, bytes):
556 optval = optval.decode(preferredencoding())
558 assert isinstance(optval, compat_str)
562 def formatSeconds(secs):
564 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
566 return '%d:%02d' % (secs // 60, secs % 60)
571 def make_HTTPS_handler(params, **kwargs):
572 opts_no_check_certificate = params.get('nocheckcertificate', False)
573 if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9
574 context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
575 if opts_no_check_certificate:
576 context.check_hostname = False
577 context.verify_mode = ssl.CERT_NONE
579 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
582 # (create_default_context present but HTTPSHandler has no context=)
585 if sys.version_info < (3, 2):
586 return YoutubeDLHTTPSHandler(params, **kwargs)
588 context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
589 context.verify_mode = (ssl.CERT_NONE
590 if opts_no_check_certificate
591 else ssl.CERT_REQUIRED)
592 context.set_default_verify_paths()
593 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
596 def bug_reports_message():
597 if ytdl_is_updateable():
598 update_cmd = 'type youtube-dl -U to update'
600 update_cmd = 'see https://yt-dl.org/update on how to update'
601 msg = '; please report this issue on https://yt-dl.org/bug .'
602 msg += ' Make sure you are using the latest version; %s.' % update_cmd
603 msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
607 class ExtractorError(Exception):
608 """Error during info extraction."""
610 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
611 """ tb, if given, is the original traceback (so that it can be printed out).
612 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
615 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
617 if video_id is not None:
618 msg = video_id + ': ' + msg
620 msg += ' (caused by %r)' % cause
622 msg += bug_reports_message()
623 super(ExtractorError, self).__init__(msg)
626 self.exc_info = sys.exc_info() # preserve original exception
628 self.video_id = video_id
630 def format_traceback(self):
631 if self.traceback is None:
633 return ''.join(traceback.format_tb(self.traceback))
class UnsupportedError(ExtractorError):
    """Raised when no extractor can handle the given URL."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        # expected=True: this is a normal condition, not a youtube-dl bug.
        super(UnsupportedError, self).__init__(message, expected=True)
643 class RegexNotFoundError(ExtractorError):
644 """Error when a regex didn't match"""
class DownloadError(Exception):
    """Download Error exception.

    Raised by FileDownloader objects when they are not configured to
    continue on errors; carries the appropriate error message.
    """

    def __init__(self, msg, exc_info=None):
        """exc_info, if given, is the original sys.exc_info() triple that caused the trouble."""
        super(DownloadError, self).__init__(msg)
        # Preserved so verbose error reporting can show the root cause.
        self.exc_info = exc_info
class SameFileError(Exception):
    """Same File exception.

    Raised by FileDownloader objects when they detect that multiple
    files would have to be written to the same file on disk.
    """
671 class PostProcessingError(Exception):
672 """Post Processing exception.
674 This exception may be raised by PostProcessor's .run() method to
675 indicate an error in the postprocessing task.
678 def __init__(self, msg):
682 class MaxDownloadsReached(Exception):
683 """ --max-downloads limit has been reached. """
687 class UnavailableVideoError(Exception):
688 """Unavailable Format exception.
690 This exception will be thrown when a video is requested
691 in a format that is not available for that video.
class ContentTooShortError(Exception):
    """Content Too Short exception.

    Raised by FileDownloader objects when a downloaded file is smaller
    than the size the server announced, which indicates the connection
    was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Both values are byte counts.
        self.downloaded = downloaded
        self.expected = expected
710 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
711 # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
712 # expected HTTP responses to meet HTTP/1.0 or later (see also
713 # https://github.com/rg3/youtube-dl/issues/6727)
714 if sys.version_info < (3, 0):
715 kwargs[b'strict'] = True
716 hc = http_class(*args, **kwargs)
717 source_address = ydl_handler._params.get('source_address')
718 if source_address is not None:
719 sa = (source_address, 0)
720 if hasattr(hc, 'source_address'): # Python 2.7+
721 hc.source_address = sa
723 def _hc_connect(self, *args, **kwargs):
724 sock = compat_socket_create_connection(
725 (self.host, self.port), self.timeout, sa)
727 self.sock = ssl.wrap_socket(
728 sock, self.key_file, self.cert_file,
729 ssl_version=ssl.PROTOCOL_TLSv1)
732 hc.connect = functools.partial(_hc_connect, hc)
def handle_youtubedl_headers(headers):
    """Strip internal 'Youtubedl-*' pseudo headers before a real request.

    If 'Youtubedl-no-compression' is present, remove every
    'Accept-Encoding' header (compared case-insensitively) along with
    the pseudo header itself; otherwise the mapping is returned as-is.
    """
    result = headers
    if 'Youtubedl-no-compression' in result:
        # dict((k, v) ...) rather than a dict comprehension: the file
        # keeps Python 2.6 compatibility elsewhere.
        filtered = dict((name, value) for name, value in result.items()
                        if name.lower() != 'accept-encoding')
        del filtered['Youtubedl-no-compression']
        result = filtered
    return result
747 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
748 """Handler for HTTP requests and responses.
750 This class, when installed with an OpenerDirector, automatically adds
751 the standard headers to every HTTP request and handles gzipped and
752 deflated responses from web servers. If compression is to be avoided in
753 a particular request, the original request in the program code only has
754 to include the HTTP header "Youtubedl-no-compression", which will be
755 removed before making the real request.
757 Part of this code was copied from:
759 http://techknack.net/python-urllib2-handlers/
761 Andrew Rowls, the author of that code, agreed to release it to the
765 def __init__(self, params, *args, **kwargs):
766 compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
767 self._params = params
769 def http_open(self, req):
770 conn_class = compat_http_client.HTTPConnection
772 socks_proxy = req.headers.get('Ytdl-socks-proxy')
774 conn_class = make_socks_conn_class(conn_class, socks_proxy)
775 del req.headers['Ytdl-socks-proxy']
777 return self.do_open(functools.partial(
778 _create_http_connection, self, conn_class, False),
784 return zlib.decompress(data, -zlib.MAX_WBITS)
786 return zlib.decompress(data)
789 def addinfourl_wrapper(stream, headers, url, code):
790 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
791 return compat_urllib_request.addinfourl(stream, headers, url, code)
792 ret = compat_urllib_request.addinfourl(stream, headers, url)
796 def http_request(self, req):
797 # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
798 # always respected by websites, some tend to give out URLs with non percent-encoded
799 # non-ASCII characters (see telemb.py, ard.py [#3412])
800 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
801 # To work around aforementioned issue we will replace request's original URL with
802 # percent-encoded one
803 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
804 # the code of this workaround has been moved here from YoutubeDL.urlopen()
805 url = req.get_full_url()
806 url_escaped = escape_url(url)
808 # Substitute URL if any change after escaping
809 if url != url_escaped:
810 req = update_Request(req, url=url_escaped)
812 for h, v in std_headers.items():
813 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
814 # The dict keys are capitalized because of this bug by urllib
815 if h.capitalize() not in req.headers:
818 req.headers = handle_youtubedl_headers(req.headers)
820 if sys.version_info < (2, 7) and '#' in req.get_full_url():
821 # Python 2.6 is brain-dead when it comes to fragments
822 req._Request__original = req._Request__original.partition('#')[0]
823 req._Request__r_type = req._Request__r_type.partition('#')[0]
827 def http_response(self, req, resp):
830 if resp.headers.get('Content-encoding', '') == 'gzip':
831 content = resp.read()
832 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
834 uncompressed = io.BytesIO(gz.read())
835 except IOError as original_ioerror:
836 # There may be junk add the end of the file
837 # See http://stackoverflow.com/q/4928560/35070 for details
838 for i in range(1, 1024):
840 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
841 uncompressed = io.BytesIO(gz.read())
846 raise original_ioerror
847 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
848 resp.msg = old_resp.msg
849 del resp.headers['Content-encoding']
851 if resp.headers.get('Content-encoding', '') == 'deflate':
852 gz = io.BytesIO(self.deflate(resp.read()))
853 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
854 resp.msg = old_resp.msg
855 del resp.headers['Content-encoding']
856 # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
857 # https://github.com/rg3/youtube-dl/issues/6457).
858 if 300 <= resp.code < 400:
859 location = resp.headers.get('Location')
861 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
862 if sys.version_info >= (3, 0):
863 location = location.encode('iso-8859-1').decode('utf-8')
864 location_escaped = escape_url(location)
865 if location != location_escaped:
866 del resp.headers['Location']
867 resp.headers['Location'] = location_escaped
870 https_request = http_request
871 https_response = http_response
874 def make_socks_conn_class(base_class, socks_proxy):
875 assert issubclass(base_class, (
876 compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))
878 url_components = compat_urlparse.urlparse(socks_proxy)
879 if url_components.scheme.lower() == 'socks5':
880 socks_type = ProxyType.SOCKS5
881 elif url_components.scheme.lower() in ('socks', 'socks4'):
882 socks_type = ProxyType.SOCKS4
883 elif url_components.scheme.lower() == 'socks4a':
884 socks_type = ProxyType.SOCKS4A
886 def unquote_if_non_empty(s):
889 return compat_urllib_parse_unquote_plus(s)
893 url_components.hostname, url_components.port or 1080,
895 unquote_if_non_empty(url_components.username),
896 unquote_if_non_empty(url_components.password),
899 class SocksConnection(base_class):
901 self.sock = sockssocket()
902 self.sock.setproxy(*proxy_args)
903 if type(self.timeout) in (int, float):
904 self.sock.settimeout(self.timeout)
905 self.sock.connect((self.host, self.port))
907 if isinstance(self, compat_http_client.HTTPSConnection):
908 if hasattr(self, '_context'): # Python > 2.6
909 self.sock = self._context.wrap_socket(
910 self.sock, server_hostname=self.host)
912 self.sock = ssl.wrap_socket(self.sock)
914 return SocksConnection
917 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
918 def __init__(self, params, https_conn_class=None, *args, **kwargs):
919 compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
920 self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
921 self._params = params
923 def https_open(self, req):
925 conn_class = self._https_conn_class
927 if hasattr(self, '_context'): # python > 2.6
928 kwargs['context'] = self._context
929 if hasattr(self, '_check_hostname'): # python 3.x
930 kwargs['check_hostname'] = self._check_hostname
932 socks_proxy = req.headers.get('Ytdl-socks-proxy')
934 conn_class = make_socks_conn_class(conn_class, socks_proxy)
935 del req.headers['Ytdl-socks-proxy']
937 return self.do_open(functools.partial(
938 _create_http_connection, self, conn_class, True),
942 class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
943 def __init__(self, cookiejar=None):
944 compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)
946 def http_response(self, request, response):
947 # Python 2 will choke on next HTTP request in row if there are non-ASCII
948 # characters in Set-Cookie HTTP header of last response (see
949 # https://github.com/rg3/youtube-dl/issues/6769).
950 # In order to at least prevent crashing we will percent encode Set-Cookie
951 # header before HTTPCookieProcessor starts processing it.
952 # if sys.version_info < (3, 0) and response.headers:
953 # for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
954 # set_cookie = response.headers.get(set_cookie_header)
956 # set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
957 # if set_cookie != set_cookie_escaped:
958 # del response.headers[set_cookie_header]
959 # response.headers[set_cookie_header] = set_cookie_escaped
960 return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)
962 https_request = compat_urllib_request.HTTPCookieProcessor.http_request
963 https_response = http_response
966 def parse_iso8601(date_str, delimiter='T', timezone=None):
967 """ Return a UNIX timestamp from the given date """
972 date_str = re.sub(r'\.[0-9]+', '', date_str)
976 r'(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
979 timezone = datetime.timedelta()
981 date_str = date_str[:-len(m.group(0))]
982 if not m.group('sign'):
983 timezone = datetime.timedelta()
985 sign = 1 if m.group('sign') == '+' else -1
986 timezone = datetime.timedelta(
987 hours=sign * int(m.group('hours')),
988 minutes=sign * int(m.group('minutes')))
990 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
991 dt = datetime.datetime.strptime(date_str, date_format) - timezone
992 return calendar.timegm(dt.timetuple())
997 def unified_strdate(date_str, day_first=True):
998 """Return a string with the date in the format YYYYMMDD"""
1000 if date_str is None:
1004 date_str = date_str.replace(',', ' ')
1005 # %z (UTC offset) is only supported in python>=3.2
1006 if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
1007 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
1008 # Remove AM/PM + timezone
1009 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1011 format_expressions = [
1022 '%Y/%m/%d %H:%M:%S',
1023 '%Y-%m-%d %H:%M:%S',
1024 '%Y-%m-%d %H:%M:%S.%f',
1027 '%Y-%m-%dT%H:%M:%SZ',
1028 '%Y-%m-%dT%H:%M:%S.%fZ',
1029 '%Y-%m-%dT%H:%M:%S.%f0Z',
1030 '%Y-%m-%dT%H:%M:%S',
1031 '%Y-%m-%dT%H:%M:%S.%f',
1035 format_expressions.extend([
1041 '%d/%m/%Y %H:%M:%S',
1044 format_expressions.extend([
1049 '%m/%d/%Y %H:%M:%S',
1051 for expression in format_expressions:
1053 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
1056 if upload_date is None:
1057 timetuple = email.utils.parsedate_tz(date_str)
1060 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
1063 if upload_date is not None:
1064 return compat_str(upload_date)
1067 def determine_ext(url, default_ext='unknown_video'):
1070 guess = url.partition('?')[0].rpartition('.')[2]
1071 if re.match(r'^[A-Za-z0-9]+$', guess):
1073 # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
1074 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
1075 return guess.rstrip('/')
def subtitles_filename(filename, sub_lang, sub_format):
    """Derive a subtitle file name by swapping the media extension for '<lang>.<format>'."""
    # Only the last dot-separated component is treated as the extension.
    base = filename.rsplit('.', 1)[0]
    return '%s.%s.%s' % (base, sub_lang, sub_format)
1084 def date_from_str(date_str):
1086 Return a datetime object from a string in the format YYYYMMDD or
1087 (now|today)[+-][0-9](day|week|month|year)(s)?"""
1088 today = datetime.date.today()
1089 if date_str in ('now', 'today'):
1091 if date_str == 'yesterday':
1092 return today - datetime.timedelta(days=1)
1093 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
1094 if match is not None:
1095 sign = match.group('sign')
1096 time = int(match.group('time'))
1099 unit = match.group('unit')
1100 # A bad approximation?
1104 elif unit == 'year':
1108 delta = datetime.timedelta(**{unit: time})
1109 return today + delta
1110 return datetime.datetime.strptime(date_str, '%Y%m%d').date()
1113 def hyphenate_date(date_str):
1115 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1116 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1117 if match is not None:
1118 return '-'.join(match.groups())
1123 class DateRange(object):
1124 """Represents a time interval between two dates"""
1126 def __init__(self, start=None, end=None):
1127 """start and end must be strings in the format accepted by date"""
1128 if start is not None:
1129 self.start = date_from_str(start)
1131 self.start = datetime.datetime.min.date()
1133 self.end = date_from_str(end)
1135 self.end = datetime.datetime.max.date()
1136 if self.start > self.end:
1137 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
1141 """Returns a range that only contains the given day"""
1142 return cls(day, day)
1144 def __contains__(self, date):
1145 """Check if the date is in the range"""
1146 if not isinstance(date, datetime.date):
1147 date = date_from_str(date)
1148 return self.start <= date <= self.end
1151 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
1154 def platform_name():
1155 """ Returns the platform name as a compat_str """
1156 res = platform.platform()
1157 if isinstance(res, bytes):
1158 res = res.decode(preferredencoding())
1160 assert isinstance(res, compat_str)
1164 def _windows_write_string(s, out):
1165 """ Returns True if the string was written using special methods,
1166 False if it has yet to be written out."""
1167 # Adapted from http://stackoverflow.com/a/3259271/35070
1170 import ctypes.wintypes
1178 fileno = out.fileno()
1179 except AttributeError:
1180 # If the output stream doesn't have a fileno, it's virtual
1182 except io.UnsupportedOperation:
1183 # Some strange Windows pseudo files?
1185 if fileno not in WIN_OUTPUT_IDS:
1188 GetStdHandle = ctypes.WINFUNCTYPE(
1189 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
1190 (b'GetStdHandle', ctypes.windll.kernel32))
1191 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
1193 WriteConsoleW = ctypes.WINFUNCTYPE(
1194 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
1195 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
1196 ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
1197 written = ctypes.wintypes.DWORD(0)
1199 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
1200 FILE_TYPE_CHAR = 0x0002
1201 FILE_TYPE_REMOTE = 0x8000
1202 GetConsoleMode = ctypes.WINFUNCTYPE(
1203 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
1204 ctypes.POINTER(ctypes.wintypes.DWORD))(
1205 (b'GetConsoleMode', ctypes.windll.kernel32))
1206 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
1208 def not_a_console(handle):
1209 if handle == INVALID_HANDLE_VALUE or handle is None:
1211 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
1212 GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
1214 if not_a_console(h):
1217 def next_nonbmp_pos(s):
1219 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
1220 except StopIteration:
1224 count = min(next_nonbmp_pos(s), 1024)
1226 ret = WriteConsoleW(
1227 h, s, count if count else 2, ctypes.byref(written), None)
1229 raise OSError('Failed to write string')
1230 if not count: # We just wrote a non-BMP character
1231 assert written.value == 2
1234 assert written.value > 0
1235 s = s[written.value:]
def write_string(s, out=None, encoding=None):
    # Write unicode text `s` to stream `out`, papering over Windows console
    # quirks and Python 2/3 differences.
    # NOTE(review): parts of this function are elided in this view (the
    # default for `out` and the plain-stream write path).
    assert type(s) == compat_str
    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        # Prefer the native wide-char console write on Windows.
        if _windows_write_string(s, out):
    if ('b' in getattr(out, 'mode', '') or
            sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        # Binary stream: encode explicitly, dropping unencodable characters.
        byt = s.encode(encoding or preferredencoding(), 'ignore')
    elif hasattr(out, 'buffer'):
        # Python 3 text stream: write bytes through the underlying buffer.
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        byt = s.encode(enc, 'ignore')
        out.buffer.write(byt)
def bytes_to_intlist(bs):
    """Convert a byte string into a list of integer byte values."""
    if isinstance(bs[0], int):  # Python 3
        # Python 2 branch: bytes indexes to 1-char strings, so use ord().
        return [ord(c) for c in bs]
def intlist_to_bytes(xs):
    """Pack a list of integers (0-255) back into a byte string."""
    return compat_struct_pack('%dB' % len(xs), *xs)
# Cross-platform file locking
if sys.platform == 'win32':
    import ctypes.wintypes

    class OVERLAPPED(ctypes.Structure):
        # Field layout mirroring the Win32 OVERLAPPED structure used by
        # LockFileEx/UnlockFileEx.
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),

    # Bind LockFileEx/UnlockFileEx from kernel32 with explicit signatures.
    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high halves of the byte count: lock the whole file.
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        # Acquire a whole-file Win32 lock (0x2 = LOCKFILE_EXCLUSIVE_LOCK).
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Keep the pointer alive on the file object for the later unlock.
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        # Release the lock taken by _lock_file on the same handle.
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
    # Some platforms, such as Jython, is missing fcntl
        def _lock_file(f, exclusive):
            # POSIX: advisory flock, shared or exclusive.
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            fcntl.flock(f, fcntl.LOCK_UN)
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            raise IOError(UNSUPPORTED_MSG)
class locked_file(object):
    """File wrapper that holds an OS-level lock while used as a context
    manager (exclusive for write/append modes, shared for read)."""

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)

    def __enter__(self):
        # NOTE(review): relies on self.mode — presumably assigned in
        # __init__ on a line not visible in this excerpt.
        exclusive = self.mode != 'r'
        _lock_file(self.f, exclusive)

    def __exit__(self, etype, value, traceback):
        _unlock_file(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
def get_filesystem_encoding():
    """Name of the filesystem encoding, defaulting to 'utf-8' when the
    interpreter reports none (possible on Python 2)."""
    reported = sys.getfilesystemencoding()
    return 'utf-8' if reported is None else reported
def shell_quote(args):
    """Join `args` into one shell-safe command-line string."""
    encoding = get_filesystem_encoding()
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(pipes.quote(a))
    return ' '.join(quoted_args)
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    payload = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return '#'.join((url, payload))
def unsmuggle_url(smug_url, default=None):
    """Inverse of smuggle_url(): split off smuggled data, or return
    (smug_url, default) when none is present."""
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    data = json.loads(jsond)
def format_bytes(bytes):
    """Render a byte count as a human-readable string, e.g. '1.50MiB'."""
    if type(bytes) is str:
        bytes = float(bytes)
    # Pick the largest power-of-1024 unit that fits.
    exponent = int(math.log(bytes, 1024.0))
    suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
def lookup_unit_table(unit_table, s):
    """Parse '<number> <unit>' from `s`, using multipliers in `unit_table`."""
    units_re = '|'.join(re.escape(u) for u in unit_table)
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
    # European decimal commas are normalised to dots before conversion.
    num_str = m.group('num').replace(',', '.')
    mult = unit_table[m.group('unit')]
    return int(float(num_str) * mult)
def parse_filesize(s):
    """Parse a human filesize string like '5 MiB' into a byte count."""
        # The lower-case forms are of course incorrect and unofficial,
        # but we support those too
    return lookup_unit_table(_UNIT_TABLE, s)
    # NOTE(review): these lines belong to a count-parsing function whose
    # definition is not visible in this excerpt.
    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)
    return lookup_unit_table(_UNIT_TABLE, s)
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """
        return ENGLISH_MONTH_NAMES.index(name) + 1
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        three-letter abbreviation (e.g. 'Jan'). """
        return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    # Negative lookahead keeps already-valid entity references untouched.
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
def setproctitle(title):
    """Set the process name shown by tools like `ps` (Linux, via prctl)."""
    assert isinstance(title, compat_str)
    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
        # 15 is presumably PR_SET_NAME — TODO confirm against prctl(2).
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
def remove_start(s, start):
    """Return `s` with the prefix `start` stripped, when present."""
    if s is not None and s.startswith(start):
        return s[len(start):]
    return s
def remove_end(s, end):
    """Return `s` with the suffix `end` stripped, when present.

    Fix: the previous one-liner used `s[:-len(end)]`, which for an empty
    `end` evaluates to `s[:0]` and wrongly returned '' instead of `s`.
    """
    if s is None or not end or not s.endswith(end):
        return s
    return s[:-len(end)]
def remove_quotes(s):
    """Strip one matching pair of surrounding quotes from `s`, if any."""
    if s is None or len(s) < 2:
    for quote in ('"', "'", ):
        if s[0] == quote and s[-1] == quote:
def url_basename(url):
    """Return the last path segment of `url` (empty string for bare hosts)."""
    parsed_path = compat_urlparse.urlparse(url).path
    segments = parsed_path.strip('/').split('/')
    return segments[-1]
class HEADRequest(compat_urllib_request.Request):
    """Request subclass that issues HEAD instead of GET."""
    def get_method(self):
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Coerce `v` (optionally via attribute `get_attr`) to an int scaled by
    invscale/scale, or `default` when conversion is not possible."""
        v = getattr(v, get_attr, None)
    return int(v) * invscale // scale
def str_or_none(v, default=None):
    """Coerce `v` to compat_str, or return `default` when it is None."""
    if v is None:
        return default
    return compat_str(v)
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    # Drop thousands separators, dots and stray '+' before converting.
    int_str = re.sub(r'[,\.\+]', '', int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Coerce `v` to float scaled by invscale/scale; `default` on failure."""
        return float(v) * invscale / scale
def parse_duration(s):
    """Parse a duration expression ('1:23:45', '2h 30min', '1.5 hours', …)
    into a number of seconds, where recognised."""
    if not isinstance(s, compat_basestring):
    days, hours, mins, secs, ms = [None] * 5
    # Colon-separated clock format: [[[DD:]HH:]MM:]SS[.ms]
    m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?$', s)
        days, hours, mins, secs, ms = m.groups()
                (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
                (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
        days, hours, mins, secs, ms = m.groups()
        # Fractional forms such as '2.5 hours' / '90 mins'.
        m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)$', s)
            hours, mins = m.groups()
        duration += float(secs)
        duration += float(mins) * 60
        duration += float(hours) * 60 * 60
        duration += float(days) * 24 * 60 * 60
        duration += float(ms)
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert '.ext' before the real extension when it matches
    `expected_real_ext`; otherwise append '.ext' to the whole name."""
    name, real_ext = os.path.splitext(filename)
        '{0}.{1}{2}'.format(name, ext, real_ext)
        if not expected_real_ext or real_ext[1:] == expected_real_ext
        else '{0}.{1}'.format(filename, ext))
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the file extension with '.ext' (only when the current one
    matches `expected_real_ext`, if that is given)."""
    name, real_ext = os.path.splitext(filename)
    return '{0}.{1}'.format(
        name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
        # Output is discarded; only a spawn failure matters here.
        subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
        out, _ = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from command `output` via `version_re`."""
    assert isinstance(output, compat_str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
class PagedList(object):
    """Base class for lazily paged result lists; subclasses provide
    getslice()."""
        # This is only useful for tests
        return len(self.getslice())
class OnDemandPagedList(PagedList):
    """PagedList that fetches pages on demand via pagefunc(pagenum),
    optionally caching whole pages."""

    def __init__(self, pagefunc, pagesize, use_cache=False):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache

    def getslice(self, start=0, end=None):
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                page_results = self._cache.get(pagenum)
            if page_results is None:
                page_results = list(self._pagefunc(pagenum))
                self._cache[pagenum] = page_results
            # Offsets of the requested slice within this page.
                start % self._pagesize
                if firstid <= start < nextfirstid
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)
            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
class InAdvancePagedList(PagedList):
    """PagedList where the total page count is known in advance."""

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        start_page = start // self._pagesize
            self._pagecount if end is None else (end // self._pagesize + 1))
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
                page = page[skip_elems:]
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                    page = page[:only_more]
def uppercase_escape(s):
    """Decode \\UXXXXXXXX escape sequences in `s` into real characters."""
    unicode_escape = codecs.getdecoder('unicode_escape')
        r'\\U[0-9a-fA-F]{8}',
        lambda m: unicode_escape(m.group(0))[0],
def lowercase_escape(s):
    """Decode \\uXXXX escape sequences in `s` into real characters."""
    unicode_escape = codecs.getdecoder('unicode_escape')
        r'\\u[0-9a-fA-F]{4}',
        lambda m: unicode_escape(m.group(0))[0],
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    needs_utf8 = sys.version_info < (3, 0) and isinstance(s, compat_str)
    value = s.encode('utf-8') if needs_utf8 else s
    return compat_urllib_parse.quote(value, b"%/;:@&=+$,!~*'()?#[]")
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    url_parsed = compat_urllib_parse_urlparse(url)
    return url_parsed._replace(
        # Hostname goes through IDNA, every other component through
        # percent-escaping.
        netloc=url_parsed.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(url_parsed.path),
        params=escape_rfc3986(url_parsed.params),
        query=escape_rfc3986(url_parsed.query),
        fragment=escape_rfc3986(url_parsed.fragment)
def read_batch_urls(batch_fd):
    """Read a batch file (one URL per line), dropping BOM, comments and
    blank entries; closes `batch_fd` when done."""
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = '\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        # Comment markers: '#', ';' and ']' (INI-style sections).
        if url.startswith(('#', ';', ']')):
    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode the arguments and return ASCII bytes suitable as a POST
    body."""
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
def update_url_query(url, query):
    """Return `url` with its query string merged/overridden from dict
    `query`."""
    parsed_url = compat_urlparse.urlparse(url)
    qs = compat_parse_qs(parsed_url.query)
    return compat_urlparse.urlunparse(parsed_url._replace(
        query=compat_urllib_parse_urlencode(qs, True)))
def update_Request(req, url=None, data=None, headers={}, query={}):
    """Clone request `req`, optionally overriding url/data/headers/query,
    preserving the HEAD/GET method and timeout."""
    req_headers = req.headers.copy()
    req_headers.update(headers)
    req_data = data or req.data
    req_url = update_url_query(url or req.get_full_url(), query)
    req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
        req_url, data=req_data, headers=req_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Look up one key or the first usable of several keys in dict `d`,
    optionally skipping falsy values."""
    if isinstance(key_or_keys, (list, tuple)):
        for key in key_or_keys:
            if key not in d or d[key] is None or skip_false_values and not d[key]:
    return d.get(key_or_keys, default)
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Coerce `string` to compat_str, decoding byte strings with `encoding`.
    Note: the default encoding is evaluated once, at definition time."""
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
def parse_age_limit(s):
    """Parse an age limit like '18' or '18+', falling back to the
    US_RATINGS lookup table."""
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    return int(m.group('age')) if m else US_RATINGS.get(s)
def strip_jsonp(code):
    """Strip a JSONP callback wrapper, leaving the bare JSON payload."""
        r'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
def js_to_json(code):
    """Convert a JavaScript object literal into valid JSON text."""
        if v in ('true', 'false', 'null'):
        elif v.startswith('/*') or v == ',':
        if v[0] in ("'", '"'):
            # Re-escape the quoted string contents for JSON.
            v = re.sub(r'(?s)\\.|"', lambda m: {
            }.get(m.group(0), m.group(0)), v[1:-1])
        (r'^0[xX][0-9a-fA-F]+', 16),
        for regex, base in INTEGER_TABLE:
            im = re.match(regex, v)
                i = int(im.group(0), base)
                # Numeric keys must be quoted in JSON.
                return '"%d":' % i if v.endswith(':') else '%d' % i
    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        /\*.*?\*/|,(?=\s*[\]}])|
        [a-zA-Z_][.a-zA-Z_0-9]*|
        (?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:\s*:)?|
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
            return quality_ids.index(qid)
# Output filename template used when the user supplies none.
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
def limit_length(s, length):
    """ Add ellipses to overly long strings """
        return s[:length - len(ELLIPSES)] + ELLIPSES
def version_tuple(v):
    """Split a version string on '.' and '-' into a tuple of ints."""
    return tuple(map(int, re.split(r'[-.]', v)))
def is_outdated_version(version, limit, assume_new=True):
    """Compare dotted version strings; `assume_new` decides the result
    when a version cannot be parsed."""
        return not assume_new
        return version_tuple(version) < version_tuple(limit)
        return not assume_new
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter
    loaded_from_zip = isinstance(globals().get('__loader__'), zipimporter)
    return loaded_from_zip or hasattr(sys, 'frozen')
def args_to_str(args):
    # Get a short string representation for a subprocess command
    return ' '.join(map(compat_shlex_quote, args))
def error_to_compat_str(err):
    """Return str(err), decoding Python 2 byte strings with the preferred
    encoding."""
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    if sys.version_info[0] < 3:
        err_str = err_str.decode(preferredencoding())
def mimetype2ext(mt):
    """Map a MIME type to a conventional file extension."""
    # Use the subtype (part after '/') as the lookup key.
    _, _, res = mt.rpartition('/')
        'smptett+xml': 'tt',
        'x-mp4-fragmented': 'mp4',
def urlhandle_detect_ext(url_handle):
    """Guess a file extension from a response's headers: first from
    Content-Disposition's filename, else from Content-Type."""
    getheader = url_handle.headers.get
    cd = getheader('Content-Disposition')
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
            e = determine_ext(m.group('filename'), default_ext=None)
    return mimetype2ext(getheader('Content-Type'))
def encode_data_uri(data, mime_type):
    """Build an RFC 2397 data: URI carrying `data` base64-encoded under
    `mime_type`."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    if age_limit is None:  # No limit set
    if content_limit is None:
        return False  # Content available for everyone
    return age_limit < content_limit
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    # Known byte-order marks and the encodings they announce.
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            s = first_bytes[len(bom):].decode(enc, 'replace')
        s = first_bytes.decode('utf-8', 'replace')
    return re.match(r'^\s*<', s)
def determine_protocol(info_dict):
    """Infer the download protocol for an info dict, from the explicit
    'protocol' field, the URL scheme, or the file extension."""
    protocol = info_dict.get('protocol')
    if protocol is not None:
    url = info_dict['url']
    if url.startswith('rtmp'):
    elif url.startswith('mms'):
    elif url.startswith('rtsp'):
    ext = determine_ext(url)
    return compat_urllib_parse_urlparse(url).scheme
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    table = [header_row] + data
    max_lens = []
    for col in zip(*table):
        max_lens.append(max(len(compat_str(v)) for v in col))
    fmt_parts = ['%-' + compat_str(width + 1) + 's' for width in max_lens[:-1]]
    format_str = ' '.join(fmt_parts) + '%s'
    return '\n'.join(format_str % tuple(row) for row in table)
def _match_one(filter_part, dct):
    """Evaluate one filter expression (comparison or unary presence test)
    against dict `dct`."""
    COMPARISON_OPERATORS = {
    operator_rex = re.compile(r'''(?x)\s*
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
            (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
            (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
        op = COMPARISON_OPERATORS[m.group('op')]
        if m.group('strval') is not None:
            if m.group('op') not in ('=', '!='):
                    'Operator %s does not support string values!' % m.group('op'))
            comparison_value = m.group('strval')
                comparison_value = int(m.group('intval'))
                # Not a plain integer: try filesize suffixes (with and
                # without a trailing 'B').
                comparison_value = parse_filesize(m.group('intval'))
                if comparison_value is None:
                    comparison_value = parse_filesize(m.group('intval') + 'B')
                if comparison_value is None:
                        'Invalid integer value %r in filter part %r' % (
                            m.group('intval'), filter_part))
        actual_value = dct.get(m.group('key'))
        if actual_value is None:
            return m.group('none_inclusive')
        return op(actual_value, comparison_value)
        '': lambda v: v is not None,
        '!': lambda v: v is None,
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
        op = UNARY_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        return op(actual_value)
    raise ValueError('Invalid filter part %r' % filter_part)
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
    # '&' separates sub-expressions; all must pass.
        _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
def match_filter_func(filter_str):
    """Build a match-filter callback: returns None to accept a video, or a
    human-readable skip message otherwise."""
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
def parse_dfxp_time_expr(time_expr):
    """Parse a TTML/DFXP time expression ('12.3s' or 'HH:MM:SS.mmm') into
    seconds."""
    mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
        return float(mobj.group('time_offset'))
    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
        return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
def srt_subtitles_timecode(seconds):
    """Format a second count as an SRT timecode (HH:MM:SS,mmm)."""
    hrs = seconds / 3600
    mins = (seconds % 3600) / 60
    secs = seconds % 60
    msecs = (seconds % 1) * 1000
    return '%02d:%02d:%02d,%03d' % (hrs, mins, secs, msecs)
def dfxp2srt(dfxp_data):
    """Convert DFXP/TTML subtitle markup into SRT text."""
    # Helper that qualifies tag names with the known TTML namespaces.
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
        'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',

    class TTMLPElementParser(object):
        # Collects the plain text of one <p> element, turning <br/> into
        # newlines.
        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):

        def data(self, data):
            return self.out.strip()

    def parse_node(node):
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall(_x('.//ttaf1_0604:p')) or dfxp.findall('.//p')
        raise ValueError('Invalid dfxp/TTML subtitle')
    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
def cli_option(params, command_option, param):
    """Return [command_option, value] when params[param] is set, else []."""
    value = params.get(param)
    if value is None:
        return []
    return [command_option, value]
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Render a boolean param as a CLI option; with `separator` the option
    and value are fused into a single token."""
    param = params.get(param)
    assert isinstance(param, bool)
        return [command_option + separator + (true_value if param else false_value)]
    return [command_option, true_value if param else false_value]
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Emit [command_option] when params[param] equals expected_value,
    else []."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
def cli_configuration_args(params, param, default=[]):
    """Return the extra CLI argument list stored under `param`, or
    `default` when unset."""
    ex_args = params.get(param)
    assert isinstance(ex_args, list)
class ISO639Utils(object):
    """Conversions between ISO 639-1 (2-letter) and ISO 639-2/T (3-letter)
    language codes, backed by the _lang_map table."""
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    # Used as a classmethod (`cls`); decorator not visible in this excerpt.
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        return cls._lang_map.get(code[:2])

    # Used as a classmethod (`cls`); decorator not visible in this excerpt.
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        for short_name, long_name in cls._lang_map.items():
            if long_name == code:
class ISO3166Utils(object):
    """Mapping of ISO 3166-1 alpha-2 country codes to full country names."""
    # From http://data.okfn.org/data/core/country-list
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AS': 'American Samoa',
        'AG': 'Antigua and Barbuda',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BV': 'Bouvet Island',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BF': 'Burkina Faso',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CI': 'Côte d\'Ivoire',
        'CZ': 'Czech Republic',
        'DO': 'Dominican Republic',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GW': 'Guinea-Bissau',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'IR': 'Iran, Islamic Republic of',
        'IM': 'Isle of Man',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'LA': 'Lao People\'s Democratic Republic',
        'LI': 'Liechtenstein',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MH': 'Marshall Islands',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'PS': 'Palestine, State of',
        'PG': 'Papua New Guinea',
        'PH': 'Philippines',
        'PR': 'Puerto Rico',
        'RU': 'Russian Federation',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SL': 'Sierra Leone',
        'SX': 'Sint Maarten (Dutch part)',
        'SB': 'Solomon Islands',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'SJ': 'Svalbard and Jan Mayen',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TZ': 'Tanzania, United Republic of',
        'TL': 'Timor-Leste',
        'TT': 'Trinidad and Tobago',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',

    # Used as a classmethod (`cls`); decorator not visible in this excerpt.
    def short2full(cls, code):
        """Convert an ISO 3166-2 country code to the corresponding full name"""
        return cls._country_map.get(code.upper())
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler that honours a per-request 'Ytdl-request-proxy' header
    and hands SOCKS proxies off to the socks-aware http(s) handlers."""

    def __init__(self, proxies=None):
        # Set default handlers
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            del req.headers['Ytdl-request-proxy']
        if proxy == '__noproxy__':
            return None  # No Proxy
        if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # youtube-dl's http/https handlers do wrapping the socket with socks
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
def ohdave_rsa_encrypt(data, exponent, modulus):
    """
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    """
    # The bytes are interpreted little-endian as one big integer.
    little_endian = data[::-1]
    payload = int(binascii.hexlify(little_endian), 16)
    ciphertext = pow(payload, exponent, modulus)
    return '%x' % ciphertext
def encode_base_n(num, n, table=None):
    """Encode non-negative integer `num` in base `n`, using `table` (or a
    default 0-9a-zA-Z alphabet) for the digits."""
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
        table = FULL_TABLE[:n]
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))
        # Build the digits from least to most significant.
        ret = table[num % n] + ret
2812 def decode_packed_codes(code):
2814 r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)",
2816 obfucasted_code, base, count, symbols = mobj.groups()
2819 symbols = symbols.split('|')
2824 base_n_count = encode_base_n(count, base)
2825 symbol_table[base_n_count] = symbols[count] or base_n_count
2828 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],