2 # -*- coding: utf-8 -*-
4 from __future__ import unicode_literals
33 import xml.etree.ElementTree
40 compat_etree_fromstring,
46 compat_socket_create_connection,
51 compat_urllib_parse_urlencode,
52 compat_urllib_parse_urlparse,
53 compat_urllib_parse_unquote_plus,
54 compat_urllib_request,
def register_socks_protocols():
    """Make urlparse treat SOCKS schemes as having a netloc component.

    Works around https://bugs.python.org/issue7904: in Python < 2.6.5,
    urlsplit() mishandles URLs whose scheme is not listed in
    urlparse.uses_netloc, so "register" the SOCKS schemes there.
    """
    known_schemes = compat_urlparse.uses_netloc
    for proto in ('socks', 'socks4', 'socks4a', 'socks5'):
        if proto not in known_schemes:
            known_schemes.append(proto)
74 # This is not clearly defined otherwise
75 compiled_regex_type = type(re.compile(''))
78 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/44.0 (Chrome)',
79 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
80 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
81 'Accept-Encoding': 'gzip, deflate',
82 'Accept-Language': 'en-us,en;q=0.5',
88 ENGLISH_MONTH_NAMES = [
89 'January', 'February', 'March', 'April', 'May', 'June',
90 'July', 'August', 'September', 'October', 'November', 'December']
93 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
94 'flv', 'f4v', 'f4a', 'f4b',
95 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
105 'f4f', 'f4m', 'm3u8', 'smil')
107 # needed for sanitizing filenames in restricted mode
108 ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
109 itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
110 'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))
113 def preferredencoding():
114 """Get preferred encoding.
116 Returns the best encoding scheme for the system, based on
117 locale.getpreferredencoding() and some further tweaks.
120 pref = locale.getpreferredencoding()
128 def write_json_file(obj, fn):
129 """ Encode obj as JSON and write it to fn, atomically if possible """
131 fn = encodeFilename(fn)
132 if sys.version_info < (3, 0) and sys.platform != 'win32':
133 encoding = get_filesystem_encoding()
134 # os.path.basename returns a bytes object, but NamedTemporaryFile
135 # will fail if the filename contains non ascii characters unless we
136 # use a unicode object
137 path_basename = lambda f: os.path.basename(fn).decode(encoding)
138 # the same for os.path.dirname
139 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
141 path_basename = os.path.basename
142 path_dirname = os.path.dirname
146 'prefix': path_basename(fn) + '.',
147 'dir': path_dirname(fn),
151 # In Python 2.x, json.dump expects a bytestream.
152 # In Python 3.x, it writes to a character stream
153 if sys.version_info < (3, 0):
161 tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
166 if sys.platform == 'win32':
167 # Need to remove existing file on Windows, else os.rename raises
168 # WindowsError or FileExistsError.
173 os.rename(tf.name, fn)
182 if sys.version_info >= (2, 7):
183 def find_xpath_attr(node, xpath, key, val=None):
184 """ Find the xpath xpath[@key=val] """
185 assert re.match(r'^[a-zA-Z_-]+$', key)
186 expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
187 return node.find(expr)
189 def find_xpath_attr(node, xpath, key, val=None):
190 for f in node.findall(compat_xpath(xpath)):
191 if key not in f.attrib:
193 if val is None or f.attrib.get(key) == val:
197 # On python2.6 the xml.etree.ElementTree.Element methods don't support
198 # the namespace parameter
201 def xpath_with_ns(path, ns_map):
202 components = [c.split(':') for c in path.split('/')]
206 replaced.append(c[0])
209 replaced.append('{%s}%s' % (ns_map[ns], tag))
210 return '/'.join(replaced)
213 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
214 def _find_xpath(xpath):
215 return node.find(compat_xpath(xpath))
217 if isinstance(xpath, (str, compat_str)):
218 n = _find_xpath(xpath)
226 if default is not NO_DEFAULT:
229 name = xpath if name is None else name
230 raise ExtractorError('Could not find XML element %s' % name)
236 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
237 n = xpath_element(node, xpath, name, fatal=fatal, default=default)
238 if n is None or n == default:
241 if default is not NO_DEFAULT:
244 name = xpath if name is None else name
245 raise ExtractorError('Could not find XML element\'s text %s' % name)
251 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
252 n = find_xpath_attr(node, xpath, key)
254 if default is not NO_DEFAULT:
257 name = '%s[@%s]' % (xpath, key) if name is None else name
258 raise ExtractorError('Could not find XML attribute %s' % name)
def get_element_by_id(id, html):
    """Return the content of the first tag in *html* whose id attribute equals *id*."""
    # An id lookup is just the generic attribute search specialised to 'id'.
    # (The parameter name shadows the builtin ``id`` but is kept for
    # backward compatibility with existing callers.)
    attribute = 'id'
    return get_element_by_attribute(attribute, id, html)
269 def get_element_by_attribute(attribute, value, html):
270 """Return the content of the tag with the specified attribute in the passed HTML document"""
272 m = re.search(r'''(?xs)
274 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
276 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
280 ''' % (re.escape(attribute), re.escape(value)), html)
284 res = m.group('content')
286 if res.startswith('"') or res.startswith("'"):
289 return unescapeHTML(res)
292 class HTMLAttributeParser(compat_HTMLParser):
293 """Trivial HTML parser to gather the attributes for a single element"""
296 compat_HTMLParser.__init__(self)
298 def handle_starttag(self, tag, attrs):
299 self.attrs = dict(attrs)
302 def extract_attributes(html_element):
303 """Given a string for an HTML element such as
305 a="foo" B="bar" c="&98;az" d=boz
306 empty= noval entity="&"
309 Decode and return a dictionary of attributes.
311 'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
312 'empty': '', 'noval': None, 'entity': '&',
313 'sq': '"', 'dq': '\''
315 NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
316 but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
318 parser = HTMLAttributeParser()
319 parser.feed(html_element)
324 def clean_html(html):
325 """Clean an HTML snippet into a readable string"""
327 if html is None: # Convenience for sanitizing descriptions etc.
331 html = html.replace('\n', ' ')
332 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
333 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
335 html = re.sub('<.*?>', '', html)
336 # Replace html entities
337 html = unescapeHTML(html)
341 def sanitize_open(filename, open_mode):
342 """Try to open the given filename, and slightly tweak it if this fails.
344 Attempts to open the given filename. If this fails, it tries to change
345 the filename slightly, step by step, until it's either able to open it
346 or it fails and raises a final exception, like the standard open()
349 It returns the tuple (stream, definitive_file_name).
353 if sys.platform == 'win32':
355 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
356 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
357 stream = open(encodeFilename(filename), open_mode)
358 return (stream, filename)
359 except (IOError, OSError) as err:
360 if err.errno in (errno.EACCES,):
363 # In case of error, try to remove win32 forbidden chars
364 alt_filename = sanitize_path(filename)
365 if alt_filename == filename:
368 # An exception here should be caught in the caller
369 stream = open(encodeFilename(alt_filename), open_mode)
370 return (stream, alt_filename)
373 def timeconvert(timestr):
374 """Convert RFC 2822 defined time string into system timestamp"""
376 timetuple = email.utils.parsedate_tz(timestr)
377 if timetuple is not None:
378 timestamp = email.utils.mktime_tz(timetuple)
382 def sanitize_filename(s, restricted=False, is_id=False):
383 """Sanitizes a string so it could be used as part of a filename.
384 If restricted is set, use a stricter subset of allowed characters.
385 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
387 def replace_insane(char):
388 if restricted and char in ACCENT_CHARS:
389 return ACCENT_CHARS[char]
390 if char == '?' or ord(char) < 32 or ord(char) == 127:
393 return '' if restricted else '\''
395 return '_-' if restricted else ' -'
396 elif char in '\\/|*<>':
398 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
400 if restricted and ord(char) > 127:
405 s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
406 result = ''.join(map(replace_insane, s))
408 while '__' in result:
409 result = result.replace('__', '_')
410 result = result.strip('_')
411 # Common case of "Foreign band name - English song title"
412 if restricted and result.startswith('-_'):
414 if result.startswith('-'):
415 result = '_' + result[len('-'):]
416 result = result.lstrip('.')
422 def sanitize_path(s):
423 """Sanitizes and normalizes path on Windows"""
424 if sys.platform != 'win32':
426 drive_or_unc, _ = os.path.splitdrive(s)
427 if sys.version_info < (2, 7) and not drive_or_unc:
428 drive_or_unc, _ = os.path.splitunc(s)
429 norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
433 path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part)
434 for path_part in norm_path]
436 sanitized_path.insert(0, drive_or_unc + os.path.sep)
437 return os.path.join(*sanitized_path)
440 # Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
441 # unwanted failures due to missing protocol
def sanitize_url(url):
    """Prepend the http: scheme to protocol-relative URLs (//host/path).

    Other URLs are returned unchanged.
    """
    if url.startswith('//'):
        return 'http:%s' % url
    return url
def sanitized_Request(url, *args, **kwargs):
    """Build a compat_urllib_request.Request, fixing protocol-relative URLs first."""
    clean_url = sanitize_url(url)
    return compat_urllib_request.Request(clean_url, *args, **kwargs)
450 def orderedSet(iterable):
451 """ Remove all duplicates from the input iterable """
459 def _htmlentity_transform(entity):
460 """Transforms an HTML entity to a character."""
461 # Known non-numeric HTML entity
462 if entity in compat_html_entities.name2codepoint:
463 return compat_chr(compat_html_entities.name2codepoint[entity])
465 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
467 numstr = mobj.group(1)
468 if numstr.startswith('x'):
470 numstr = '0%s' % numstr
473 # See https://github.com/rg3/youtube-dl/issues/7518
475 return compat_chr(int(numstr, base))
479 # Unknown entity in name, return its literal representation
480 return '&%s;' % entity
486 assert type(s) == compat_str
489 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
492 def get_subprocess_encoding():
493 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
494 # For subprocess calls, encode with locale encoding
495 # Refer to http://stackoverflow.com/a/9951851/35070
496 encoding = preferredencoding()
498 encoding = sys.getfilesystemencoding()
504 def encodeFilename(s, for_subprocess=False):
506 @param s The name of the file
509 assert type(s) == compat_str
511 # Python 3 has a Unicode API
512 if sys.version_info >= (3, 0):
515 # Pass '' directly to use Unicode APIs on Windows 2000 and up
516 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
517 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
518 if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
521 # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
522 if sys.platform.startswith('java'):
525 return s.encode(get_subprocess_encoding(), 'ignore')
528 def decodeFilename(b, for_subprocess=False):
530 if sys.version_info >= (3, 0):
533 if not isinstance(b, bytes):
536 return b.decode(get_subprocess_encoding(), 'ignore')
def encodeArgument(s):
    """Encode a command-line argument for passing to a subprocess.

    Text strings are encoded with the subprocess encoding via
    encodeFilename(..., for_subprocess=True); legacy byte-string callers
    are first decoded as ASCII.
    """
    if isinstance(s, compat_str):
        return encodeFilename(s, True)
    # Legacy code that uses byte strings; decode to text before encoding.
    return encodeFilename(s.decode('ascii'), True)
def decodeArgument(b):
    """Decode a subprocess argument back to text (inverse of encodeArgument)."""
    return decodeFilename(b, for_subprocess=True)
552 def decodeOption(optval):
555 if isinstance(optval, bytes):
556 optval = optval.decode(preferredencoding())
558 assert isinstance(optval, compat_str)
562 def formatSeconds(secs):
564 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
566 return '%d:%02d' % (secs // 60, secs % 60)
571 def make_HTTPS_handler(params, **kwargs):
572 opts_no_check_certificate = params.get('nocheckcertificate', False)
573 if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9
574 context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
575 if opts_no_check_certificate:
576 context.check_hostname = False
577 context.verify_mode = ssl.CERT_NONE
579 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
582 # (create_default_context present but HTTPSHandler has no context=)
585 if sys.version_info < (3, 2):
586 return YoutubeDLHTTPSHandler(params, **kwargs)
588 context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
589 context.verify_mode = (ssl.CERT_NONE
590 if opts_no_check_certificate
591 else ssl.CERT_REQUIRED)
592 context.set_default_verify_paths()
593 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
596 def bug_reports_message():
597 if ytdl_is_updateable():
598 update_cmd = 'type youtube-dl -U to update'
600 update_cmd = 'see https://yt-dl.org/update on how to update'
601 msg = '; please report this issue on https://yt-dl.org/bug .'
602 msg += ' Make sure you are using the latest version; %s.' % update_cmd
603 msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
607 class ExtractorError(Exception):
608 """Error during info extraction."""
610 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
611 """ tb, if given, is the original traceback (so that it can be printed out).
612 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
615 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
617 if video_id is not None:
618 msg = video_id + ': ' + msg
620 msg += ' (caused by %r)' % cause
622 msg += bug_reports_message()
623 super(ExtractorError, self).__init__(msg)
626 self.exc_info = sys.exc_info() # preserve original exception
628 self.video_id = video_id
630 def format_traceback(self):
631 if self.traceback is None:
633 return ''.join(traceback.format_tb(self.traceback))
class UnsupportedError(ExtractorError):
    """Raised for URLs that no extractor is able to handle.

    Always constructed as an 'expected' error, i.e. not a youtube-dl bug.
    """

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
class RegexNotFoundError(ExtractorError):
    """Raised when an expected regular expression fails to match."""
648 class DownloadError(Exception):
649 """Download Error exception.
651 This exception may be thrown by FileDownloader objects if they are not
652 configured to continue on errors. They will contain the appropriate
656 def __init__(self, msg, exc_info=None):
657 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
658 super(DownloadError, self).__init__(msg)
659 self.exc_info = exc_info
662 class SameFileError(Exception):
663 """Same File exception.
665 This exception will be thrown by FileDownloader objects if they detect
666 multiple files would have to be downloaded to the same file on disk.
671 class PostProcessingError(Exception):
672 """Post Processing exception.
674 This exception may be raised by PostProcessor's .run() method to
675 indicate an error in the postprocessing task.
678 def __init__(self, msg):
class MaxDownloadsReached(Exception):
    """Raised once the --max-downloads limit is hit, to stop further downloads."""
687 class UnavailableVideoError(Exception):
688 """Unavailable Format exception.
690 This exception will be thrown when a video is requested
691 in a format that is not available for that video.
696 class ContentTooShortError(Exception):
697 """Content Too Short exception.
699 This exception may be raised by FileDownloader objects when a file they
700 download is too small for what the server announced first, indicating
701 the connection was probably interrupted.
704 def __init__(self, downloaded, expected):
706 self.downloaded = downloaded
707 self.expected = expected
710 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
711 # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
712 # expected HTTP responses to meet HTTP/1.0 or later (see also
713 # https://github.com/rg3/youtube-dl/issues/6727)
714 if sys.version_info < (3, 0):
715 kwargs[b'strict'] = True
716 hc = http_class(*args, **kwargs)
717 source_address = ydl_handler._params.get('source_address')
718 if source_address is not None:
719 sa = (source_address, 0)
720 if hasattr(hc, 'source_address'): # Python 2.7+
721 hc.source_address = sa
723 def _hc_connect(self, *args, **kwargs):
724 sock = compat_socket_create_connection(
725 (self.host, self.port), self.timeout, sa)
727 self.sock = ssl.wrap_socket(
728 sock, self.key_file, self.cert_file,
729 ssl_version=ssl.PROTOCOL_TLSv1)
732 hc.connect = functools.partial(_hc_connect, hc)
def handle_youtubedl_headers(headers):
    """Process youtube-dl's internal pseudo-headers before the real request.

    If the marker header 'Youtubedl-no-compression' is present, return a
    new dict with both the marker and any Accept-Encoding header removed
    (so the server sends an uncompressed response). Otherwise return the
    headers unchanged.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers
    without_encoding = dict(
        (name, value) for name, value in headers.items()
        if name.lower() != 'accept-encoding')
    del without_encoding['Youtubedl-no-compression']
    return without_encoding
747 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
748 """Handler for HTTP requests and responses.
750 This class, when installed with an OpenerDirector, automatically adds
751 the standard headers to every HTTP request and handles gzipped and
752 deflated responses from web servers. If compression is to be avoided in
753 a particular request, the original request in the program code only has
754 to include the HTTP header "Youtubedl-no-compression", which will be
755 removed before making the real request.
757 Part of this code was copied from:
759 http://techknack.net/python-urllib2-handlers/
761 Andrew Rowls, the author of that code, agreed to release it to the
765 def __init__(self, params, *args, **kwargs):
766 compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
767 self._params = params
769 def http_open(self, req):
770 conn_class = compat_http_client.HTTPConnection
772 socks_proxy = req.headers.get('Ytdl-socks-proxy')
774 conn_class = make_socks_conn_class(conn_class, socks_proxy)
775 del req.headers['Ytdl-socks-proxy']
777 return self.do_open(functools.partial(
778 _create_http_connection, self, conn_class, False),
784 return zlib.decompress(data, -zlib.MAX_WBITS)
786 return zlib.decompress(data)
789 def addinfourl_wrapper(stream, headers, url, code):
790 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
791 return compat_urllib_request.addinfourl(stream, headers, url, code)
792 ret = compat_urllib_request.addinfourl(stream, headers, url)
796 def http_request(self, req):
797 # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
798 # always respected by websites, some tend to give out URLs with non percent-encoded
799 # non-ASCII characters (see telemb.py, ard.py [#3412])
800 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
801 # To work around aforementioned issue we will replace request's original URL with
802 # percent-encoded one
803 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
804 # the code of this workaround has been moved here from YoutubeDL.urlopen()
805 url = req.get_full_url()
806 url_escaped = escape_url(url)
808 # Substitute URL if any change after escaping
809 if url != url_escaped:
810 req = update_Request(req, url=url_escaped)
812 for h, v in std_headers.items():
813 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
814 # The dict keys are capitalized because of this bug by urllib
815 if h.capitalize() not in req.headers:
818 req.headers = handle_youtubedl_headers(req.headers)
820 if sys.version_info < (2, 7) and '#' in req.get_full_url():
821 # Python 2.6 is brain-dead when it comes to fragments
822 req._Request__original = req._Request__original.partition('#')[0]
823 req._Request__r_type = req._Request__r_type.partition('#')[0]
827 def http_response(self, req, resp):
830 if resp.headers.get('Content-encoding', '') == 'gzip':
831 content = resp.read()
832 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
834 uncompressed = io.BytesIO(gz.read())
835 except IOError as original_ioerror:
836 # There may be junk add the end of the file
837 # See http://stackoverflow.com/q/4928560/35070 for details
838 for i in range(1, 1024):
840 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
841 uncompressed = io.BytesIO(gz.read())
846 raise original_ioerror
847 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
848 resp.msg = old_resp.msg
849 del resp.headers['Content-encoding']
851 if resp.headers.get('Content-encoding', '') == 'deflate':
852 gz = io.BytesIO(self.deflate(resp.read()))
853 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
854 resp.msg = old_resp.msg
855 del resp.headers['Content-encoding']
856 # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
857 # https://github.com/rg3/youtube-dl/issues/6457).
858 if 300 <= resp.code < 400:
859 location = resp.headers.get('Location')
861 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
862 if sys.version_info >= (3, 0):
863 location = location.encode('iso-8859-1').decode('utf-8')
865 location = location.decode('utf-8')
866 location_escaped = escape_url(location)
867 if location != location_escaped:
868 del resp.headers['Location']
869 if sys.version_info < (3, 0):
870 location_escaped = location_escaped.encode('utf-8')
871 resp.headers['Location'] = location_escaped
874 https_request = http_request
875 https_response = http_response
878 def make_socks_conn_class(base_class, socks_proxy):
879 assert issubclass(base_class, (
880 compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))
882 url_components = compat_urlparse.urlparse(socks_proxy)
883 if url_components.scheme.lower() == 'socks5':
884 socks_type = ProxyType.SOCKS5
885 elif url_components.scheme.lower() in ('socks', 'socks4'):
886 socks_type = ProxyType.SOCKS4
887 elif url_components.scheme.lower() == 'socks4a':
888 socks_type = ProxyType.SOCKS4A
890 def unquote_if_non_empty(s):
893 return compat_urllib_parse_unquote_plus(s)
897 url_components.hostname, url_components.port or 1080,
899 unquote_if_non_empty(url_components.username),
900 unquote_if_non_empty(url_components.password),
903 class SocksConnection(base_class):
905 self.sock = sockssocket()
906 self.sock.setproxy(*proxy_args)
907 if type(self.timeout) in (int, float):
908 self.sock.settimeout(self.timeout)
909 self.sock.connect((self.host, self.port))
911 if isinstance(self, compat_http_client.HTTPSConnection):
912 if hasattr(self, '_context'): # Python > 2.6
913 self.sock = self._context.wrap_socket(
914 self.sock, server_hostname=self.host)
916 self.sock = ssl.wrap_socket(self.sock)
918 return SocksConnection
921 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
922 def __init__(self, params, https_conn_class=None, *args, **kwargs):
923 compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
924 self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
925 self._params = params
927 def https_open(self, req):
929 conn_class = self._https_conn_class
931 if hasattr(self, '_context'): # python > 2.6
932 kwargs['context'] = self._context
933 if hasattr(self, '_check_hostname'): # python 3.x
934 kwargs['check_hostname'] = self._check_hostname
936 socks_proxy = req.headers.get('Ytdl-socks-proxy')
938 conn_class = make_socks_conn_class(conn_class, socks_proxy)
939 del req.headers['Ytdl-socks-proxy']
941 return self.do_open(functools.partial(
942 _create_http_connection, self, conn_class, True),
946 class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
947 def __init__(self, cookiejar=None):
948 compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)
950 def http_response(self, request, response):
951 # Python 2 will choke on next HTTP request in row if there are non-ASCII
952 # characters in Set-Cookie HTTP header of last response (see
953 # https://github.com/rg3/youtube-dl/issues/6769).
954 # In order to at least prevent crashing we will percent encode Set-Cookie
955 # header before HTTPCookieProcessor starts processing it.
956 # if sys.version_info < (3, 0) and response.headers:
957 # for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
958 # set_cookie = response.headers.get(set_cookie_header)
960 # set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
961 # if set_cookie != set_cookie_escaped:
962 # del response.headers[set_cookie_header]
963 # response.headers[set_cookie_header] = set_cookie_escaped
964 return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)
966 https_request = compat_urllib_request.HTTPCookieProcessor.http_request
967 https_response = http_response
970 def parse_iso8601(date_str, delimiter='T', timezone=None):
971 """ Return a UNIX timestamp from the given date """
976 date_str = re.sub(r'\.[0-9]+', '', date_str)
980 r'(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
983 timezone = datetime.timedelta()
985 date_str = date_str[:-len(m.group(0))]
986 if not m.group('sign'):
987 timezone = datetime.timedelta()
989 sign = 1 if m.group('sign') == '+' else -1
990 timezone = datetime.timedelta(
991 hours=sign * int(m.group('hours')),
992 minutes=sign * int(m.group('minutes')))
994 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
995 dt = datetime.datetime.strptime(date_str, date_format) - timezone
996 return calendar.timegm(dt.timetuple())
1001 def unified_strdate(date_str, day_first=True):
1002 """Return a string with the date in the format YYYYMMDD"""
1004 if date_str is None:
1008 date_str = date_str.replace(',', ' ')
1009 # %z (UTC offset) is only supported in python>=3.2
1010 if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
1011 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
1012 # Remove AM/PM + timezone
1013 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1015 format_expressions = [
1026 '%Y/%m/%d %H:%M:%S',
1027 '%Y-%m-%d %H:%M:%S',
1028 '%Y-%m-%d %H:%M:%S.%f',
1031 '%Y-%m-%dT%H:%M:%SZ',
1032 '%Y-%m-%dT%H:%M:%S.%fZ',
1033 '%Y-%m-%dT%H:%M:%S.%f0Z',
1034 '%Y-%m-%dT%H:%M:%S',
1035 '%Y-%m-%dT%H:%M:%S.%f',
1039 format_expressions.extend([
1045 '%d/%m/%Y %H:%M:%S',
1048 format_expressions.extend([
1053 '%m/%d/%Y %H:%M:%S',
1055 for expression in format_expressions:
1057 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
1060 if upload_date is None:
1061 timetuple = email.utils.parsedate_tz(date_str)
1064 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
1067 if upload_date is not None:
1068 return compat_str(upload_date)
1071 def determine_ext(url, default_ext='unknown_video'):
1074 guess = url.partition('?')[0].rpartition('.')[2]
1075 if re.match(r'^[A-Za-z0-9]+$', guess):
1077 # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
1078 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
1079 return guess.rstrip('/')
def subtitles_filename(filename, sub_lang, sub_format):
    """Derive a subtitle filename by replacing the media extension with '<lang>.<format>'."""
    base = filename.rsplit('.', 1)[0]
    return '%s.%s.%s' % (base, sub_lang, sub_format)
1088 def date_from_str(date_str):
1090 Return a datetime object from a string in the format YYYYMMDD or
1091 (now|today)[+-][0-9](day|week|month|year)(s)?"""
1092 today = datetime.date.today()
1093 if date_str in ('now', 'today'):
1095 if date_str == 'yesterday':
1096 return today - datetime.timedelta(days=1)
1097 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
1098 if match is not None:
1099 sign = match.group('sign')
1100 time = int(match.group('time'))
1103 unit = match.group('unit')
1104 # A bad approximation?
1108 elif unit == 'year':
1112 delta = datetime.timedelta(**{unit: time})
1113 return today + delta
1114 return datetime.datetime.strptime(date_str, '%Y%m%d').date()
1117 def hyphenate_date(date_str):
1119 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1120 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1121 if match is not None:
1122 return '-'.join(match.groups())
1127 class DateRange(object):
1128 """Represents a time interval between two dates"""
1130 def __init__(self, start=None, end=None):
1131 """start and end must be strings in the format accepted by date"""
1132 if start is not None:
1133 self.start = date_from_str(start)
1135 self.start = datetime.datetime.min.date()
1137 self.end = date_from_str(end)
1139 self.end = datetime.datetime.max.date()
1140 if self.start > self.end:
1141 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
1145 """Returns a range that only contains the given day"""
1146 return cls(day, day)
1148 def __contains__(self, date):
1149 """Check if the date is in the range"""
1150 if not isinstance(date, datetime.date):
1151 date = date_from_str(date)
1152 return self.start <= date <= self.end
1155 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
1158 def platform_name():
1159 """ Returns the platform name as a compat_str """
1160 res = platform.platform()
1161 if isinstance(res, bytes):
1162 res = res.decode(preferredencoding())
1164 assert isinstance(res, compat_str)
1168 def _windows_write_string(s, out):
1169 """ Returns True if the string was written using special methods,
1170 False if it has yet to be written out."""
1171 # Adapted from http://stackoverflow.com/a/3259271/35070
1174 import ctypes.wintypes
1182 fileno = out.fileno()
1183 except AttributeError:
1184 # If the output stream doesn't have a fileno, it's virtual
1186 except io.UnsupportedOperation:
1187 # Some strange Windows pseudo files?
1189 if fileno not in WIN_OUTPUT_IDS:
1192 GetStdHandle = ctypes.WINFUNCTYPE(
1193 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
1194 (b'GetStdHandle', ctypes.windll.kernel32))
1195 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
1197 WriteConsoleW = ctypes.WINFUNCTYPE(
1198 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
1199 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
1200 ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
1201 written = ctypes.wintypes.DWORD(0)
1203 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
1204 FILE_TYPE_CHAR = 0x0002
1205 FILE_TYPE_REMOTE = 0x8000
1206 GetConsoleMode = ctypes.WINFUNCTYPE(
1207 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
1208 ctypes.POINTER(ctypes.wintypes.DWORD))(
1209 (b'GetConsoleMode', ctypes.windll.kernel32))
1210 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
1212 def not_a_console(handle):
1213 if handle == INVALID_HANDLE_VALUE or handle is None:
1215 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
1216 GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
1218 if not_a_console(h):
1221 def next_nonbmp_pos(s):
1223 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
1224 except StopIteration:
1228 count = min(next_nonbmp_pos(s), 1024)
1230 ret = WriteConsoleW(
1231 h, s, count if count else 2, ctypes.byref(written), None)
1233 raise OSError('Failed to write string')
1234 if not count: # We just wrote a non-BMP character
1235 assert written.value == 2
1238 assert written.value > 0
1239 s = s[written.value:]
def write_string(s, out=None, encoding=None):
    # Write the unicode string s to stream out (default: sys.stderr elsewhere),
    # working around Windows console and Python 2 byte-stream quirks.
    assert type(s) == compat_str
    # On Windows, try the native WriteConsoleW path first; it handles
    # characters the console codepage cannot encode.
    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):
    # Byte streams (or any stream on Python 2) need explicit encoding.
    if ('b' in getattr(out, 'mode', '') or
            sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        byt = s.encode(encoding or preferredencoding(), 'ignore')
    elif hasattr(out, 'buffer'):
        # Python 3 text stream: write encoded bytes to the underlying buffer.
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        byt = s.encode(enc, 'ignore')
        out.buffer.write(byt)
def bytes_to_intlist(bs):
    # Convert a bytes/str object into a list of integer byte values,
    # bridging the Python 2 (str of chars) / Python 3 (bytes of ints) split.
    if isinstance(bs[0], int):  # Python 3
    # Python 2: elements are 1-char strings, convert via ord().
    return [ord(c) for c in bs]
def intlist_to_bytes(xs):
    # Inverse of bytes_to_intlist: pack a list of byte values back into bytes.
    return compat_struct_pack('%dB' % len(xs), *xs)
# Cross-platform file locking
# Defines _lock_file(f, exclusive) / _unlock_file(f) with a per-platform
# implementation: Win32 LockFileEx/UnlockFileEx, POSIX fcntl.flock, or a
# raising stub where neither is available.
if sys.platform == 'win32':
    import ctypes.wintypes

    # Mirrors the Win32 OVERLAPPED structure required by LockFileEx.
    class OVERLAPPED(ctypes.Structure):
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Lock the whole file: low/high DWORDs of the byte-range length.
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Keep the pointer alive on the file object for the later unlock call.
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # 0x2 = LOCKFILE_EXCLUSIVE_LOCK
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
    # Some platforms, such as Jython, is missing fcntl
        def _lock_file(f, exclusive):
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            fcntl.flock(f, fcntl.LOCK_UN)

        # Fallback when fcntl cannot be imported: locking is unsupported.
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            raise IOError(UNSUPPORTED_MSG)
class locked_file(object):
    # Context manager wrapping io.open() with _lock_file/_unlock_file,
    # so concurrent youtube-dl processes do not corrupt shared files.
    def __init__(self, filename, mode, encoding=None):
        # Only plain read/append/write text modes are supported.
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)

    def __enter__(self):
        # Reads share the lock; any write mode takes an exclusive lock.
        exclusive = self.mode != 'r'
        _lock_file(self.f, exclusive)

    def __exit__(self, etype, value, traceback):
        _unlock_file(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
def get_filesystem_encoding():
    """Return the filesystem encoding, defaulting to 'utf-8' when unknown."""
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        return 'utf-8'
    return encoding
def shell_quote(args):
    """Quote a list of arguments for safe display as a shell command line.

    Bytes arguments (e.g. from encodeFilename) are decoded with the
    filesystem encoding before quoting.
    """
    quoted_args = []
    encoding = get_filesystem_encoding()
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(pipes.quote(a))
    return ' '.join(quoted_args)
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """

    payload = {'__youtubedl_smuggle': json.dumps(data)}
    sdata = compat_urllib_parse_urlencode(payload)
    return '%s#%s' % (url, sdata)
def unsmuggle_url(smug_url, default=None):
    """Extract data smuggled into a URL by smuggle_url().

    Returns (url, data); data is *default* when nothing was smuggled.
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    data = json.loads(jsond)
    return url, data
def format_bytes(bytes):
    """Format a byte count as a human-readable string, e.g. '1.00KiB'.

    Accepts int, float, numeric str, or None (rendered as 'N/A').
    """
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    if bytes == 0.0:
        # math.log(0) is undefined; zero bytes is simply '0.00B'.
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
    suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
def lookup_unit_table(unit_table, s):
    """Parse '<number><unit>' using *unit_table* (unit -> multiplier).

    Returns the value in base units as int, or None when *s* does not match.
    Comma decimal separators are accepted ('1,5' == '1.5').
    """
    units_re = '|'.join(re.escape(u) for u in unit_table)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
    if not m:
        return None
    num_str = m.group('num').replace(',', '.')
    mult = unit_table[m.group('unit')]
    return int(float(num_str) * mult)
def parse_filesize(s):
    # Parse a human-readable file size ('5 MiB', '1.2GB', ...) into bytes.
    # NOTE(review): _UNIT_TABLE is a module-level/local mapping of unit
    # suffixes to multipliers defined alongside this function — confirm.
    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    return lookup_unit_table(_UNIT_TABLE, s)
    # NOTE(review): this fragment appears to belong to a count-parsing helper
    # (e.g. view counts like '1.2M') whose def line is outside this view —
    # confirm against the full file.
    # Purely numeric strings (possibly with thousands separators) parse directly.
    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    return lookup_unit_table(_UNIT_TABLE, s)
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """

    try:
        return ENGLISH_MONTH_NAMES.index(name) + 1
    except ValueError:
        # Unknown month name: signal with None instead of raising.
        return None
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations (e.g. 'Jan' -> 1) """

    try:
        return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
    except ValueError:
        # Unknown abbreviation: signal with None instead of raising.
        return None
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML, leaving existing entities intact."""
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
        '&amp;',
        xml_str)
def setproctitle(title):
    # Best-effort: set the process name shown by ps/top via prctl (glibc only).
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):

    libc = ctypes.cdll.LoadLibrary('libc.so.6')
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
    # PR_SET_NAME == 15 (see prctl(2)); the kernel truncates to 16 bytes.
    libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
def remove_start(s, start):
    """Strip *start* from the beginning of *s* when present; None passes through."""
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
def remove_end(s, end):
    """Strip *end* from the end of *s* when present; None passes through.

    Fixes the empty-suffix bug: with end == '', s[:-len(end)] is s[:-0]
    which is '' — the whole string was wrongly discarded.
    """
    if s is None or not s.endswith(end):
        return s
    return s[:len(s) - len(end)]
def remove_quotes(s):
    """Strip one matching pair of surrounding single or double quotes from *s*.

    Strings that are too short, None, or not symmetrically quoted are
    returned unchanged.
    """
    if s is None or len(s) < 2:
        return s
    for quote in ('"', "'", ):
        if s[0] == quote and s[-1] == quote:
            return s[1:-1]
    return s
def url_basename(url):
    """Return the last path segment of *url*, ignoring query and fragment."""
    parsed_path = compat_urlparse.urlparse(url).path
    segments = parsed_path.strip('/').split('/')
    return segments[-1]
class HEADRequest(compat_urllib_request.Request):
    """A urllib Request that issues HTTP HEAD instead of GET."""
    def get_method(self):
        return 'HEAD'
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert *v* to int, scaled by invscale/scale, or *default* when None.

    When *get_attr* is given, v is first replaced by getattr(v, get_attr).
    """
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    if v is None:
        return default
    return int(v) * invscale // scale
def str_or_none(v, default=None):
    """Coerce *v* to compat_str, or return *default* when v is None."""
    if v is None:
        return default
    return compat_str(v)
def str_to_int(int_str):
    """ A more relaxed version of int_or_none: strips ',', '.', '+'
    separators before conversion; None passes through. """
    if int_str is None:
        return None
    int_str = re.sub(r'[,\.\+]', '', int_str)
    return int(int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert *v* to float, scaled by invscale/scale; *default* on None
    or on unconvertible input."""
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        return default
def parse_duration(s):
    # Parse a duration expression ('1:23:45', '3h 10m', '90 min', ...) into
    # seconds as a float; non-string input is rejected early.
    if not isinstance(s, compat_basestring):

    days, hours, mins, secs, ms = [None] * 5
    # Colon-separated form: [[[DD:]HH:]MM:]SS[.ms]
    m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?$', s)
        days, hours, mins, secs, ms = m.groups()
                # Unit-suffixed form: '2d 3h 4min 5.6s' (each part optional)
                (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
            (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
            days, hours, mins, secs, ms = m.groups()
        # Fractional single-unit form: '2.5 hours' / '90 mins'
        m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)$', s)
            hours, mins = m.groups()

        # Accumulate whichever components matched into total seconds.
        duration += float(secs)
        duration += float(mins) * 60
        duration += float(hours) * 60 * 60
        duration += float(days) * 24 * 60 * 60
        duration += float(ms)
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert *ext* before the filename's real extension.

    'a.mp4' + 'temp' -> 'a.temp.mp4'. When *expected_real_ext* is given and
    does not match the actual extension, *ext* is appended instead.
    """
    name, real_ext = os.path.splitext(filename)
    return (
        '{0}.{1}{2}'.format(name, ext, real_ext)
        if not expected_real_ext or real_ext[1:] == expected_real_ext
        else '{0}.{1}'.format(filename, ext))
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the filename's extension with *ext*.

    When *expected_real_ext* is given and does not match the actual
    extension, *ext* is appended to the full filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    return '{0}.{1}'.format(
        name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
        ext)
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    # NOTE: args defaults to a shared list but is never mutated here, and the
    # signature is kept for compatibility with existing callers.
    try:
        subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        # Binary not found (or not executable): report absence, don't raise.
        return False
    return exe
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    # NOTE(review): a try/except OSError returning False presumably wraps this
    # Popen call (per the docstring's 'False if not present') — confirm.
    out, _ = subprocess.Popen(
        [encodeArgument(exe)] + args,
        stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version number from an executable's --version output.

    Returns the first capture group of *version_re* (default: text after
    'version'), or *unrecognized* when nothing matches.
    """
    assert isinstance(output, compat_str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
    if m:
        return m.group(1)
    else:
        return unrecognized
class PagedList(object):
    # Abstract base for lazily paged result lists; subclasses implement
    # getslice(start, end).
        # This is only useful for tests
        return len(self.getslice())
class OnDemandPagedList(PagedList):
    # PagedList that fetches pages on demand via pagefunc(pagenum), with an
    # optional in-memory page cache.
    def __init__(self, pagefunc, pagesize, use_cache=False):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache

    def getslice(self, start=0, end=None):
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            # Page entirely before the requested window: skip it.
            if start >= nextfirstid:

                page_results = self._cache.get(pagenum)
            if page_results is None:
                page_results = list(self._pagefunc(pagenum))
                self._cache[pagenum] = page_results

                # Offset into the page where the requested range begins.
                start % self._pagesize
                if firstid <= start < nextfirstid

                # Offset into the page where the requested range ends.
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
class InAdvancePagedList(PagedList):
    # PagedList for sources where the total page count is known up front.
    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        start_page = start // self._pagesize
            # Last page to fetch (exclusive): all pages, or up to *end*.
            self._pagecount if end is None else (end // self._pagesize + 1))
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
                # Drop leading elements before *start* on the first page.
                page = page[skip_elems:]
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                    page = page[:only_more]
def uppercase_escape(s):
    """Decode literal '\\UXXXXXXXX' escape sequences found in *s*."""
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: unicode_escape(m.group(0))[0],
        s)
def lowercase_escape(s):
    """Decode literal '\\uXXXX' escape sequences found in *s*."""
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\u[0-9a-fA-F]{4}',
        lambda m: unicode_escape(m.group(0))[0],
        s)
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # Python 2's quote() cannot handle unicode; pre-encode to UTF-8 bytes.
    if sys.version_info < (3, 0) and isinstance(s, compat_str):
        s = s.encode('utf-8')
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe_chars)
def escape_url(url):
    """Escape URL as suggested by RFC 3986: IDNA-encode the host and
    percent-escape non-ASCII characters in the remaining components."""
    url_parsed = compat_urllib_parse_urlparse(url)
    return url_parsed._replace(
        netloc=url_parsed.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(url_parsed.path),
        params=escape_rfc3986(url_parsed.params),
        query=escape_rfc3986(url_parsed.query),
        fragment=escape_rfc3986(url_parsed.fragment)
    ).geturl()
def read_batch_urls(batch_fd):
    # Read a batch file object and return its non-comment, non-empty URLs.
    # The cleanup below runs per line (inside a local fixup(url) helper whose
    # def line is outside this view — confirm).
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # Strip a leading UTF-8 BOM that survives as these three chars.
        BOM_UTF8 = '\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        # Comment markers: '#', ';' and ']' lines are ignored.
        if url.startswith(('#', ';', ']')):

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes."""
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
def update_url_query(url, query):
    """Return *url* with the *query* dict merged into its query string."""
    if not query:
        return url
    parsed_url = compat_urlparse.urlparse(url)
    qs = compat_parse_qs(parsed_url.query)
    # Merge, letting *query* override existing parameters of the same name.
    qs.update(query)
    return compat_urlparse.urlunparse(parsed_url._replace(
        query=compat_urllib_parse_urlencode(qs, True)))
def update_Request(req, url=None, data=None, headers={}, query={}):
    """Clone *req* with updated url/data/headers/query, preserving its
    HEAD-ness, origin host, unverifiable flag and timeout."""
    req_headers = req.headers.copy()
    req_headers.update(headers)
    req_data = data or req.data
    req_url = update_url_query(url or req.get_full_url(), query)
    # Keep HEAD requests HEAD after cloning.
    req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
    new_req = req_type(
        req_url, data=req_data, headers=req_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Get the first usable value from *d* for one key or a sequence of keys.

    Values that are None (and, when skip_false_values, falsy) are skipped.
    """
    if isinstance(key_or_keys, (list, tuple)):
        for key in key_or_keys:
            if key not in d or d[key] is None or skip_false_values and not d[key]:
                continue
            return d[key]
        return default
    return d.get(key_or_keys, default)
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Coerce *string* to compat_str, decoding bytes with *encoding*.

    Note: the encoding default is evaluated once, at import time.
    """
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
def parse_age_limit(s):
    """Parse an age limit like '18' or '16+' into an int, falling back to
    the US_RATINGS table for rating labels; None passes through."""
    if s is None:
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    return int(m.group('age')) if m else US_RATINGS.get(s)
def strip_jsonp(code):
    """Remove a JSONP callback wrapper, returning the bare JSON payload."""
    return re.sub(
        r'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
def js_to_json(code):
    # Convert a JavaScript object literal into valid JSON text.
    # The per-token fixup below runs inside a local callback (its def line is
    # outside this view — confirm) applied by the re.sub at the bottom.
        # JSON keywords pass through unchanged.
        if v in ('true', 'false', 'null'):
        # Comments and stray commas are dropped.
        elif v.startswith('/*') or v == ',':
        if v[0] in ("'", '"'):
            # Re-escape string contents so single-quoted JS strings
            # become valid double-quoted JSON strings.
            v = re.sub(r'(?s)\\.|"', lambda m: {
            }.get(m.group(0), m.group(0)), v[1:-1])
        # Integer literals: hex (base 16) and, presumably, octal below.
        (r'^0[xX][0-9a-fA-F]+', 16),
        for regex, base in INTEGER_TABLE:
            im = re.match(regex, v)
                i = int(im.group(0), base)
                # Keys get quoted; bare values become decimal literals.
                return '"%d":' % i if v.endswith(':') else '%d' % i
    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        /\*.*?\*/|,(?=\s*[\]}])|
        [a-zA-Z_][.a-zA-Z_0-9]*|
        (?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:\s*:)?|
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        try:
            return quality_ids.index(qid)
        except ValueError:
            # Unknown quality id ranks below every known one.
            return -1
    return q
# Default output filename template: "<title>-<id>.<ext>".
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) > length:
        # Truncate so that the result, including the ellipses, fits *length*.
        return s[:length - len(ELLIPSES)] + ELLIPSES
    return s
def version_tuple(v):
    """Split a dotted/dashed version string into a tuple of ints."""
    parts = re.split(r'[-.]', v)
    return tuple(map(int, parts))
def is_outdated_version(version, limit, assume_new=True):
    """Return True when *version* is older than *limit*.

    Empty/unparseable versions yield `not assume_new` (i.e. by default
    an unknown version is assumed up to date).
    """
    if not version:
        return not assume_new
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return not assume_new
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # Updateable when running from a zip bundle or a frozen executable.
    if isinstance(globals().get('__loader__'), zipimporter):
        return True
    return hasattr(sys, 'frozen')
def args_to_str(args):
    # Get a short string representation for a subprocess command
    quoted = [compat_shlex_quote(a) for a in args]
    return ' '.join(quoted)
def error_to_compat_str(err):
    """Render an exception as a unicode string on both Python 2 and 3."""
    err_str = str(err)
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    if sys.version_info[0] < 3:
        err_str = err_str.decode(preferredencoding())
    return err_str
def mimetype2ext(mt):
    # Map a MIME type ('video/mp4', 'application/x-mpegurl', ...) to a file
    # extension; special cases are looked up in a table (partially elided here).
    _, _, res = mt.rpartition('/')
        'smptett+xml': 'tt',
        'x-mp4-fragmented': 'mp4',
def urlhandle_detect_ext(url_handle):
    # Guess the file extension of a response: prefer the filename from the
    # Content-Disposition header, fall back to the Content-Type MIME type.
    getheader = url_handle.headers.get

    cd = getheader('Content-Disposition')
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
            e = determine_ext(m.group('filename'), default_ext=None)

    return mimetype2ext(getheader('Content-Type'))
def encode_data_uri(data, mime_type):
    """Build an RFC 2397 'data:' URI with a base64-encoded payload."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """

    if age_limit is None:  # No limit set
        return False
    if content_limit is None:
        return False  # Content available for everyone
    return age_limit < content_limit
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            s = first_bytes[len(bom):].decode(enc, 'replace')
            break
    else:
        # No BOM: assume UTF-8, replacing undecodable bytes.
        s = first_bytes.decode('utf-8', 'replace')

    # HTML starts with a '<' after optional whitespace.
    return re.match(r'^\s*<', s)
def determine_protocol(info_dict):
    # Work out the download protocol for an info dict: prefer an explicit
    # 'protocol' entry, else inspect the URL scheme/extension.
    protocol = info_dict.get('protocol')
    if protocol is not None:

    url = info_dict['url']
    # Streaming schemes are recognized by URL prefix.
    if url.startswith('rtmp'):
    elif url.startswith('mms'):
    elif url.startswith('rtsp'):

    # Manifest extensions (m3u8/f4m) imply their own protocols.
    ext = determine_ext(url)

    # Fallback: the plain URL scheme (http/https/ftp/...).
    return compat_urllib_parse_urlparse(url).scheme
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    rows = [header_row] + data
    # Widest cell per column determines that column's padding.
    widths = [max(len(compat_str(cell)) for cell in column) for column in zip(*rows)]
    pieces = ['%-' + compat_str(width + 1) + 's' for width in widths[:-1]]
    fmt = ' '.join(pieces) + '%s'
    return '\n'.join(fmt % tuple(row) for row in rows)
def _match_one(filter_part, dct):
    # Evaluate a single filter expression (e.g. 'duration > 600', '!is_live')
    # against dict *dct*. Used by match_str().
    COMPARISON_OPERATORS = {
    operator_rex = re.compile(r'''(?x)\s*
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
            (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
            (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
        op = COMPARISON_OPERATORS[m.group('op')]
        if m.group('strval') is not None:
            # String comparisons only make sense for equality tests.
            if m.group('op') not in ('=', '!='):
                    'Operator %s does not support string values!' % m.group('op'))
            comparison_value = m.group('strval')
                comparison_value = int(m.group('intval'))
                # Not a plain integer: try it as a file size ('500KiB', '2M').
                comparison_value = parse_filesize(m.group('intval'))
                if comparison_value is None:
                    comparison_value = parse_filesize(m.group('intval') + 'B')
                if comparison_value is None:
                        'Invalid integer value %r in filter part %r' % (
                            m.group('intval'), filter_part))
        actual_value = dct.get(m.group('key'))
        if actual_value is None:
            # '?' after the operator makes a missing key count as a match.
            return m.group('none_inclusive')
        return op(actual_value, comparison_value)

        # Unary operators test mere presence/absence of a key.
        '': lambda v: v is not None,
        '!': lambda v: v is None,
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
        op = UNARY_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """

    # '&'-separated parts must all hold for the filter to pass.
    return all(
        _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
def match_filter_func(filter_str):
    """Build a --match-filter callback: returns None when the video passes,
    or a human-readable skip reason string otherwise."""
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        else:
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP/TTML time expression ('12.5s' or 'HH:MM:SS[.f]')
    into seconds; returns None for empty or unrecognized input."""
    if not time_expr:
        return

    mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if mobj:
        return float(mobj.group('time_offset'))

    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if mobj:
        # A ':' before the fraction (frame-style) is treated as a decimal point.
        return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
def srt_subtitles_timecode(seconds):
    """Format a duration in seconds as an SRT timecode (HH:MM:SS,mmm)."""
    hours = seconds / 3600
    minutes = (seconds % 3600) / 60
    secs = seconds % 60
    millis = (seconds % 1) * 1000
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
def dfxp2srt(dfxp_data):
    # Convert DFXP/TTML subtitle XML into SRT text.
    # Helper to qualify tag names across the known TTML namespaces.
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
        'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',

    # Streaming parser target that flattens a <p> element to plain text,
    # turning <br/> into newlines.
    class TTMLPElementParser(object):
        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):

        def data(self, data):

            return self.out.strip()

    def parse_node(node):
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
    # Paragraphs may live in any of the namespace variants, or none.
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall(_x('.//ttaf1_0604:p')) or dfxp.findall('.//p')
        raise ValueError('Invalid dfxp/TTML subtitle')

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            # Without an explicit end, derive it from begin + dur.
            end_time = begin_time + dur
        # SRT cue: index, time range, then the flattened paragraph text.
        out.append('%d\n%s --> %s\n%s\n\n' % (
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
def cli_option(params, command_option, param):
    """Return [command_option, value] when params[param] is set, else []."""
    value = params.get(param)
    if value is None:
        return []
    return [command_option, value]
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Render a boolean param as CLI args: ['--opt', 'true'] or, with a
    separator, ['--opt=true']."""
    param = params.get(param)
    assert isinstance(param, bool)
    if separator:
        return [command_option + separator + (true_value if param else false_value)]
    return [command_option, true_value if param else false_value]
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return [command_option] when params[param] equals expected_value, else []."""
    matched = params.get(param) == expected_value
    return [command_option] if matched else []
def cli_configuration_args(params, param, default=[]):
    """Return the list of extra CLI args stored in params[param], or
    *default* when absent. The stored value must be a list."""
    ex_args = params.get(param)
    if ex_args is None:
        return default
    assert isinstance(ex_args, list)
    return ex_args
class ISO639Utils(object):
    # Conversions between ISO 639-1 (2-letter) and ISO 639-2/T (3-letter)
    # language codes, backed by a _lang_map dict (elided from this view).
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        return cls._lang_map.get(code[:2])

    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        # Reverse lookup over the forward map.
        for short_name, long_name in cls._lang_map.items():
            if long_name == code:
class ISO3166Utils(object):
    # Maps ISO 3166-1 alpha-2 country codes to full English country names
    # (table shown here only in part).
    # From http://data.okfn.org/data/core/country-list
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AS': 'American Samoa',
        'AG': 'Antigua and Barbuda',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BV': 'Bouvet Island',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BF': 'Burkina Faso',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CI': 'Côte d\'Ivoire',
        'CZ': 'Czech Republic',
        'DO': 'Dominican Republic',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GW': 'Guinea-Bissau',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'IR': 'Iran, Islamic Republic of',
        'IM': 'Isle of Man',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'LA': 'Lao People\'s Democratic Republic',
        'LI': 'Liechtenstein',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MH': 'Marshall Islands',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'PS': 'Palestine, State of',
        'PG': 'Papua New Guinea',
        'PH': 'Philippines',
        'PR': 'Puerto Rico',
        'RU': 'Russian Federation',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SL': 'Sierra Leone',
        'SX': 'Sint Maarten (Dutch part)',
        'SB': 'Solomon Islands',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'SJ': 'Svalbard and Jan Mayen',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TZ': 'Tanzania, United Republic of',
        'TL': 'Timor-Leste',
        'TT': 'Trinidad and Tobago',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
    def short2full(cls, code):
        """Convert an ISO 3166-2 country code to the corresponding full name"""
        # Lookup is case-insensitive on the input code.
        return cls._country_map.get(code.upper())
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    # ProxyHandler variant that honors a per-request 'Ytdl-request-proxy'
    # header, so individual requests can override the global proxy.
    def __init__(self, proxies=None):
        # Set default handlers
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            # Internal header: strip it before the request leaves.
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        # SOCKS proxies are signalled to the http/https handlers via header.
        if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # youtube-dl's http/https handlers do wrapping the socket with socks
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''

    # Little-endian interpretation of the payload bytes.
    payload = int(binascii.hexlify(data[::-1]), 16)
    encrypted = pow(payload, exponent, modulus)
    return '%x' % encrypted
def encode_base_n(num, n, table=None):
    """Encode non-negative integer *num* in base *n* using *table* as the
    digit alphabet (default: 0-9a-zA-Z truncated to n symbols)."""
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if table is None:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    ret = ''
    while num:
        ret = table[num % n] + ret
        num = num // n
    return ret
2816 def decode_packed_codes(code):
2818 r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)",
2820 obfucasted_code, base, count, symbols = mobj.groups()
2823 symbols = symbols.split('|')
2828 base_n_count = encode_base_n(count, base)
2829 symbol_table[base_n_count] = symbols[count] or base_n_count
2832 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],