2 # -*- coding: utf-8 -*-
4 from __future__ import unicode_literals
33 import xml.etree.ElementTree
40 compat_etree_fromstring,
46 compat_socket_create_connection,
51 compat_urllib_parse_urlencode,
52 compat_urllib_parse_urlparse,
53 compat_urllib_parse_unquote_plus,
54 compat_urllib_request,
def register_socks_protocols():
    """Teach urlparse that SOCKS schemes carry a network location.

    In Python < 2.6.5, urlsplit() suffers from
    https://bugs.python.org/issue7904: URLs whose scheme is missing from
    urlparse.uses_netloc are not handled correctly, so each SOCKS scheme
    is appended (once) to that registry.
    """
    registry = compat_urlparse.uses_netloc
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in registry:
            registry.append(scheme)
# The type of an already-compiled regular expression object; the re module
# does not clearly expose this otherwise, so capture it from a sample compile.
compiled_regex_type = type(re.compile(''))
78 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/44.0 (Chrome)',
79 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
80 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
81 'Accept-Encoding': 'gzip, deflate',
82 'Accept-Language': 'en-us,en;q=0.5',
# Full English month names, in calendar order (index 0 == January).
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April',
    'May', 'June', 'July', 'August',
    'September', 'October', 'November', 'December']
93 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
94 'flv', 'f4v', 'f4a', 'f4b',
95 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
105 'f4f', 'f4m', 'm3u8', 'smil')
107 # needed for sanitizing filenames in restricted mode
108 ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØŒÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøœùúûüýþÿ',
109 itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOO', ['OE'], 'UUUUYP', ['ss'],
110 'aaaaaa', ['ae'], 'ceeeeiiiionoooooo', ['oe'], 'uuuuypy')))
113 def preferredencoding():
114 """Get preferred encoding.
116 Returns the best encoding scheme for the system, based on
117 locale.getpreferredencoding() and some further tweaks.
120 pref = locale.getpreferredencoding()
128 def write_json_file(obj, fn):
129 """ Encode obj as JSON and write it to fn, atomically if possible """
131 fn = encodeFilename(fn)
132 if sys.version_info < (3, 0) and sys.platform != 'win32':
133 encoding = get_filesystem_encoding()
134 # os.path.basename returns a bytes object, but NamedTemporaryFile
135 # will fail if the filename contains non ascii characters unless we
136 # use a unicode object
137 path_basename = lambda f: os.path.basename(fn).decode(encoding)
138 # the same for os.path.dirname
139 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
141 path_basename = os.path.basename
142 path_dirname = os.path.dirname
146 'prefix': path_basename(fn) + '.',
147 'dir': path_dirname(fn),
151 # In Python 2.x, json.dump expects a bytestream.
152 # In Python 3.x, it writes to a character stream
153 if sys.version_info < (3, 0):
161 tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
166 if sys.platform == 'win32':
167 # Need to remove existing file on Windows, else os.rename raises
168 # WindowsError or FileExistsError.
173 os.rename(tf.name, fn)
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """Find the first element matching xpath[@key] (any value) or
        xpath[@key='val'] (exact value)."""
        # The key is restricted to a safe charset so it cannot break out
        # of the XPath expression built below.
        assert re.match(r'^[a-zA-Z_-]+$', key)
        if val is None:
            expr = '%s[@%s]' % (xpath, key)
        else:
            expr = "%s[@%s='%s']" % (xpath, key, val)
        return node.find(expr)
189 def find_xpath_attr(node, xpath, key, val=None):
190 for f in node.findall(compat_xpath(xpath)):
191 if key not in f.attrib:
193 if val is None or f.attrib.get(key) == val:
197 # On python2.6 the xml.etree.ElementTree.Element methods don't support
198 # the namespace parameter
201 def xpath_with_ns(path, ns_map):
202 components = [c.split(':') for c in path.split('/')]
206 replaced.append(c[0])
209 replaced.append('{%s}%s' % (ns_map[ns], tag))
210 return '/'.join(replaced)
213 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
214 def _find_xpath(xpath):
215 return node.find(compat_xpath(xpath))
217 if isinstance(xpath, (str, compat_str)):
218 n = _find_xpath(xpath)
226 if default is not NO_DEFAULT:
229 name = xpath if name is None else name
230 raise ExtractorError('Could not find XML element %s' % name)
236 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
237 n = xpath_element(node, xpath, name, fatal=fatal, default=default)
238 if n is None or n == default:
241 if default is not NO_DEFAULT:
244 name = xpath if name is None else name
245 raise ExtractorError('Could not find XML element\'s text %s' % name)
251 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
252 n = find_xpath_attr(node, xpath, key)
254 if default is not NO_DEFAULT:
257 name = '%s[@%s]' % (xpath, key) if name is None else name
258 raise ExtractorError('Could not find XML attribute %s' % name)
def get_element_by_id(id, html):
    """Return the content of the tag carrying the given ID in the passed HTML document."""
    # An ID lookup is simply an attribute lookup on 'id'.
    return get_element_by_attribute('id', id, html)
269 def get_element_by_attribute(attribute, value, html):
270 """Return the content of the tag with the specified attribute in the passed HTML document"""
272 m = re.search(r'''(?xs)
274 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
276 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
280 ''' % (re.escape(attribute), re.escape(value)), html)
284 res = m.group('content')
286 if res.startswith('"') or res.startswith("'"):
289 return unescapeHTML(res)
292 class HTMLAttributeParser(compat_HTMLParser):
293 """Trivial HTML parser to gather the attributes for a single element"""
296 compat_HTMLParser.__init__(self)
298 def handle_starttag(self, tag, attrs):
299 self.attrs = dict(attrs)
302 def extract_attributes(html_element):
303 """Given a string for an HTML element such as
305 a="foo" B="bar" c="&98;az" d=boz
306 empty= noval entity="&"
309 Decode and return a dictionary of attributes.
311 'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
312 'empty': '', 'noval': None, 'entity': '&',
313 'sq': '"', 'dq': '\''
315 NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
316 but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
318 parser = HTMLAttributeParser()
319 parser.feed(html_element)
324 def clean_html(html):
325 """Clean an HTML snippet into a readable string"""
327 if html is None: # Convenience for sanitizing descriptions etc.
331 html = html.replace('\n', ' ')
332 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
333 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
335 html = re.sub('<.*?>', '', html)
336 # Replace html entities
337 html = unescapeHTML(html)
341 def sanitize_open(filename, open_mode):
342 """Try to open the given filename, and slightly tweak it if this fails.
344 Attempts to open the given filename. If this fails, it tries to change
345 the filename slightly, step by step, until it's either able to open it
346 or it fails and raises a final exception, like the standard open()
349 It returns the tuple (stream, definitive_file_name).
353 if sys.platform == 'win32':
355 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
356 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
357 stream = open(encodeFilename(filename), open_mode)
358 return (stream, filename)
359 except (IOError, OSError) as err:
360 if err.errno in (errno.EACCES,):
363 # In case of error, try to remove win32 forbidden chars
364 alt_filename = sanitize_path(filename)
365 if alt_filename == filename:
368 # An exception here should be caught in the caller
369 stream = open(encodeFilename(alt_filename), open_mode)
370 return (stream, alt_filename)
373 def timeconvert(timestr):
374 """Convert RFC 2822 defined time string into system timestamp"""
376 timetuple = email.utils.parsedate_tz(timestr)
377 if timetuple is not None:
378 timestamp = email.utils.mktime_tz(timetuple)
382 def sanitize_filename(s, restricted=False, is_id=False):
383 """Sanitizes a string so it could be used as part of a filename.
384 If restricted is set, use a stricter subset of allowed characters.
385 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
387 def replace_insane(char):
388 if restricted and char in ACCENT_CHARS:
389 return ACCENT_CHARS[char]
390 if char == '?' or ord(char) < 32 or ord(char) == 127:
393 return '' if restricted else '\''
395 return '_-' if restricted else ' -'
396 elif char in '\\/|*<>':
398 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
400 if restricted and ord(char) > 127:
405 s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
406 result = ''.join(map(replace_insane, s))
408 while '__' in result:
409 result = result.replace('__', '_')
410 result = result.strip('_')
411 # Common case of "Foreign band name - English song title"
412 if restricted and result.startswith('-_'):
414 if result.startswith('-'):
415 result = '_' + result[len('-'):]
416 result = result.lstrip('.')
422 def sanitize_path(s):
423 """Sanitizes and normalizes path on Windows"""
424 if sys.platform != 'win32':
426 drive_or_unc, _ = os.path.splitdrive(s)
427 if sys.version_info < (2, 7) and not drive_or_unc:
428 drive_or_unc, _ = os.path.splitunc(s)
429 norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
433 path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part)
434 for path_part in norm_path]
436 sanitized_path.insert(0, drive_or_unc + os.path.sep)
437 return os.path.join(*sanitized_path)
def sanitize_url(url):
    """Prepend the `http:` scheme to protocol-relative URLs (those starting
    with `//`) in order to mitigate failures due to a missing protocol;
    all other URLs are returned unchanged."""
    if url.startswith('//'):
        return 'http:%s' % url
    return url
def sanitized_Request(url, *args, **kwargs):
    """Build a urllib Request, first running the URL through sanitize_url()."""
    fixed_url = sanitize_url(url)
    return compat_urllib_request.Request(fixed_url, *args, **kwargs)
450 def orderedSet(iterable):
451 """ Remove all duplicates from the input iterable """
459 def _htmlentity_transform(entity):
460 """Transforms an HTML entity to a character."""
461 # Known non-numeric HTML entity
462 if entity in compat_html_entities.name2codepoint:
463 return compat_chr(compat_html_entities.name2codepoint[entity])
465 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
467 numstr = mobj.group(1)
468 if numstr.startswith('x'):
470 numstr = '0%s' % numstr
473 # See https://github.com/rg3/youtube-dl/issues/7518
475 return compat_chr(int(numstr, base))
479 # Unknown entity in name, return its literal representation
480 return '&%s;' % entity
486 assert type(s) == compat_str
489 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
492 def get_subprocess_encoding():
493 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
494 # For subprocess calls, encode with locale encoding
495 # Refer to http://stackoverflow.com/a/9951851/35070
496 encoding = preferredencoding()
498 encoding = sys.getfilesystemencoding()
504 def encodeFilename(s, for_subprocess=False):
506 @param s The name of the file
509 assert type(s) == compat_str
511 # Python 3 has a Unicode API
512 if sys.version_info >= (3, 0):
515 # Pass '' directly to use Unicode APIs on Windows 2000 and up
516 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
517 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
518 if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
521 # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
522 if sys.platform.startswith('java'):
525 return s.encode(get_subprocess_encoding(), 'ignore')
528 def decodeFilename(b, for_subprocess=False):
530 if sys.version_info >= (3, 0):
533 if not isinstance(b, bytes):
536 return b.decode(get_subprocess_encoding(), 'ignore')
def encodeArgument(s):
    """Encode a subprocess command-line argument.

    Text input goes straight to encodeFilename() with the for_subprocess
    flag set; legacy byte-string input is first promoted to text via a
    strict ASCII decode.
    """
    if not isinstance(s, compat_str):
        # Legacy code still hands us byte strings; once all post processors
        # are fixed this should become a hard assertion instead:
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, True)
def decodeArgument(b):
    """Decode a subprocess argument; thin wrapper over decodeFilename()
    with the for_subprocess flag set."""
    return decodeFilename(b, True)
552 def decodeOption(optval):
555 if isinstance(optval, bytes):
556 optval = optval.decode(preferredencoding())
558 assert isinstance(optval, compat_str)
562 def formatSeconds(secs):
564 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
566 return '%d:%02d' % (secs // 60, secs % 60)
571 def make_HTTPS_handler(params, **kwargs):
572 opts_no_check_certificate = params.get('nocheckcertificate', False)
573 if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9
574 context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
575 if opts_no_check_certificate:
576 context.check_hostname = False
577 context.verify_mode = ssl.CERT_NONE
579 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
582 # (create_default_context present but HTTPSHandler has no context=)
585 if sys.version_info < (3, 2):
586 return YoutubeDLHTTPSHandler(params, **kwargs)
588 context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
589 context.verify_mode = (ssl.CERT_NONE
590 if opts_no_check_certificate
591 else ssl.CERT_REQUIRED)
592 context.set_default_verify_paths()
593 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
596 def bug_reports_message():
597 if ytdl_is_updateable():
598 update_cmd = 'type youtube-dl -U to update'
600 update_cmd = 'see https://yt-dl.org/update on how to update'
601 msg = '; please report this issue on https://yt-dl.org/bug .'
602 msg += ' Make sure you are using the latest version; %s.' % update_cmd
603 msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
607 class ExtractorError(Exception):
608 """Error during info extraction."""
610 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
611 """ tb, if given, is the original traceback (so that it can be printed out).
612 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
615 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
617 if video_id is not None:
618 msg = video_id + ': ' + msg
620 msg += ' (caused by %r)' % cause
622 msg += bug_reports_message()
623 super(ExtractorError, self).__init__(msg)
626 self.exc_info = sys.exc_info() # preserve original exception
628 self.video_id = video_id
630 def format_traceback(self):
631 if self.traceback is None:
633 return ''.join(traceback.format_tb(self.traceback))
class UnsupportedError(ExtractorError):
    """Raised when no extractor is able to handle the given URL."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
643 class RegexNotFoundError(ExtractorError):
644 """Error when a regex didn't match"""
class DownloadError(Exception):
    """Download Error exception.

    Thrown by FileDownloader objects when they are not configured to
    continue on errors; carries the human-readable error message.
    """

    def __init__(self, msg, exc_info=None):
        """msg: error text; exc_info: optional original exception that
        caused the trouble, as returned by sys.exc_info()."""
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info
class SameFileError(Exception):
    """Same File exception.

    Thrown by FileDownloader objects when they detect that multiple files
    would have to be downloaded to the same file on disk.
    """
671 class PostProcessingError(Exception):
672 """Post Processing exception.
674 This exception may be raised by PostProcessor's .run() method to
675 indicate an error in the postprocessing task.
678 def __init__(self, msg):
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    Thrown when a video is requested in a format that is not available
    for that video.
    """
class ContentTooShortError(Exception):
    """Content Too Short exception.

    Raised by FileDownloader objects when the file they download is smaller
    than what the server announced first, indicating the connection was
    probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Both counters are byte counts. Note: the parent constructor is
        # deliberately not invoked here, matching historical behavior.
        self.downloaded = downloaded
        self.expected = expected
710 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
711 # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
712 # expected HTTP responses to meet HTTP/1.0 or later (see also
713 # https://github.com/rg3/youtube-dl/issues/6727)
714 if sys.version_info < (3, 0):
715 kwargs[b'strict'] = True
716 hc = http_class(*args, **kwargs)
717 source_address = ydl_handler._params.get('source_address')
718 if source_address is not None:
719 sa = (source_address, 0)
720 if hasattr(hc, 'source_address'): # Python 2.7+
721 hc.source_address = sa
723 def _hc_connect(self, *args, **kwargs):
724 sock = compat_socket_create_connection(
725 (self.host, self.port), self.timeout, sa)
727 self.sock = ssl.wrap_socket(
728 sock, self.key_file, self.cert_file,
729 ssl_version=ssl.PROTOCOL_TLSv1)
732 hc.connect = functools.partial(_hc_connect, hc)
def handle_youtubedl_headers(headers):
    """Translate internal Youtubedl-* pseudo headers into real behavior.

    When 'Youtubedl-no-compression' is present, a new mapping is returned
    with that marker removed and any Accept-Encoding header (matched
    case-insensitively) dropped; otherwise the original mapping is
    returned unchanged.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers
    result = dict((k, v) for k, v in headers.items() if k.lower() != 'accept-encoding')
    del result['Youtubedl-no-compression']
    return result
747 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
748 """Handler for HTTP requests and responses.
750 This class, when installed with an OpenerDirector, automatically adds
751 the standard headers to every HTTP request and handles gzipped and
752 deflated responses from web servers. If compression is to be avoided in
753 a particular request, the original request in the program code only has
754 to include the HTTP header "Youtubedl-no-compression", which will be
755 removed before making the real request.
757 Part of this code was copied from:
759 http://techknack.net/python-urllib2-handlers/
761 Andrew Rowls, the author of that code, agreed to release it to the
765 def __init__(self, params, *args, **kwargs):
766 compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
767 self._params = params
769 def http_open(self, req):
770 conn_class = compat_http_client.HTTPConnection
772 socks_proxy = req.headers.get('Ytdl-socks-proxy')
774 conn_class = make_socks_conn_class(conn_class, socks_proxy)
775 del req.headers['Ytdl-socks-proxy']
777 return self.do_open(functools.partial(
778 _create_http_connection, self, conn_class, False),
784 return zlib.decompress(data, -zlib.MAX_WBITS)
786 return zlib.decompress(data)
789 def addinfourl_wrapper(stream, headers, url, code):
790 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
791 return compat_urllib_request.addinfourl(stream, headers, url, code)
792 ret = compat_urllib_request.addinfourl(stream, headers, url)
796 def http_request(self, req):
797 # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
798 # always respected by websites, some tend to give out URLs with non percent-encoded
799 # non-ASCII characters (see telemb.py, ard.py [#3412])
800 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
801 # To work around aforementioned issue we will replace request's original URL with
802 # percent-encoded one
803 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
804 # the code of this workaround has been moved here from YoutubeDL.urlopen()
805 url = req.get_full_url()
806 url_escaped = escape_url(url)
808 # Substitute URL if any change after escaping
809 if url != url_escaped:
810 req = update_Request(req, url=url_escaped)
812 for h, v in std_headers.items():
813 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
814 # The dict keys are capitalized because of this bug by urllib
815 if h.capitalize() not in req.headers:
818 req.headers = handle_youtubedl_headers(req.headers)
820 if sys.version_info < (2, 7) and '#' in req.get_full_url():
821 # Python 2.6 is brain-dead when it comes to fragments
822 req._Request__original = req._Request__original.partition('#')[0]
823 req._Request__r_type = req._Request__r_type.partition('#')[0]
827 def http_response(self, req, resp):
830 if resp.headers.get('Content-encoding', '') == 'gzip':
831 content = resp.read()
832 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
834 uncompressed = io.BytesIO(gz.read())
835 except IOError as original_ioerror:
836 # There may be junk add the end of the file
837 # See http://stackoverflow.com/q/4928560/35070 for details
838 for i in range(1, 1024):
840 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
841 uncompressed = io.BytesIO(gz.read())
846 raise original_ioerror
847 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
848 resp.msg = old_resp.msg
849 del resp.headers['Content-encoding']
851 if resp.headers.get('Content-encoding', '') == 'deflate':
852 gz = io.BytesIO(self.deflate(resp.read()))
853 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
854 resp.msg = old_resp.msg
855 del resp.headers['Content-encoding']
856 # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
857 # https://github.com/rg3/youtube-dl/issues/6457).
858 if 300 <= resp.code < 400:
859 location = resp.headers.get('Location')
861 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
862 if sys.version_info >= (3, 0):
863 location = location.encode('iso-8859-1').decode('utf-8')
864 location_escaped = escape_url(location)
865 if location != location_escaped:
866 del resp.headers['Location']
867 resp.headers['Location'] = location_escaped
870 https_request = http_request
871 https_response = http_response
874 def make_socks_conn_class(base_class, socks_proxy):
875 assert issubclass(base_class, (
876 compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))
878 url_components = compat_urlparse.urlparse(socks_proxy)
879 if url_components.scheme.lower() == 'socks5':
880 socks_type = ProxyType.SOCKS5
881 elif url_components.scheme.lower() in ('socks', 'socks4'):
882 socks_type = ProxyType.SOCKS4
883 elif url_components.scheme.lower() == 'socks4a':
884 socks_type = ProxyType.SOCKS4A
886 def unquote_if_non_empty(s):
889 return compat_urllib_parse_unquote_plus(s)
893 url_components.hostname, url_components.port or 1080,
895 unquote_if_non_empty(url_components.username),
896 unquote_if_non_empty(url_components.password),
899 class SocksConnection(base_class):
901 self.sock = sockssocket()
902 self.sock.setproxy(*proxy_args)
903 if type(self.timeout) in (int, float):
904 self.sock.settimeout(self.timeout)
905 self.sock.connect((self.host, self.port))
907 if isinstance(self, compat_http_client.HTTPSConnection):
908 if hasattr(self, '_context'): # Python > 2.6
909 self.sock = self._context.wrap_socket(
910 self.sock, server_hostname=self.host)
912 self.sock = ssl.wrap_socket(self.sock)
914 return SocksConnection
917 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
918 def __init__(self, params, https_conn_class=None, *args, **kwargs):
919 compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
920 self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
921 self._params = params
923 def https_open(self, req):
925 conn_class = self._https_conn_class
927 if hasattr(self, '_context'): # python > 2.6
928 kwargs['context'] = self._context
929 if hasattr(self, '_check_hostname'): # python 3.x
930 kwargs['check_hostname'] = self._check_hostname
932 socks_proxy = req.headers.get('Ytdl-socks-proxy')
934 conn_class = make_socks_conn_class(conn_class, socks_proxy)
935 del req.headers['Ytdl-socks-proxy']
937 return self.do_open(functools.partial(
938 _create_http_connection, self, conn_class, True),
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    def __init__(self, cookiejar=None):
        # Explicit parent call (not super()) for Python 2 old-style
        # handler-class compatibility.
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on the next HTTP request in a row if there are
        # non-ASCII characters in the Set-Cookie HTTP header of the last
        # response (see https://github.com/rg3/youtube-dl/issues/6769).
        # A percent-encoding workaround used to live here but is currently
        # disabled; the response is forwarded to the parent as-is:
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
966 def parse_iso8601(date_str, delimiter='T', timezone=None):
967 """ Return a UNIX timestamp from the given date """
972 date_str = re.sub(r'\.[0-9]+', '', date_str)
976 r'(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
979 timezone = datetime.timedelta()
981 date_str = date_str[:-len(m.group(0))]
982 if not m.group('sign'):
983 timezone = datetime.timedelta()
985 sign = 1 if m.group('sign') == '+' else -1
986 timezone = datetime.timedelta(
987 hours=sign * int(m.group('hours')),
988 minutes=sign * int(m.group('minutes')))
990 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
991 dt = datetime.datetime.strptime(date_str, date_format) - timezone
992 return calendar.timegm(dt.timetuple())
997 def unified_strdate(date_str, day_first=True):
998 """Return a string with the date in the format YYYYMMDD"""
1000 if date_str is None:
1004 date_str = date_str.replace(',', ' ')
1005 # %z (UTC offset) is only supported in python>=3.2
1006 if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
1007 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
1008 # Remove AM/PM + timezone
1009 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1011 format_expressions = [
1022 '%Y/%m/%d %H:%M:%S',
1023 '%Y-%m-%d %H:%M:%S',
1024 '%Y-%m-%d %H:%M:%S.%f',
1027 '%Y-%m-%dT%H:%M:%SZ',
1028 '%Y-%m-%dT%H:%M:%S.%fZ',
1029 '%Y-%m-%dT%H:%M:%S.%f0Z',
1030 '%Y-%m-%dT%H:%M:%S',
1031 '%Y-%m-%dT%H:%M:%S.%f',
1035 format_expressions.extend([
1040 '%d/%m/%Y %H:%M:%S',
1043 format_expressions.extend([
1048 '%m/%d/%Y %H:%M:%S',
1050 for expression in format_expressions:
1052 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
1055 if upload_date is None:
1056 timetuple = email.utils.parsedate_tz(date_str)
1058 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
1059 if upload_date is not None:
1060 return compat_str(upload_date)
1063 def determine_ext(url, default_ext='unknown_video'):
1066 guess = url.partition('?')[0].rpartition('.')[2]
1067 if re.match(r'^[A-Za-z0-9]+$', guess):
1069 # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
1070 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
1071 return guess.rstrip('/')
def subtitles_filename(filename, sub_lang, sub_format):
    """Derive a subtitle file name: strip the media extension (if any),
    then append '.<language>.<subtitle format>'."""
    stem = filename.rsplit('.', 1)[0]
    return '%s.%s.%s' % (stem, sub_lang, sub_format)
1080 def date_from_str(date_str):
1082 Return a datetime object from a string in the format YYYYMMDD or
1083 (now|today)[+-][0-9](day|week|month|year)(s)?"""
1084 today = datetime.date.today()
1085 if date_str in ('now', 'today'):
1087 if date_str == 'yesterday':
1088 return today - datetime.timedelta(days=1)
1089 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
1090 if match is not None:
1091 sign = match.group('sign')
1092 time = int(match.group('time'))
1095 unit = match.group('unit')
1096 # A bad approximation?
1100 elif unit == 'year':
1104 delta = datetime.timedelta(**{unit: time})
1105 return today + delta
1106 return datetime.datetime.strptime(date_str, '%Y%m%d').date()
1109 def hyphenate_date(date_str):
1111 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1112 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1113 if match is not None:
1114 return '-'.join(match.groups())
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date_from_str;
        None means the minimum/maximum representable date respectively."""
        self.start = date_from_str(start) if start is not None else datetime.datetime.min.date()
        self.end = date_from_str(end) if end is not None else datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        # Some Python 2 setups hand back bytes; promote to text.
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
1160 def _windows_write_string(s, out):
1161 """ Returns True if the string was written using special methods,
1162 False if it has yet to be written out."""
1163 # Adapted from http://stackoverflow.com/a/3259271/35070
1166 import ctypes.wintypes
1174 fileno = out.fileno()
1175 except AttributeError:
1176 # If the output stream doesn't have a fileno, it's virtual
1178 except io.UnsupportedOperation:
1179 # Some strange Windows pseudo files?
1181 if fileno not in WIN_OUTPUT_IDS:
1184 GetStdHandle = ctypes.WINFUNCTYPE(
1185 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
1186 (b'GetStdHandle', ctypes.windll.kernel32))
1187 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
1189 WriteConsoleW = ctypes.WINFUNCTYPE(
1190 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
1191 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
1192 ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
1193 written = ctypes.wintypes.DWORD(0)
1195 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
1196 FILE_TYPE_CHAR = 0x0002
1197 FILE_TYPE_REMOTE = 0x8000
1198 GetConsoleMode = ctypes.WINFUNCTYPE(
1199 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
1200 ctypes.POINTER(ctypes.wintypes.DWORD))(
1201 (b'GetConsoleMode', ctypes.windll.kernel32))
1202 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
1204 def not_a_console(handle):
1205 if handle == INVALID_HANDLE_VALUE or handle is None:
1207 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
1208 GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
1210 if not_a_console(h):
1213 def next_nonbmp_pos(s):
1215 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
1216 except StopIteration:
1220 count = min(next_nonbmp_pos(s), 1024)
1222 ret = WriteConsoleW(
1223 h, s, count if count else 2, ctypes.byref(written), None)
1225 raise OSError('Failed to write string')
1226 if not count: # We just wrote a non-BMP character
1227 assert written.value == 2
1230 assert written.value > 0
1231 s = s[written.value:]
def write_string(s, out=None, encoding=None):
    # Write the unicode string `s` to the stream `out`, working around
    # Windows-console and Python 2 byte/text stream quirks.
    # NOTE(review): several lines appear to be missing from this copy
    # (the `out is None` default, the plain-text fallback write and the
    # final flush) — compare against upstream before editing.
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        # On Windows try WriteConsoleW first so non-ANSI characters survive.
        if _windows_write_string(s, out):

    if ('b' in getattr(out, 'mode', '') or
            sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        # Byte stream: encode with the requested or preferred encoding.
        byt = s.encode(encoding or preferredencoding(), 'ignore')
    elif hasattr(out, 'buffer'):
        # Text stream with an underlying binary buffer: write bytes to it.
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        byt = s.encode(enc, 'ignore')
        out.buffer.write(byt)
def bytes_to_intlist(bs):
    # Convert a bytes (Py3) or str (Py2) object into a list of int byte values.
    # NOTE(review): the empty-input guard and the Python 3 branch body
    # (`return list(bs)`) appear to be missing from this copy.
    if isinstance(bs[0], int):  # Python 3
        # Python 2 path: each element is a 1-char str, so use ord().
        return [ord(c) for c in bs]
def intlist_to_bytes(xs):
    """Pack a sequence of byte-sized integers into a bytes object."""
    fmt = '%dB' % len(xs)
    return compat_struct_pack(fmt, *xs)
# Cross-platform file locking
if sys.platform == 'win32':
    import ctypes.wintypes

    class OVERLAPPED(ctypes.Structure):
        # Win32 OVERLAPPED structure handed to LockFileEx/UnlockFileEx.
        # NOTE(review): the `_fields_ = [` opening and its closing `]`
        # appear to be missing from this copy.
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    # NOTE(review): the closing `]` of this argtypes list appears to be missing.
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    # NOTE(review): the closing `]` of this argtypes list appears to be missing.
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Lock the whole 0..0x7fffffffffffffff byte range of the file.
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        # Acquire a shared (read) or exclusive (write) lock on f via Win32.
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Keep the pointer alive on the file object for the later unlock.
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # 0x2 == LOCKFILE_EXCLUSIVE_LOCK
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        # Release the lock taken by _lock_file on the same range.
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

    # Some platforms, such as Jython, is missing fcntl
    # NOTE(review): the `else:` / `try: import fcntl` wrapper around the
    # following POSIX implementations appears to be missing from this copy.
    def _lock_file(f, exclusive):
        # POSIX: flock with LOCK_EX (writers) or LOCK_SH (readers).
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        fcntl.flock(f, fcntl.LOCK_UN)

    # Fallback used when fcntl is unavailable: locking always fails loudly.
    # NOTE(review): the `except ImportError:` introducing this fallback
    # appears to be missing from this copy.
    UNSUPPORTED_MSG = 'file locking is not supported on this platform'

    def _lock_file(f, exclusive):
        raise IOError(UNSUPPORTED_MSG)

    def _unlock_file(f):
        raise IOError(UNSUPPORTED_MSG)
class locked_file(object):
    # Context-manager wrapper around io.open() that holds an OS-level
    # file lock (via _lock_file/_unlock_file) for the `with` block.

    def __init__(self, filename, mode, encoding=None):
        # Only plain read/append/write text modes are supported.
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        # NOTE(review): `self.mode = mode` appears to be missing from this
        # copy; __enter__ below reads self.mode.

    def __enter__(self):
        # Any mode other than plain read needs an exclusive lock.
        exclusive = self.mode != 'r'
        # NOTE(review): the try/except that closes the file when locking
        # fails, and the `return self`, appear to be missing here.
        _lock_file(self.f, exclusive)

    def __exit__(self, etype, value, traceback):
        # NOTE(review): the try/finally that closes the file after
        # unlocking appears to be missing here.
        _unlock_file(self.f)

    def write(self, *args):
        # Delegate writes to the underlying file object.
        return self.f.write(*args)

    def read(self, *args):
        # Delegate reads to the underlying file object.
        return self.f.read(*args)
def get_filesystem_encoding():
    """Return the filesystem encoding, defaulting to UTF-8 when unset."""
    enc = sys.getfilesystemencoding()
    if enc is None:
        return 'utf-8'
    return enc
def shell_quote(args):
    # Join a list of arguments into a single shell-escaped command string.
    # NOTE(review): the `quoted_args = []` initialisation and the
    # `for a in args:` loop header appear to be missing from this copy.
    encoding = get_filesystem_encoding()
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(pipes.quote(a))
    return ' '.join(quoted_args)
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """

    payload = {'__youtubedl_smuggle': json.dumps(data)}
    sdata = compat_urllib_parse_urlencode(payload)
    return '%s#%s' % (url, sdata)
def unsmuggle_url(smug_url, default=None):
    """Extract data smuggled into a URL by smuggle_url().

    Returns a (url, data) pair; when nothing was smuggled, returns
    (smug_url, default).
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    data = json.loads(jsond)
    # Fix: the function previously fell off the end after parsing the
    # payload, implicitly returning None instead of the (url, data) pair.
    return url, data
def format_bytes(bytes):
    """Format a byte count as a human-readable string, e.g. '1.00KiB'.

    Accepts int, float or numeric str; returns 'N/A' for None.
    (Parameter name shadows the builtin but is kept for interface
    compatibility.)
    """
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    if bytes == 0.0:
        # Fix: guard zero explicitly — math.log(0) raises ValueError.
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
    suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
def lookup_unit_table(unit_table, s):
    """Parse a '<number> <unit>' string using the multipliers in unit_table.

    Returns the integer value, or None when s does not match.
    """
    units_re = '|'.join(re.escape(u) for u in unit_table)
    # Fix: the match result was never assigned/checked, so `m` below was
    # undefined and a non-matching string could not return None.
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
    if not m:
        return None
    # Accept ',' as a decimal separator as well as '.'.
    num_str = m.group('num').replace(',', '.')
    mult = unit_table[m.group('unit')]
    return int(float(num_str) * mult)
def parse_filesize(s):
    # Parse a human filesize string (e.g. '10.5MB') into a byte count.
    # NOTE(review): the None guard and the full _UNIT_TABLE definition
    # appear to be missing from this copy.
    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    return lookup_unit_table(_UNIT_TABLE, s)
    # NOTE(review): the lines below belong to a separate counter-parsing
    # helper (plain digits -> str_to_int, otherwise its own unit table)
    # whose `def` line is missing from this copy.
    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)
    return lookup_unit_table(_UNIT_TABLE, s)
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """
    try:
        return ENGLISH_MONTH_NAMES.index(name) + 1
    except ValueError:
        # Fix: an unknown month name previously raised ValueError out of
        # list.index(); return None instead so callers can fall back.
        return None
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        three-letter abbreviation """
    try:
        return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
    except ValueError:
        # Fix: an unknown abbreviation previously raised ValueError out
        # of list.index(); return None instead so callers can fall back.
        return None
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    # Fix: the substitution result was never returned; also the docstring
    # had lost its '&amp;'. The negative lookahead leaves already-valid
    # entities and character references untouched.
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
        '&amp;',
        xml_str)
def setproctitle(title):
    # Best-effort: set the process title visible in ps/top via
    # prctl(PR_SET_NAME) on glibc systems; silently no-op elsewhere.
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        # NOTE(review): the `return` terminating the Jython bail-out, and
        # the try/except around LoadLibrary, appear to be missing here.

    libc = ctypes.cdll.LoadLibrary('libc.so.6')
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
    # 15 is PR_SET_NAME on Linux.
    # NOTE(review): the `try:` matching the `except` below appears to be
    # missing from this copy.
    libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
def remove_start(s, start):
    """Return s without the leading `start` prefix (s unchanged if absent)."""
    if s.startswith(start):
        return s[len(start):]
    # Fix: previously the no-prefix case fell off the end and returned
    # None instead of the original string.
    return s
def remove_end(s, end):
    """Return s without the trailing `end` suffix (s unchanged if absent)."""
    # Fix: the suffix check and fallback return were missing; also guard
    # against an empty suffix, for which s[:-0] would wrongly yield ''.
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
def remove_quotes(s):
    """Strip one matching pair of surrounding single or double quotes.

    Returns s unchanged when it is None, too short, or not quoted.
    """
    if s is None or len(s) < 2:
        return s
    for quote in ('"', "'", ):
        if s[0] == quote and s[-1] == quote:
            # Fix: the stripped value was never returned, nor was the
            # unquoted fallback; both paths returned None.
            return s[1:-1]
    return s
def url_basename(url):
    """Return the last path component of a URL ('' when there is none)."""
    parsed_path = compat_urlparse.urlparse(url).path
    segments = parsed_path.strip('/').split('/')
    return segments[-1]
class HEADRequest(compat_urllib_request.Request):
    """A urllib Request that issues HTTP HEAD instead of GET."""

    def get_method(self):
        # Fix: as written, get_method had no return statement and yielded
        # None; a HEAD request must report 'HEAD'.
        return 'HEAD'
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Coerce v to an int (optionally first via getattr(v, get_attr)),
    multiplied by invscale and floor-divided by scale.

    Returns `default` for None, empty string or unparseable input.
    """
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    # Fix: empty strings and None previously reached int() unguarded and
    # raised; treat them as "no value".
    if v == '':
        v = None
    if v is None:
        return default
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError):
        return default
def str_or_none(v, default=None):
    """Return v as a string, or `default` when v is None."""
    if v is None:
        return default
    return compat_str(v)
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    # Fix: None input and the final conversion/return were missing; the
    # cleaned string was computed but never converted.
    if int_str is None:
        return None
    # Drop thousands separators and an optional leading '+'.
    int_str = re.sub(r'[,\.\+]', '', int_str)
    return int(int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Coerce v to float, multiplied by invscale and divided by scale.

    Returns `default` for None or unparseable input.
    """
    # Fix: None and non-numeric values previously reached float()
    # unguarded and raised instead of yielding the default.
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        return default
def parse_duration(s):
    # Parse a duration string ('1:23:45', '3 min', '1d 2h 3s', ...) into
    # a float number of seconds.
    if not isinstance(s, compat_basestring):
        # NOTE(review): the `return None` for non-strings and the
        # `s = s.strip()` normalisation appear to be missing here.

    days, hours, mins, secs, ms = [None] * 5
    # Colon-separated form: [[[DD:]HH:]MM:]SS[.ms]
    m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?$', s)
    # NOTE(review): the `if m:` guard appears to be missing here.
        days, hours, mins, secs, ms = m.groups()
    # Verbose form: '1d 2h 3min 4.5s' — NOTE(review): the `else:` branch
    # and the surrounding verbose-regex scaffolding appear to be missing.
                (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
                (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
            days, hours, mins, secs, ms = m.groups()
    # Fallback form: '2.5 hours' / '3 mins'.
    m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)$', s)
        hours, mins = m.groups()
    # Accumulate each captured component into seconds.
    # NOTE(review): the `duration = 0` initialisation, the per-component
    # `if` guards and the final `return duration` appear to be missing.
        duration += float(secs)
        duration += float(mins) * 60
        duration += float(hours) * 60 * 60
        duration += float(days) * 24 * 60 * 60
        duration += float(ms)
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert `ext` before the file's real extension
    ('video.mp4' + 'temp' -> 'video.temp.mp4').

    When expected_real_ext is given and the real extension differs,
    `ext` is appended after the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    # Fix: the conditional expression was left dangling without a
    # `return (` — the function produced no value.
    return (
        '{0}.{1}{2}'.format(name, ext, real_ext)
        if not expected_real_ext or real_ext[1:] == expected_real_ext
        else '{0}.{1}'.format(filename, ext))
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the filename's extension with `ext`.

    When expected_real_ext is given and doesn't match the current
    extension, `ext` is appended to the full filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    # Fix: the format() call was unterminated — the trailing `ext`
    # argument and closing parenthesis were missing.
    return '{0}.{1}'.format(
        name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
        ext)
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    # NOTE(review): the try/except OSError wrapper and the
    # `return False` / `return exe` results appear to be missing from
    # this copy — as written the probe result is discarded.
    subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    # NOTE(review): the try/except OSError returning False when the
    # binary cannot be spawned appears to be missing from this copy.
    out, _ = subprocess.Popen(
        [encodeArgument(exe)] + args,
        stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from an executable's --version output.

    Returns the matched version, or `unrecognized` when version_re
    does not match.
    """
    assert isinstance(output, compat_str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
    # Fix: both result paths were missing — the search outcome was never
    # returned.
    if m:
        return m.group(1)
    else:
        return unrecognized
class PagedList(object):
    # Abstract base for lazily-paged result lists; subclasses implement
    # getslice(start, end).
    # NOTE(review): the `def __len__(self):` line appears to be missing
    # from this copy; the statements below belong to it.
        # This is only useful for tests
        return len(self.getslice())
class OnDemandPagedList(PagedList):
    # PagedList that fetches pages on demand via pagefunc(pagenum),
    # optionally caching fetched pages.

    def __init__(self, pagefunc, pagesize, use_cache=False):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache
        # NOTE(review): the conditional `self._cache = {}` initialisation
        # appears to be missing from this copy.

    def getslice(self, start=0, end=None):
        # Collect results[start:end] by walking pages from the one
        # containing `start`.
        # NOTE(review): the `res = []` initialisation appears to be
        # missing from this copy.
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                # NOTE(review): a `continue` and the `if self._use_cache:`
                # guard appear to be missing here.

            page_results = self._cache.get(pagenum)
            if page_results is None:
                page_results = list(self._pagefunc(pagenum))
                self._cache[pagenum] = page_results

            # Offsets of the requested range within this page.
            # NOTE(review): the `startv = (` / `endv = (` assignment
            # openers around these conditional expressions appear to be
            # missing from this copy.
                start % self._pagesize
                if firstid <= start < nextfirstid
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                # NOTE(review): a `break` appears to be missing here.

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                # NOTE(review): the `break` and the final `return res`
                # appear to be missing from this copy.
class InAdvancePagedList(PagedList):
    # PagedList for sources where the total page count is known up front.

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        # NOTE(review): the `res = []` initialisation appears to be
        # missing from this copy.
        start_page = start // self._pagesize
        # Last page to visit: all pages, or the one containing `end`.
        # NOTE(review): the `end_page = min(` opener appears to be
        # missing from this copy.
            self._pagecount if end is None else (end // self._pagesize + 1))
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            # NOTE(review): the `if skip_elems:` guard appears to be
            # missing here.
            page = page[skip_elems:]
            # NOTE(review): `skip_elems = None/0` reset appears to be
            # missing here.
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                # NOTE(review): the `else:` branch truncating the page and
                # the trailing `res.extend(page)` / `break` / `return res`
                # appear to be missing from this copy.
                    page = page[:only_more]
def uppercase_escape(s):
    """Decode '\\UXXXXXXXX' escape sequences embedded in s."""
    unicode_escape = codecs.getdecoder('unicode_escape')
    # Fix: the substitution was never returned — the `return re.sub(`
    # opener and the trailing `s)` argument were missing.
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: unicode_escape(m.group(0))[0],
        s)
def lowercase_escape(s):
    """Decode '\\uXXXX' escape sequences embedded in s."""
    unicode_escape = codecs.getdecoder('unicode_escape')
    # Fix: the substitution was never returned — the `return re.sub(`
    # opener and the trailing `s)` argument were missing.
    return re.sub(
        r'\\u[0-9a-fA-F]{4}',
        lambda m: unicode_escape(m.group(0))[0],
        s)
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # Python 2 quote() chokes on unicode input, so pre-encode it there.
    if isinstance(s, compat_str) and sys.version_info < (3, 0):
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    url_parsed = compat_urllib_parse_urlparse(url)
    # Fix: the expression was unterminated — the closing `).geturl()`
    # that re-assembles the escaped URL was missing.
    return url_parsed._replace(
        # IDNA-encode the host, percent-escape every other component.
        netloc=url_parsed.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(url_parsed.path),
        params=escape_rfc3986(url_parsed.params),
        query=escape_rfc3986(url_parsed.query),
        fragment=escape_rfc3986(url_parsed.fragment)
    ).geturl()
def read_batch_urls(batch_fd):
    # Read a batch file object and return its non-comment, non-empty URLs.
    # NOTE(review): the inner `def fixup(url):` line, the `url = url.strip()`
    # normalisation and the `return False` / `return url` results appear
    # to be missing from this copy.
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # Strip a UTF-8 BOM that survived decoding as individual chars.
        BOM_UTF8 = '\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        # Lines starting with these characters are comments/sections.
        if url.startswith(('#', ';', ']')):

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes."""
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
def update_url_query(url, query):
    """Return `url` with the key/value pairs of `query` merged into its
    query string (existing keys are overwritten)."""
    if not query:
        return url
    parsed_url = compat_urlparse.urlparse(url)
    qs = compat_parse_qs(parsed_url.query)
    # Fix: the `query` argument was parsed but never applied — without
    # this update the function returned the URL unchanged.
    qs.update(query)
    return compat_urlparse.urlunparse(parsed_url._replace(
        query=compat_urllib_parse_urlencode(qs, True)))
def update_Request(req, url=None, data=None, headers={}, query={}):
    """Clone a urllib Request, optionally overriding URL, data, headers
    and query parameters. (The mutable defaults are read-only here and
    kept for interface compatibility.)"""
    req_headers = req.headers.copy()
    req_headers.update(headers)
    req_data = data or req.data
    req_url = update_url_query(url or req.get_full_url(), query)
    # Preserve the HEAD method when cloning a HEADRequest.
    req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
    # Fix: the clone was never constructed nor returned — the
    # `new_req = req_type(` opener and `return new_req` were missing.
    new_req = req_type(
        req_url, data=req_data, headers=req_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Look up one key, or the first usable key of a list/tuple, in d.

    Missing/None values are skipped; falsy values are skipped too unless
    skip_false_values is False. Returns `default` when nothing matches.
    """
    if isinstance(key_or_keys, (list, tuple)):
        for key in key_or_keys:
            if key not in d or d[key] is None or skip_false_values and not d[key]:
                continue
            # Fix: the loop had no success path — the matched value and
            # the exhausted-keys default were never returned.
            return d[key]
        return default
    return d.get(key_or_keys, default)
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return `string` unchanged if already text, else decode it.

    Note: the default encoding is evaluated once at definition time.
    """
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
def parse_age_limit(s):
    """Parse an age limit like '18' or '16+' into an int; fall back to
    the US_RATINGS table for rating strings. Returns None for None."""
    # Fix: None input previously reached re.match() and raised TypeError.
    if s is None:
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    return int(m.group('age')) if m else US_RATINGS.get(s)
def strip_jsonp(code):
    """Strip a JSONP callback wrapper, returning the bare JSON payload."""
    # Fix: the `return re.sub(` opener was missing — the stripped payload
    # was never produced.
    return re.sub(
        r'(?s)^[a-zA-Z0-9_.]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
def js_to_json(code):
    # Convert JavaScript object notation to strict JSON.
    # NOTE(review): the inner `def fix_kv(m):` line and its
    # `v = m.group(0)` opener appear to be missing from this copy.
        if v in ('true', 'false', 'null'):
            # NOTE(review): `return v` appears to be missing here.
        elif v.startswith('/*') or v == ',':
            # Comments and trailing commas are dropped.
        if v[0] in ("'", '"'):
            # Re-escape the string body for JSON.
            # NOTE(review): the escape-mapping dict entries between these
            # two lines appear to be missing from this copy.
            v = re.sub(r'(?s)\\.|"', lambda m: {
            }.get(m.group(0), m.group(0)), v[1:-1])

        # Bare hex/octal integers become decimal (or quoted object keys).
        # NOTE(review): the `INTEGER_TABLE = (` opener and the octal entry
        # appear to be missing from this copy.
            (r'^0[xX][0-9a-fA-F]+', 16),

        for regex, base in INTEGER_TABLE:
            im = re.match(regex, v)
            # NOTE(review): the `if im:` guard appears to be missing here.
                i = int(im.group(0), base)
                return '"%d":' % i if v.endswith(':') else '%d' % i

    # NOTE(review): the closing `''', fix_kv, code)` of this substitution
    # appears to be missing from this copy.
    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        /\*.*?\*/|,(?=\s*[\]}])|
        [a-zA-Z_][.a-zA-Z_0-9]*|
        (?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:\s*:)?|
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    # Fix: the inner comparator and the closure return were missing —
    # the function produced nothing usable.
    def q(qid):
        try:
            return quality_ids.index(qid)
        except ValueError:
            # Unknown qualities sort below every known one.
            return -1
    return q
# Default output filename template: "<title>-<id>.<ext>".
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    # Fix: the None guard, the ELLIPSES constant, the length check and
    # the short-string fallback were all missing from this copy.
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) > length:
        return s[:length - len(ELLIPSES)] + ELLIPSES
    return s
def version_tuple(v):
    """Split a dot/dash separated version string into a tuple of ints."""
    parts = re.split(r'[-.]', v)
    return tuple(map(int, parts))
def is_outdated_version(version, limit, assume_new=True):
    """Compare two version strings; unparseable/missing versions are
    treated as new (or old, when assume_new is False)."""
    # Fix: the guards around the two fallback returns and the try/except
    # around the comparison were missing — the comparison was unreachable.
    if not version:
        return not assume_new
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return not assume_new
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # Updateable when running from a zip bundle or a frozen executable.
    if isinstance(globals().get('__loader__'), zipimporter):
        return True
    return hasattr(sys, 'frozen')
def args_to_str(args):
    # Get a short string representation for a subprocess command
    quoted = [compat_shlex_quote(a) for a in args]
    return ' '.join(quoted)
def error_to_compat_str(err):
    """Stringify an exception, decoding Python 2 byte messages properly."""
    # Fix: `err_str` was never initialised nor returned in this copy.
    err_str = str(err)
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    if sys.version_info[0] < 3:
        err_str = err_str.decode(preferredencoding())
    return err_str
def mimetype2ext(mt):
    # Map a MIME type to a conventional file extension.
    # NOTE(review): the None guard and the full-MIME-type lookup table
    # that precede this appear to be missing from this copy.
    _, _, res = mt.rpartition('/')
    # Subtype-level overrides; anything unknown falls through unchanged.
    # NOTE(review): the `return {` opener, most table entries and the
    # closing `}.get(res, res)` appear to be missing from this copy.
        'smptett+xml': 'tt',
        'x-mp4-fragmented': 'mp4',
def urlhandle_detect_ext(url_handle):
    # Guess a file extension for a response: prefer the attachment
    # filename from Content-Disposition, else map the Content-Type.
    getheader = url_handle.headers.get

    cd = getheader('Content-Disposition')
    # NOTE(review): the `if cd:` / `if m:` / `if e:` guards and the
    # `return e` for a successful filename match appear to be missing
    # from this copy.
    m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
    e = determine_ext(m.group('filename'), default_ext=None)

    return mimetype2ext(getheader('Content-Type'))
def encode_data_uri(data, mime_type):
    """Build an RFC 2397 data: URI from raw bytes and a MIME type."""
    b64 = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, b64)
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """

    if age_limit is None:  # No limit set
        # Fix: this early `return False` was missing, so an unset age
        # limit fell through and compared None with `<`.
        return False
    if content_limit is None:
        return False  # Content available for everyone
    return age_limit < content_limit
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    # Fix: the `BOMS = [` assignment, the loop `break` and the `else:`
    # fallback decode were missing, leaving the BOM entries dangling.
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            s = first_bytes[len(bom):].decode(enc, 'replace')
            break
    else:
        s = first_bytes.decode('utf-8', 'replace')

    # HTML if the first non-whitespace character is '<'.
    return re.match(r'^\s*<', s)
def determine_protocol(info_dict):
    # Work out the download protocol for a format dict, preferring an
    # explicit 'protocol' key, then URL scheme heuristics.
    protocol = info_dict.get('protocol')
    if protocol is not None:
        # NOTE(review): `return protocol` appears to be missing here.

    url = info_dict['url']
    if url.startswith('rtmp'):
        # NOTE(review): the `return 'rtmp'` / `return 'mms'` /
        # `return 'rtsp'` results appear to be missing from this copy.
    elif url.startswith('mms'):
    elif url.startswith('rtsp'):

    ext = determine_ext(url)
    # NOTE(review): the m3u8/f4m extension handling appears to be missing
    # from this copy; unmatched URLs fall through to the scheme.
    return compat_urllib_parse_urlparse(url).scheme
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    rows = [header_row] + data
    # Column width = widest cell in that column.
    widths = [max(len(compat_str(cell)) for cell in column)
              for column in zip(*rows)]
    # Left-align every column but let the last one run free.
    fmt = ' '.join('%-' + compat_str(w + 1) + 's' for w in widths[:-1]) + '%s'
    return '\n'.join(fmt % tuple(row) for row in rows)
def _match_one(filter_part, dct):
    # Evaluate a single '<key><op><value>' or '<unary><key>' filter
    # expression against dct.
    COMPARISON_OPERATORS = {
        # NOTE(review): the operator entries ('<', '<=', '>', '>=', '=',
        # '!=') appear to be missing from this copy.
    operator_rex = re.compile(r'''(?x)\s*
        # NOTE(review): the key group and surrounding alternation lines
        # of this pattern appear to be missing from this copy.
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
            (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
            (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    # NOTE(review): the `if m:` guard appears to be missing here.
        op = COMPARISON_OPERATORS[m.group('op')]
        if m.group('strval') is not None:
            if m.group('op') not in ('=', '!='):
                # NOTE(review): the `raise ValueError(` opener appears to
                # be missing before this message.
                'Operator %s does not support string values!' % m.group('op'))
            comparison_value = m.group('strval')
        # Numeric value: plain int, else a filesize suffix, else error.
        # NOTE(review): the `else:`/`try:` scaffolding appears to be
        # missing from this copy.
            comparison_value = int(m.group('intval'))
            comparison_value = parse_filesize(m.group('intval'))
            if comparison_value is None:
                comparison_value = parse_filesize(m.group('intval') + 'B')
            if comparison_value is None:
                # NOTE(review): the `raise ValueError(` opener appears to
                # be missing before this message.
                'Invalid integer value %r in filter part %r' % (
                    m.group('intval'), filter_part))
        actual_value = dct.get(m.group('key'))
        if actual_value is None:
            return m.group('none_inclusive')
        return op(actual_value, comparison_value)

    # NOTE(review): the `UNARY_OPERATORS = {` opener appears to be
    # missing from this copy.
        '': lambda v: v is not None,
        '!': lambda v: v is None,
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    # NOTE(review): the `if m:` guard appears to be missing here.
        op = UNARY_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
    # Fix: the `return all(` opener was missing — '&'-joined parts were
    # evaluated but the conjunction was never returned.
    return all(
        _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
def match_filter_func(filter_str):
    """Build a match-filter callable: returns None when the info dict
    passes `filter_str`, else a human-readable skip message."""
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            # Fix: the success `return None` and the closure return below
            # were missing from this copy.
            return None
        else:
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
def parse_dfxp_time_expr(time_expr):
    """Parse a TTML/DFXP time expression ('12.3s' or 'HH:MM:SS[.f]')
    into float seconds; returns None for empty/unrecognised input."""
    # Fix: the empty-input guard and both `if mobj:` guards were missing,
    # so group access could fail on non-matching input.
    if not time_expr:
        return

    mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if mobj:
        return float(mobj.group('time_offset'))

    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if mobj:
        # Some streams use ':' as the fractional separator; normalise it.
        return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
def srt_subtitles_timecode(seconds):
    """Format a float second count as an SRT timecode HH:MM:SS,mmm."""
    hrs = seconds / 3600
    mins = (seconds % 3600) / 60
    secs = seconds % 60
    msecs = (seconds % 1) * 1000
    return '%02d:%02d:%02d,%03d' % (hrs, mins, secs, msecs)
def dfxp2srt(dfxp_data):
    # Convert DFXP/TTML subtitle XML into SRT text.
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
        'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',
    # NOTE(review): the closing `})` of this ns_map appears to be missing.

    class TTMLPElementParser(object):
        # Streaming parser target that flattens a <p> element to text.
        # NOTE(review): the `out` initialisation appears to be missing
        # from this copy.

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
                # NOTE(review): the newline append for <br/> tags, and an
                # `end` handler, appear to be missing here.

        def data(self, data):
            # NOTE(review): the text accumulation appears to be missing
            # here; `close` below returns the collected text.
            return self.out.strip()

    def parse_node(node):
        # Serialise the node and re-feed it through the flattening target.
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
    # Try each known namespace before falling back to un-namespaced <p>.
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall(_x('.//ttaf1_0604:p')) or dfxp.findall('.//p')
    # NOTE(review): the `if not paras:` guard and `out = []` appear to be
    # missing from this copy.
        raise ValueError('Invalid dfxp/TTML subtitle')

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            # NOTE(review): the `continue` and the end_time/dur fallback
            # logic appear to be missing here.
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            # NOTE(review): the index argument, the paragraph text and the
            # final `return ''.join(out)` appear to be missing here.
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
def cli_option(params, command_option, param):
    """Emit [command_option, value] when params[param] is set, else []."""
    value = params.get(param)
    if value is None:
        return []
    return [command_option, value]
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Emit a boolean CLI option, either as one 'opt<sep>value' token or
    as a two-token ['opt', 'value'] pair."""
    param = params.get(param)
    assert isinstance(param, bool)
    # Fix: the `if separator:` guard was missing, leaving two
    # unconditional return statements.
    if separator:
        return [command_option + separator + (true_value if param else false_value)]
    return [command_option, true_value if param else false_value]
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Emit a bare flag when params[param] equals expected_value."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
def cli_configuration_args(params, param, default=[]):
    """Return the list of extra CLI args stored under `param`, or
    `default` when unset. (The mutable default is never mutated.)"""
    ex_args = params.get(param)
    # Fix: the None fallback and the final return were missing — the
    # validated list was never produced.
    if ex_args is None:
        return default
    assert isinstance(ex_args, list)
    return ex_args
class ISO639Utils(object):
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    # NOTE(review): the `_lang_map = {...}` table itself appears to be
    # missing from this copy.

    # NOTE(review): a `@classmethod` decorator appears to be missing here.
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        return cls._lang_map.get(code[:2])

    # NOTE(review): a `@classmethod` decorator appears to be missing here.
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        for short_name, long_name in cls._lang_map.items():
            if long_name == code:
                # NOTE(review): `return short_name` appears to be missing
                # from this copy.
class ISO3166Utils(object):
    # From http://data.okfn.org/data/core/country-list
    # NOTE(review): the `_country_map = {` opening line, many entries and
    # the closing `}` appear to be missing from this copy.
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AS': 'American Samoa',
        'AG': 'Antigua and Barbuda',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BV': 'Bouvet Island',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BF': 'Burkina Faso',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CI': 'Côte d\'Ivoire',
        'CZ': 'Czech Republic',
        'DO': 'Dominican Republic',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GW': 'Guinea-Bissau',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'IR': 'Iran, Islamic Republic of',
        'IM': 'Isle of Man',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'LA': 'Lao People\'s Democratic Republic',
        'LI': 'Liechtenstein',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MH': 'Marshall Islands',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'PS': 'Palestine, State of',
        'PG': 'Papua New Guinea',
        'PH': 'Philippines',
        'PR': 'Puerto Rico',
        'RU': 'Russian Federation',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SL': 'Sierra Leone',
        'SX': 'Sint Maarten (Dutch part)',
        'SB': 'Solomon Islands',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'SJ': 'Svalbard and Jan Mayen',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TZ': 'Tanzania, United Republic of',
        'TL': 'Timor-Leste',
        'TT': 'Trinidad and Tobago',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',

    # NOTE(review): a `@classmethod` decorator appears to be missing here.
    def short2full(cls, code):
        """Convert an ISO 3166-2 country code to the corresponding full name"""
        return cls._country_map.get(code.upper())
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    # ProxyHandler variant that honours a per-request
    # 'Ytdl-request-proxy' header instead of only global proxies.

    def __init__(self, proxies=None):
        # Set default handlers
        for type in ('http', 'https'):
            # Bind loop vars as lambda defaults so each scheme keeps its
            # own handler (avoids late-binding closure bug).
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            # NOTE(review): the `proxy = req_proxy` override appears to be
            # missing from this copy.
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # youtube-dl's http/https handlers do wrapping the socket with socks
            # NOTE(review): a `return None` appears to be missing here.
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''
    # Fix: the docstring delimiters were missing, leaving the description
    # lines as bare (invalid) statements.
    # Interpret the data little-endian (reversed) as one big integer.
    payload = int(binascii.hexlify(data[::-1]), 16)
    encrypted = pow(payload, exponent, modulus)
    return '%x' % encrypted
def encode_base_n(num, n, table=None):
    """Encode a non-negative integer in base n using `table` as digits
    (default: 0-9a-zA-Z truncated to n symbols)."""
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    # Fix: the default-table guard, the zero case and the digit loop were
    # missing, leaving the accumulation line dangling.
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    ret = ''
    while num:
        ret = table[num % n] + ret
        num = num // n
    return ret
2812 def decode_packed_codes(code):
2814 r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)",
2816 obfucasted_code, base, count, symbols = mobj.groups()
2819 symbols = symbols.split('|')
2824 base_n_count = encode_base_n(count, base)
2825 symbol_table[base_n_count] = symbols[count] or base_n_count
2828 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],