2 # -*- coding: utf-8 -*-
4 from __future__ import unicode_literals
33 import xml.etree.ElementTree
40 compat_etree_fromstring,
45 compat_socket_create_connection,
49 compat_urllib_parse_urlencode,
50 compat_urllib_parse_urlparse,
51 compat_urllib_request,
59 # This is not clearly defined otherwise
60 compiled_regex_type = type(re.compile(''))
63 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/44.0 (Chrome)',
64 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
65 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
66 'Accept-Encoding': 'gzip, deflate',
67 'Accept-Language': 'en-us,en;q=0.5',
73 ENGLISH_MONTH_NAMES = [
74 'January', 'February', 'March', 'April', 'May', 'June',
75 'July', 'August', 'September', 'October', 'November', 'December']
78 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
79 'flv', 'f4v', 'f4a', 'f4b',
80 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
90 'f4f', 'f4m', 'm3u8', 'smil')
92 # needed for sanitizing filenames in restricted mode
93 ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ',
94 itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOUUUUYP', ['ss'],
95 'aaaaaa', ['ae'], 'ceeeeiiiionoooooouuuuypy')))
98 def preferredencoding():
99 """Get preferred encoding.
101 Returns the best encoding scheme for the system, based on
102 locale.getpreferredencoding() and some further tweaks.
105 pref = locale.getpreferredencoding()
113 def write_json_file(obj, fn):
114 """ Encode obj as JSON and write it to fn, atomically if possible """
116 fn = encodeFilename(fn)
117 if sys.version_info < (3, 0) and sys.platform != 'win32':
118 encoding = get_filesystem_encoding()
119 # os.path.basename returns a bytes object, but NamedTemporaryFile
120 # will fail if the filename contains non ascii characters unless we
121 # use a unicode object
122 path_basename = lambda f: os.path.basename(fn).decode(encoding)
123 # the same for os.path.dirname
124 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
126 path_basename = os.path.basename
127 path_dirname = os.path.dirname
131 'prefix': path_basename(fn) + '.',
132 'dir': path_dirname(fn),
136 # In Python 2.x, json.dump expects a bytestream.
137 # In Python 3.x, it writes to a character stream
138 if sys.version_info < (3, 0):
146 tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
151 if sys.platform == 'win32':
152 # Need to remove existing file on Windows, else os.rename raises
153 # WindowsError or FileExistsError.
158 os.rename(tf.name, fn)
167 if sys.version_info >= (2, 7):
168 def find_xpath_attr(node, xpath, key, val=None):
169 """ Find the xpath xpath[@key=val] """
170 assert re.match(r'^[a-zA-Z_-]+$', key)
171 expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
172 return node.find(expr)
174 def find_xpath_attr(node, xpath, key, val=None):
175 for f in node.findall(compat_xpath(xpath)):
176 if key not in f.attrib:
178 if val is None or f.attrib.get(key) == val:
182 # On python2.6 the xml.etree.ElementTree.Element methods don't support
183 # the namespace parameter
186 def xpath_with_ns(path, ns_map):
187 components = [c.split(':') for c in path.split('/')]
191 replaced.append(c[0])
194 replaced.append('{%s}%s' % (ns_map[ns], tag))
195 return '/'.join(replaced)
198 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
199 def _find_xpath(xpath):
200 return node.find(compat_xpath(xpath))
202 if isinstance(xpath, (str, compat_str)):
203 n = _find_xpath(xpath)
211 if default is not NO_DEFAULT:
214 name = xpath if name is None else name
215 raise ExtractorError('Could not find XML element %s' % name)
221 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
222 n = xpath_element(node, xpath, name, fatal=fatal, default=default)
223 if n is None or n == default:
226 if default is not NO_DEFAULT:
229 name = xpath if name is None else name
230 raise ExtractorError('Could not find XML element\'s text %s' % name)
236 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
237 n = find_xpath_attr(node, xpath, key)
239 if default is not NO_DEFAULT:
242 name = '%s[@%s]' % (xpath, key) if name is None else name
243 raise ExtractorError('Could not find XML attribute %s' % name)
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # NOTE: the parameter name `id` shadows the builtin of the same name;
    # kept as-is for backward compatibility with keyword callers.
    # Thin convenience wrapper over get_element_by_attribute().
    return get_element_by_attribute('id', id, html)
254 def get_element_by_attribute(attribute, value, html):
255 """Return the content of the tag with the specified attribute in the passed HTML document"""
257 m = re.search(r'''(?xs)
259 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
261 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
265 ''' % (re.escape(attribute), re.escape(value)), html)
269 res = m.group('content')
271 if res.startswith('"') or res.startswith("'"):
274 return unescapeHTML(res)
277 class HTMLAttributeParser(compat_HTMLParser):
278 """Trivial HTML parser to gather the attributes for a single element"""
281 compat_HTMLParser.__init__(self)
283 def handle_starttag(self, tag, attrs):
284 self.attrs = dict(attrs)
287 def extract_attributes(html_element):
288 """Given a string for an HTML element such as
290 a="foo" B="bar" c="&98;az" d=boz
291 empty= noval entity="&"
294 Decode and return a dictionary of attributes.
296 'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
297 'empty': '', 'noval': None, 'entity': '&',
298 'sq': '"', 'dq': '\''
300 NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
301 but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
303 parser = HTMLAttributeParser()
304 parser.feed(html_element)
309 def clean_html(html):
310 """Clean an HTML snippet into a readable string"""
312 if html is None: # Convenience for sanitizing descriptions etc.
316 html = html.replace('\n', ' ')
317 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
318 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
320 html = re.sub('<.*?>', '', html)
321 # Replace html entities
322 html = unescapeHTML(html)
326 def sanitize_open(filename, open_mode):
327 """Try to open the given filename, and slightly tweak it if this fails.
329 Attempts to open the given filename. If this fails, it tries to change
330 the filename slightly, step by step, until it's either able to open it
331 or it fails and raises a final exception, like the standard open()
334 It returns the tuple (stream, definitive_file_name).
338 if sys.platform == 'win32':
340 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
341 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
342 stream = open(encodeFilename(filename), open_mode)
343 return (stream, filename)
344 except (IOError, OSError) as err:
345 if err.errno in (errno.EACCES,):
348 # In case of error, try to remove win32 forbidden chars
349 alt_filename = sanitize_path(filename)
350 if alt_filename == filename:
353 # An exception here should be caught in the caller
354 stream = open(encodeFilename(alt_filename), open_mode)
355 return (stream, alt_filename)
358 def timeconvert(timestr):
359 """Convert RFC 2822 defined time string into system timestamp"""
361 timetuple = email.utils.parsedate_tz(timestr)
362 if timetuple is not None:
363 timestamp = email.utils.mktime_tz(timetuple)
367 def sanitize_filename(s, restricted=False, is_id=False):
368 """Sanitizes a string so it could be used as part of a filename.
369 If restricted is set, use a stricter subset of allowed characters.
370 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
372 def replace_insane(char):
373 if restricted and char in ACCENT_CHARS:
374 return ACCENT_CHARS[char]
375 if char == '?' or ord(char) < 32 or ord(char) == 127:
378 return '' if restricted else '\''
380 return '_-' if restricted else ' -'
381 elif char in '\\/|*<>':
383 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
385 if restricted and ord(char) > 127:
390 s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
391 result = ''.join(map(replace_insane, s))
393 while '__' in result:
394 result = result.replace('__', '_')
395 result = result.strip('_')
396 # Common case of "Foreign band name - English song title"
397 if restricted and result.startswith('-_'):
399 if result.startswith('-'):
400 result = '_' + result[len('-'):]
401 result = result.lstrip('.')
407 def sanitize_path(s):
408 """Sanitizes and normalizes path on Windows"""
409 if sys.platform != 'win32':
411 drive_or_unc, _ = os.path.splitdrive(s)
412 if sys.version_info < (2, 7) and not drive_or_unc:
413 drive_or_unc, _ = os.path.splitunc(s)
414 norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
418 path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part)
419 for path_part in norm_path]
421 sanitized_path.insert(0, drive_or_unc + os.path.sep)
422 return os.path.join(*sanitized_path)
425 # Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
426 # unwanted failures due to missing protocol
def sanitize_url(url):
    """Give scheme-relative URLs ('//host/path') an explicit http: scheme.

    Any other URL is returned unchanged.
    """
    if url.startswith('//'):
        return 'http:%s' % url
    return url
def sanitized_Request(url, *args, **kwargs):
    # Build a urllib Request after normalizing the URL: sanitize_url()
    # prepends an http: scheme to protocol-relative URLs so urllib does
    # not fail on them. All other arguments are passed through unchanged.
    return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs)
435 def orderedSet(iterable):
436 """ Remove all duplicates from the input iterable """
444 def _htmlentity_transform(entity):
445 """Transforms an HTML entity to a character."""
446 # Known non-numeric HTML entity
447 if entity in compat_html_entities.name2codepoint:
448 return compat_chr(compat_html_entities.name2codepoint[entity])
450 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
452 numstr = mobj.group(1)
453 if numstr.startswith('x'):
455 numstr = '0%s' % numstr
458 # See https://github.com/rg3/youtube-dl/issues/7518
460 return compat_chr(int(numstr, base))
464 # Unknown entity in name, return its literal representation
465 return '&%s;' % entity
471 assert type(s) == compat_str
474 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
477 def get_subprocess_encoding():
478 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
479 # For subprocess calls, encode with locale encoding
480 # Refer to http://stackoverflow.com/a/9951851/35070
481 encoding = preferredencoding()
483 encoding = sys.getfilesystemencoding()
489 def encodeFilename(s, for_subprocess=False):
491 @param s The name of the file
494 assert type(s) == compat_str
496 # Python 3 has a Unicode API
497 if sys.version_info >= (3, 0):
500 # Pass '' directly to use Unicode APIs on Windows 2000 and up
501 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
502 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
503 if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
506 # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
507 if sys.platform.startswith('java'):
510 return s.encode(get_subprocess_encoding(), 'ignore')
513 def decodeFilename(b, for_subprocess=False):
515 if sys.version_info >= (3, 0):
518 if not isinstance(b, bytes):
521 return b.decode(get_subprocess_encoding(), 'ignore')
def encodeArgument(s):
    """Encode a command-line argument for passing to a subprocess.

    Accepts a text string (legacy byte strings are decoded as ASCII first)
    and delegates to encodeFilename() with for_subprocess=True.
    """
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, True)
def decodeArgument(b):
    # Inverse of encodeArgument(): decode a subprocess argument using the
    # subprocess encoding (for_subprocess=True).
    return decodeFilename(b, True)
537 def decodeOption(optval):
540 if isinstance(optval, bytes):
541 optval = optval.decode(preferredencoding())
543 assert isinstance(optval, compat_str)
547 def formatSeconds(secs):
549 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
551 return '%d:%02d' % (secs // 60, secs % 60)
556 def make_HTTPS_handler(params, **kwargs):
557 opts_no_check_certificate = params.get('nocheckcertificate', False)
558 if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9
559 context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
560 if opts_no_check_certificate:
561 context.check_hostname = False
562 context.verify_mode = ssl.CERT_NONE
564 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
567 # (create_default_context present but HTTPSHandler has no context=)
570 if sys.version_info < (3, 2):
571 return YoutubeDLHTTPSHandler(params, **kwargs)
573 context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
574 context.verify_mode = (ssl.CERT_NONE
575 if opts_no_check_certificate
576 else ssl.CERT_REQUIRED)
577 context.set_default_verify_paths()
578 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
581 def bug_reports_message():
582 if ytdl_is_updateable():
583 update_cmd = 'type youtube-dl -U to update'
585 update_cmd = 'see https://yt-dl.org/update on how to update'
586 msg = '; please report this issue on https://yt-dl.org/bug .'
587 msg += ' Make sure you are using the latest version; %s.' % update_cmd
588 msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
592 class ExtractorError(Exception):
593 """Error during info extraction."""
595 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
596 """ tb, if given, is the original traceback (so that it can be printed out).
597 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
600 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
602 if video_id is not None:
603 msg = video_id + ': ' + msg
605 msg += ' (caused by %r)' % cause
607 msg += bug_reports_message()
608 super(ExtractorError, self).__init__(msg)
611 self.exc_info = sys.exc_info() # preserve original exception
613 self.video_id = video_id
615 def format_traceback(self):
616 if self.traceback is None:
618 return ''.join(traceback.format_tb(self.traceback))
class UnsupportedError(ExtractorError):
    """Raised when no extractor can handle the given URL.

    expected=True marks this as a normal condition (not a bug), so no
    bug-report boilerplate is appended to the message.
    """
    def __init__(self, url):
        super(UnsupportedError, self).__init__(
            'Unsupported URL: %s' % url, expected=True)
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    # Subclass of ExtractorError so callers can catch all extraction
    # failures uniformly; carries no additional state of its own.
633 class DownloadError(Exception):
634 """Download Error exception.
636 This exception may be thrown by FileDownloader objects if they are not
637 configured to continue on errors. They will contain the appropriate
641 def __init__(self, msg, exc_info=None):
642 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
643 super(DownloadError, self).__init__(msg)
644 self.exc_info = exc_info
647 class SameFileError(Exception):
648 """Same File exception.
650 This exception will be thrown by FileDownloader objects if they detect
651 multiple files would have to be downloaded to the same file on disk.
656 class PostProcessingError(Exception):
657 """Post Processing exception.
659 This exception may be raised by PostProcessor's .run() method to
660 indicate an error in the postprocessing task.
663 def __init__(self, msg):
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    # Raised to abort further processing once the user-specified
    # --max-downloads count has been hit.
672 class UnavailableVideoError(Exception):
673 """Unavailable Format exception.
675 This exception will be thrown when a video is requested
676 in a format that is not available for that video.
681 class ContentTooShortError(Exception):
682 """Content Too Short exception.
684 This exception may be raised by FileDownloader objects when a file they
685 download is too small for what the server announced first, indicating
686 the connection was probably interrupted.
689 def __init__(self, downloaded, expected):
691 self.downloaded = downloaded
692 self.expected = expected
695 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
696 # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
697 # expected HTTP responses to meet HTTP/1.0 or later (see also
698 # https://github.com/rg3/youtube-dl/issues/6727)
699 if sys.version_info < (3, 0):
700 kwargs[b'strict'] = True
701 hc = http_class(*args, **kwargs)
702 source_address = ydl_handler._params.get('source_address')
703 if source_address is not None:
704 sa = (source_address, 0)
705 if hasattr(hc, 'source_address'): # Python 2.7+
706 hc.source_address = sa
708 def _hc_connect(self, *args, **kwargs):
709 sock = compat_socket_create_connection(
710 (self.host, self.port), self.timeout, sa)
712 self.sock = ssl.wrap_socket(
713 sock, self.key_file, self.cert_file,
714 ssl_version=ssl.PROTOCOL_TLSv1)
717 hc.connect = functools.partial(_hc_connect, hc)
def handle_youtubedl_headers(headers):
    """Process internal pseudo-headers before the real HTTP request is made.

    If the 'Youtubedl-no-compression' marker is present, return a copy of
    the mapping with that marker and any Accept-Encoding header removed,
    so the request is sent without compression. Otherwise the original
    mapping is returned untouched; the input is never mutated.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers
    filtered = dict((name, value) for name, value in headers.items()
                    if name.lower() != 'accept-encoding')
    del filtered['Youtubedl-no-compression']
    return filtered
732 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
733 """Handler for HTTP requests and responses.
735 This class, when installed with an OpenerDirector, automatically adds
736 the standard headers to every HTTP request and handles gzipped and
737 deflated responses from web servers. If compression is to be avoided in
738 a particular request, the original request in the program code only has
739 to include the HTTP header "Youtubedl-no-compression", which will be
740 removed before making the real request.
742 Part of this code was copied from:
744 http://techknack.net/python-urllib2-handlers/
746 Andrew Rowls, the author of that code, agreed to release it to the
750 def __init__(self, params, *args, **kwargs):
751 compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
752 self._params = params
754 def http_open(self, req):
755 return self.do_open(functools.partial(
756 _create_http_connection, self, compat_http_client.HTTPConnection, False),
762 return zlib.decompress(data, -zlib.MAX_WBITS)
764 return zlib.decompress(data)
767 def addinfourl_wrapper(stream, headers, url, code):
768 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
769 return compat_urllib_request.addinfourl(stream, headers, url, code)
770 ret = compat_urllib_request.addinfourl(stream, headers, url)
774 def http_request(self, req):
775 # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
776 # always respected by websites, some tend to give out URLs with non percent-encoded
777 # non-ASCII characters (see telemb.py, ard.py [#3412])
778 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
779 # To work around aforementioned issue we will replace request's original URL with
780 # percent-encoded one
781 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
782 # the code of this workaround has been moved here from YoutubeDL.urlopen()
783 url = req.get_full_url()
784 url_escaped = escape_url(url)
786 # Substitute URL if any change after escaping
787 if url != url_escaped:
788 req = update_Request(req, url=url_escaped)
790 for h, v in std_headers.items():
791 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
792 # The dict keys are capitalized because of this bug by urllib
793 if h.capitalize() not in req.headers:
796 req.headers = handle_youtubedl_headers(req.headers)
798 if sys.version_info < (2, 7) and '#' in req.get_full_url():
799 # Python 2.6 is brain-dead when it comes to fragments
800 req._Request__original = req._Request__original.partition('#')[0]
801 req._Request__r_type = req._Request__r_type.partition('#')[0]
805 def http_response(self, req, resp):
808 if resp.headers.get('Content-encoding', '') == 'gzip':
809 content = resp.read()
810 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
812 uncompressed = io.BytesIO(gz.read())
813 except IOError as original_ioerror:
814 # There may be junk add the end of the file
815 # See http://stackoverflow.com/q/4928560/35070 for details
816 for i in range(1, 1024):
818 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
819 uncompressed = io.BytesIO(gz.read())
824 raise original_ioerror
825 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
826 resp.msg = old_resp.msg
827 del resp.headers['Content-encoding']
829 if resp.headers.get('Content-encoding', '') == 'deflate':
830 gz = io.BytesIO(self.deflate(resp.read()))
831 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
832 resp.msg = old_resp.msg
833 del resp.headers['Content-encoding']
834 # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
835 # https://github.com/rg3/youtube-dl/issues/6457).
836 if 300 <= resp.code < 400:
837 location = resp.headers.get('Location')
839 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
840 if sys.version_info >= (3, 0):
841 location = location.encode('iso-8859-1').decode('utf-8')
842 location_escaped = escape_url(location)
843 if location != location_escaped:
844 del resp.headers['Location']
845 resp.headers['Location'] = location_escaped
848 https_request = http_request
849 https_response = http_response
852 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
853 def __init__(self, params, https_conn_class=None, *args, **kwargs):
854 compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
855 self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
856 self._params = params
858 def https_open(self, req):
860 if hasattr(self, '_context'): # python > 2.6
861 kwargs['context'] = self._context
862 if hasattr(self, '_check_hostname'): # python 3.x
863 kwargs['check_hostname'] = self._check_hostname
864 return self.do_open(functools.partial(
865 _create_http_connection, self, self._https_conn_class, True),
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor that also applies cookie handling to HTTPS traffic."""

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #         if set_cookie != set_cookie_escaped:
        #             del response.headers[set_cookie_header]
        #             response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    # Reuse the HTTP hooks for HTTPS so cookies are handled on both schemes.
    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
893 def parse_iso8601(date_str, delimiter='T', timezone=None):
894 """ Return a UNIX timestamp from the given date """
899 date_str = re.sub(r'\.[0-9]+', '', date_str)
903 r'(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
906 timezone = datetime.timedelta()
908 date_str = date_str[:-len(m.group(0))]
909 if not m.group('sign'):
910 timezone = datetime.timedelta()
912 sign = 1 if m.group('sign') == '+' else -1
913 timezone = datetime.timedelta(
914 hours=sign * int(m.group('hours')),
915 minutes=sign * int(m.group('minutes')))
917 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
918 dt = datetime.datetime.strptime(date_str, date_format) - timezone
919 return calendar.timegm(dt.timetuple())
924 def unified_strdate(date_str, day_first=True):
925 """Return a string with the date in the format YYYYMMDD"""
931 date_str = date_str.replace(',', ' ')
932 # %z (UTC offset) is only supported in python>=3.2
933 if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
934 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
935 # Remove AM/PM + timezone
936 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
938 format_expressions = [
951 '%Y-%m-%d %H:%M:%S.%f',
954 '%Y-%m-%dT%H:%M:%SZ',
955 '%Y-%m-%dT%H:%M:%S.%fZ',
956 '%Y-%m-%dT%H:%M:%S.%f0Z',
958 '%Y-%m-%dT%H:%M:%S.%f',
962 format_expressions.extend([
970 format_expressions.extend([
977 for expression in format_expressions:
979 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
982 if upload_date is None:
983 timetuple = email.utils.parsedate_tz(date_str)
985 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
986 if upload_date is not None:
987 return compat_str(upload_date)
990 def determine_ext(url, default_ext='unknown_video'):
993 guess = url.partition('?')[0].rpartition('.')[2]
994 if re.match(r'^[A-Za-z0-9]+$', guess):
996 # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
997 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
998 return guess.rstrip('/')
def subtitles_filename(filename, sub_lang, sub_format):
    """Derive a subtitle file name: '<base>.<lang>.<format>'.

    The media extension (everything after the last '.') is dropped from
    *filename*; if there is no '.', the whole name is used as the base.
    """
    base = filename.rsplit('.', 1)[0]
    return '.'.join((base, sub_lang, sub_format))
1007 def date_from_str(date_str):
1009 Return a datetime object from a string in the format YYYYMMDD or
1010 (now|today)[+-][0-9](day|week|month|year)(s)?"""
1011 today = datetime.date.today()
1012 if date_str in ('now', 'today'):
1014 if date_str == 'yesterday':
1015 return today - datetime.timedelta(days=1)
1016 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
1017 if match is not None:
1018 sign = match.group('sign')
1019 time = int(match.group('time'))
1022 unit = match.group('unit')
1023 # A bad approximation?
1027 elif unit == 'year':
1031 delta = datetime.timedelta(**{unit: time})
1032 return today + delta
1033 return datetime.datetime.strptime(date_str, '%Y%m%d').date()
1036 def hyphenate_date(date_str):
1038 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1039 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1040 if match is not None:
1041 return '-'.join(match.groups())
1046 class DateRange(object):
1047 """Represents a time interval between two dates"""
1049 def __init__(self, start=None, end=None):
1050 """start and end must be strings in the format accepted by date"""
1051 if start is not None:
1052 self.start = date_from_str(start)
1054 self.start = datetime.datetime.min.date()
1056 self.end = date_from_str(end)
1058 self.end = datetime.datetime.max.date()
1059 if self.start > self.end:
1060 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
1064 """Returns a range that only contains the given day"""
1065 return cls(day, day)
1067 def __contains__(self, date):
1068 """Check if the date is in the range"""
1069 if not isinstance(date, datetime.date):
1070 date = date_from_str(date)
1071 return self.start <= date <= self.end
1074 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
1077 def platform_name():
1078 """ Returns the platform name as a compat_str """
1079 res = platform.platform()
1080 if isinstance(res, bytes):
1081 res = res.decode(preferredencoding())
1083 assert isinstance(res, compat_str)
1087 def _windows_write_string(s, out):
1088 """ Returns True if the string was written using special methods,
1089 False if it has yet to be written out."""
1090 # Adapted from http://stackoverflow.com/a/3259271/35070
1093 import ctypes.wintypes
1101 fileno = out.fileno()
1102 except AttributeError:
1103 # If the output stream doesn't have a fileno, it's virtual
1105 except io.UnsupportedOperation:
1106 # Some strange Windows pseudo files?
1108 if fileno not in WIN_OUTPUT_IDS:
1111 GetStdHandle = ctypes.WINFUNCTYPE(
1112 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
1113 (b'GetStdHandle', ctypes.windll.kernel32))
1114 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
1116 WriteConsoleW = ctypes.WINFUNCTYPE(
1117 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
1118 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
1119 ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
1120 written = ctypes.wintypes.DWORD(0)
1122 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
1123 FILE_TYPE_CHAR = 0x0002
1124 FILE_TYPE_REMOTE = 0x8000
1125 GetConsoleMode = ctypes.WINFUNCTYPE(
1126 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
1127 ctypes.POINTER(ctypes.wintypes.DWORD))(
1128 (b'GetConsoleMode', ctypes.windll.kernel32))
1129 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
1131 def not_a_console(handle):
1132 if handle == INVALID_HANDLE_VALUE or handle is None:
1134 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
1135 GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
1137 if not_a_console(h):
1140 def next_nonbmp_pos(s):
1142 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
1143 except StopIteration:
1147 count = min(next_nonbmp_pos(s), 1024)
1149 ret = WriteConsoleW(
1150 h, s, count if count else 2, ctypes.byref(written), None)
1152 raise OSError('Failed to write string')
1153 if not count: # We just wrote a non-BMP character
1154 assert written.value == 2
1157 assert written.value > 0
1158 s = s[written.value:]
1162 def write_string(s, out=None, encoding=None):
1165 assert type(s) == compat_str
1167 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
1168 if _windows_write_string(s, out):
1171 if ('b' in getattr(out, 'mode', '') or
1172 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
1173 byt = s.encode(encoding or preferredencoding(), 'ignore')
1175 elif hasattr(out, 'buffer'):
1176 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1177 byt = s.encode(enc, 'ignore')
1178 out.buffer.write(byt)
1184 def bytes_to_intlist(bs):
1187 if isinstance(bs[0], int): # Python 3
1190 return [ord(c) for c in bs]
1193 def intlist_to_bytes(xs):
1196 return struct_pack('%dB' % len(xs), *xs)
1199 # Cross-platform file locking
1200 if sys.platform == 'win32':
1201 import ctypes.wintypes
1204 class OVERLAPPED(ctypes.Structure):
1206 ('Internal', ctypes.wintypes.LPVOID),
1207 ('InternalHigh', ctypes.wintypes.LPVOID),
1208 ('Offset', ctypes.wintypes.DWORD),
1209 ('OffsetHigh', ctypes.wintypes.DWORD),
1210 ('hEvent', ctypes.wintypes.HANDLE),
1213 kernel32 = ctypes.windll.kernel32
1214 LockFileEx = kernel32.LockFileEx
1215 LockFileEx.argtypes = [
1216 ctypes.wintypes.HANDLE, # hFile
1217 ctypes.wintypes.DWORD, # dwFlags
1218 ctypes.wintypes.DWORD, # dwReserved
1219 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1220 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1221 ctypes.POINTER(OVERLAPPED) # Overlapped
1223 LockFileEx.restype = ctypes.wintypes.BOOL
1224 UnlockFileEx = kernel32.UnlockFileEx
1225 UnlockFileEx.argtypes = [
1226 ctypes.wintypes.HANDLE, # hFile
1227 ctypes.wintypes.DWORD, # dwReserved
1228 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1229 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1230 ctypes.POINTER(OVERLAPPED) # Overlapped
1232 UnlockFileEx.restype = ctypes.wintypes.BOOL
1233 whole_low = 0xffffffff
1234 whole_high = 0x7fffffff
1236 def _lock_file(f, exclusive):
1237 overlapped = OVERLAPPED()
1238 overlapped.Offset = 0
1239 overlapped.OffsetHigh = 0
1240 overlapped.hEvent = 0
1241 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1242 handle = msvcrt.get_osfhandle(f.fileno())
1243 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
1244 whole_low, whole_high, f._lock_file_overlapped_p):
1245 raise OSError('Locking file failed: %r' % ctypes.FormatError())
def _unlock_file(f):
    """Release the region locked by a previous _lock_file() call."""
    assert f._lock_file_overlapped_p
    handle = msvcrt.get_osfhandle(f.fileno())
    unlocked = UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p)
    if not unlocked:
        raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1255 # Some platforms, such as Jython, is missing fcntl
1259 def _lock_file(f, exclusive):
1260 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1262 def _unlock_file(f):
1263 fcntl.flock(f, fcntl.LOCK_UN)
1265 UNSUPPORTED_MSG = 'file locking is not supported on this platform'
1267 def _lock_file(f, exclusive):
1268 raise IOError(UNSUPPORTED_MSG)
1270 def _unlock_file(f):
1271 raise IOError(UNSUPPORTED_MSG)
class locked_file(object):
    """File wrapper that holds an OS-level advisory lock (via
    _lock_file/_unlock_file) while used as a context manager:
    shared for reading, exclusive for writing/appending."""

    def __init__(self, filename, mode, encoding=None):
        # Only plain read/append/write modes are supported
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)

    def __enter__(self):
        # Writers ('a'/'w') take an exclusive lock; readers can share.
        # NOTE(review): self.mode is presumably stored in __init__
        # (assignment not visible in this view) -- confirm
        exclusive = self.mode != 'r'
        _lock_file(self.f, exclusive)

    def __exit__(self, etype, value, traceback):
        # Release the lock on exit from the `with` block
        _unlock_file(self.f)

    def write(self, *args):
        # Delegate straight to the wrapped file object
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
def get_filesystem_encoding():
    """Return the filesystem encoding, defaulting to UTF-8 when unknown."""
    enc = sys.getfilesystemencoding()
    if enc is None:
        return 'utf-8'
    return enc
1310 def shell_quote(args):
1312 encoding = get_filesystem_encoding()
1314 if isinstance(a, bytes):
1315 # We may get a filename encoded with 'encodeFilename'
1316 a = a.decode(encoding)
1317 quoted_args.append(pipes.quote(a))
1318 return ' '.join(quoted_args)
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # The payload rides in the fragment, which servers never see
    payload = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return '%s#%s' % (url, payload)
1329 def unsmuggle_url(smug_url, default=None):
1330 if '#__youtubedl_smuggle' not in smug_url:
1331 return smug_url, default
1332 url, _, sdata = smug_url.rpartition('#')
1333 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
1334 data = json.loads(jsond)
1338 def format_bytes(bytes):
1341 if type(bytes) is str:
1342 bytes = float(bytes)
1346 exponent = int(math.log(bytes, 1024.0))
1347 suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
1348 converted = float(bytes) / float(1024 ** exponent)
1349 return '%.2f%s' % (converted, suffix)
1352 def lookup_unit_table(unit_table, s):
1353 units_re = '|'.join(re.escape(u) for u in unit_table)
1355 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
1358 num_str = m.group('num').replace(',', '.')
1359 mult = unit_table[m.group('unit')]
1360 return int(float(num_str) * mult)
1363 def parse_filesize(s):
1367 # The lower-case forms are of course incorrect and unofficial,
1368 # but we support those too
1406 return lookup_unit_table(_UNIT_TABLE, s)
1415 if re.match(r'^[\d,.]+$', s):
1416 return str_to_int(s)
1427 return lookup_unit_table(_UNIT_TABLE, s)
1430 def month_by_name(name):
1431 """ Return the number of a month by (locale-independently) English name """
1434 return ENGLISH_MONTH_NAMES.index(name) + 1
1439 def month_by_abbreviation(abbrev):
1440 """ Return the number of a month by (locale-independently) English
1444 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
1449 def fix_xml_ampersands(xml_str):
1450 """Replace all the '&' by '&' in XML"""
1452 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1457 def setproctitle(title):
1458 assert isinstance(title, compat_str)
1460 # ctypes in Jython is not complete
1461 # http://bugs.jython.org/issue2148
1462 if sys.platform.startswith('java'):
1466 libc = ctypes.cdll.LoadLibrary('libc.so.6')
1469 title_bytes = title.encode('utf-8')
1470 buf = ctypes.create_string_buffer(len(title_bytes))
1471 buf.value = title_bytes
1473 libc.prctl(15, buf, 0, 0, 0)
1474 except AttributeError:
1475 return # Strange libc, just skip this
1478 def remove_start(s, start):
1479 if s.startswith(start):
1480 return s[len(start):]
1484 def remove_end(s, end):
1486 return s[:-len(end)]
1490 def remove_quotes(s):
1491 if s is None or len(s) < 2:
1493 for quote in ('"', "'", ):
1494 if s[0] == quote and s[-1] == quote:
def url_basename(url):
    """Return the final path segment of *url* ('' when the path is empty)."""
    path = compat_urlparse.urlparse(url).path
    segments = path.strip('/').split('/')
    return segments[-1]
1504 class HEADRequest(compat_urllib_request.Request):
1505 def get_method(self):
1509 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
1512 v = getattr(v, get_attr, None)
1518 return int(v) * invscale // scale
def str_or_none(v, default=None):
    """Coerce *v* to a text string, or return *default* when v is None."""
    if v is None:
        return default
    return compat_str(v)
1527 def str_to_int(int_str):
1528 """ A more relaxed version of int_or_none """
1531 int_str = re.sub(r'[,\.\+]', '', int_str)
1535 def float_or_none(v, scale=1, invscale=1, default=None):
1539 return float(v) * invscale / scale
1544 def parse_duration(s):
1545 if not isinstance(s, compat_basestring):
1550 days, hours, mins, secs, ms = [None] * 5
1551 m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?$', s)
1553 days, hours, mins, secs, ms = m.groups()
1558 (?P<days>[0-9]+)\s*d(?:ays?)?\s*
1561 (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
1564 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
1567 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
1570 days, hours, mins, secs, ms = m.groups()
1572 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)$', s)
1574 hours, mins = m.groups()
1580 duration += float(secs)
1582 duration += float(mins) * 60
1584 duration += float(hours) * 60 * 60
1586 duration += float(days) * 24 * 60 * 60
1588 duration += float(ms)
1592 def prepend_extension(filename, ext, expected_real_ext=None):
1593 name, real_ext = os.path.splitext(filename)
1595 '{0}.{1}{2}'.format(name, ext, real_ext)
1596 if not expected_real_ext or real_ext[1:] == expected_real_ext
1597 else '{0}.{1}'.format(filename, ext))
1600 def replace_extension(filename, ext, expected_real_ext=None):
1601 name, real_ext = os.path.splitext(filename)
1602 return '{0}.{1}'.format(
1603 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
1607 def check_executable(exe, args=[]):
1608 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1609 args can be a list of arguments for a short output (like -version) """
1611 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1617 def get_exe_version(exe, args=['--version'],
1618 version_re=None, unrecognized='present'):
1619 """ Returns the version of the specified executable,
1620 or False if the executable is not present """
1622 out, _ = subprocess.Popen(
1623 [encodeArgument(exe)] + args,
1624 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
1627 if isinstance(out, bytes): # Python 2.x
1628 out = out.decode('ascii', 'ignore')
1629 return detect_exe_version(out, version_re, unrecognized)
1632 def detect_exe_version(output, version_re=None, unrecognized='present'):
1633 assert isinstance(output, compat_str)
1634 if version_re is None:
1635 version_re = r'version\s+([-0-9._a-zA-Z]+)'
1636 m = re.search(version_re, output)
1643 class PagedList(object):
1645 # This is only useful for tests
1646 return len(self.getslice())
1649 class OnDemandPagedList(PagedList):
1650 def __init__(self, pagefunc, pagesize, use_cache=False):
1651 self._pagefunc = pagefunc
1652 self._pagesize = pagesize
1653 self._use_cache = use_cache
1657 def getslice(self, start=0, end=None):
1659 for pagenum in itertools.count(start // self._pagesize):
1660 firstid = pagenum * self._pagesize
1661 nextfirstid = pagenum * self._pagesize + self._pagesize
1662 if start >= nextfirstid:
1667 page_results = self._cache.get(pagenum)
1668 if page_results is None:
1669 page_results = list(self._pagefunc(pagenum))
1671 self._cache[pagenum] = page_results
1674 start % self._pagesize
1675 if firstid <= start < nextfirstid
1679 ((end - 1) % self._pagesize) + 1
1680 if (end is not None and firstid <= end <= nextfirstid)
1683 if startv != 0 or endv is not None:
1684 page_results = page_results[startv:endv]
1685 res.extend(page_results)
1687 # A little optimization - if current page is not "full", ie. does
1688 # not contain page_size videos then we can assume that this page
1689 # is the last one - there are no more ids on further pages -
1690 # i.e. no need to query again.
1691 if len(page_results) + startv < self._pagesize:
1694 # If we got the whole page, but the next page is not interesting,
1695 # break out early as well
1696 if end == nextfirstid:
1701 class InAdvancePagedList(PagedList):
1702 def __init__(self, pagefunc, pagecount, pagesize):
1703 self._pagefunc = pagefunc
1704 self._pagecount = pagecount
1705 self._pagesize = pagesize
1707 def getslice(self, start=0, end=None):
1709 start_page = start // self._pagesize
1711 self._pagecount if end is None else (end // self._pagesize + 1))
1712 skip_elems = start - start_page * self._pagesize
1713 only_more = None if end is None else end - start
1714 for pagenum in range(start_page, end_page):
1715 page = list(self._pagefunc(pagenum))
1717 page = page[skip_elems:]
1719 if only_more is not None:
1720 if len(page) < only_more:
1721 only_more -= len(page)
1723 page = page[:only_more]
1730 def uppercase_escape(s):
1731 unicode_escape = codecs.getdecoder('unicode_escape')
1733 r'\\U[0-9a-fA-F]{8}',
1734 lambda m: unicode_escape(m.group(0))[0],
1738 def lowercase_escape(s):
1739 unicode_escape = codecs.getdecoder('unicode_escape')
1741 r'\\u[0-9a-fA-F]{4}',
1742 lambda m: unicode_escape(m.group(0))[0],
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # On Python 2, quote() mishandles unicode input, so pre-encode to UTF-8
    if isinstance(s, compat_str) and sys.version_info[0] < 3:
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
1753 def escape_url(url):
1754 """Escape URL as suggested by RFC 3986"""
1755 url_parsed = compat_urllib_parse_urlparse(url)
1756 return url_parsed._replace(
1757 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
1758 path=escape_rfc3986(url_parsed.path),
1759 params=escape_rfc3986(url_parsed.params),
1760 query=escape_rfc3986(url_parsed.query),
1761 fragment=escape_rfc3986(url_parsed.fragment)
1765 def read_batch_urls(batch_fd):
1767 if not isinstance(url, compat_str):
1768 url = url.decode('utf-8', 'replace')
1769 BOM_UTF8 = '\xef\xbb\xbf'
1770 if url.startswith(BOM_UTF8):
1771 url = url[len(BOM_UTF8):]
1773 if url.startswith(('#', ';', ']')):
1777 with contextlib.closing(batch_fd) as fd:
1778 return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode the given query and return it as ASCII bytes for a POST body."""
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
1785 def update_url_query(url, query):
1788 parsed_url = compat_urlparse.urlparse(url)
1789 qs = compat_parse_qs(parsed_url.query)
1791 return compat_urlparse.urlunparse(parsed_url._replace(
1792 query=compat_urllib_parse_urlencode(qs, True)))
1795 def update_Request(req, url=None, data=None, headers={}, query={}):
1796 req_headers = req.headers.copy()
1797 req_headers.update(headers)
1798 req_data = data or req.data
1799 req_url = update_url_query(url or req.get_full_url(), query)
1800 req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
1802 req_url, data=req_data, headers=req_headers,
1803 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
1804 if hasattr(req, 'timeout'):
1805 new_req.timeout = req.timeout
1809 def dict_get(d, key_or_keys, default=None, skip_false_values=True):
1810 if isinstance(key_or_keys, (list, tuple)):
1811 for key in key_or_keys:
1812 if key not in d or d[key] is None or skip_false_values and not d[key]:
1816 return d.get(key_or_keys, default)
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return *string* as compat_str, decoding bytes with *encoding* if needed.

    Note: the *encoding* default is evaluated once, at function definition
    time, which pins it to the encoding detected at import.
    """
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
1832 def parse_age_limit(s):
1835 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
1836 return int(m.group('age')) if m else US_RATINGS.get(s)
1839 def strip_jsonp(code):
1841 r'(?s)^[a-zA-Z0-9_.]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
1844 def js_to_json(code):
1847 if v in ('true', 'false', 'null'):
1849 if v.startswith('"'):
1850 v = re.sub(r"\\'", "'", v[1:-1])
1851 elif v.startswith("'"):
1853 v = re.sub(r"\\\\|\\'|\"", lambda m: {
1860 res = re.sub(r'''(?x)
1861 "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
1862 '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
1863 [a-zA-Z_][.a-zA-Z_0-9]*
1865 res = re.sub(r',(\s*[\]}])', lambda m: m.group(1), res)
1869 def qualities(quality_ids):
1870 """ Get a numeric quality value out of a list of possible values """
1873 return quality_ids.index(qid)
# Default output filename template, e.g. "Some Title-abc123.mp4"
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1882 def limit_length(s, length):
1883 """ Add ellipses to overly long strings """
1888 return s[:length - len(ELLIPSES)] + ELLIPSES
def version_tuple(v):
    """Split a dotted/dashed version string into a tuple of ints,
    e.g. '2016.06.03' -> (2016, 6, 3)."""
    return tuple(map(int, re.split(r'[-.]', v)))
1896 def is_outdated_version(version, limit, assume_new=True):
1898 return not assume_new
1900 return version_tuple(version) < version_tuple(limit)
1902 return not assume_new
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter
    # Updateable when running from the zip bundle or a frozen executable
    loader = globals().get('__loader__')
    if isinstance(loader, zipimporter):
        return True
    return hasattr(sys, 'frozen')
def args_to_str(args):
    """Build a short, shell-quoted one-line representation of a command."""
    quoted = [shlex_quote(part) for part in args]
    return ' '.join(quoted)
1917 def error_to_compat_str(err):
1919 # On python 2 error byte string must be decoded with proper
1920 # encoding rather than ascii
1921 if sys.version_info[0] < 3:
1922 err_str = err_str.decode(preferredencoding())
1926 def mimetype2ext(mt):
1936 _, _, res = mt.rpartition('/')
1940 'smptett+xml': 'tt',
1946 'x-mp4-fragmented': 'mp4',
1951 def urlhandle_detect_ext(url_handle):
1954 getheader = lambda h: url_handle.headers[h]
1955 except AttributeError: # Python < 3
1956 getheader = url_handle.info().getheader
1958 cd = getheader('Content-Disposition')
1960 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
1962 e = determine_ext(m.group('filename'), default_ext=None)
1966 return mimetype2ext(getheader('Content-Type'))
def encode_data_uri(data, mime_type):
    """Build an RFC 2397 data: URI embedding *data* as base64."""
    b64 = base64.b64encode(data).decode('ascii')
    return 'data:{0};base64,{1}'.format(mime_type, b64)
1973 def age_restricted(content_limit, age_limit):
1974 """ Returns True iff the content should be blocked """
1976 if age_limit is None: # No limit set
1978 if content_limit is None:
1979 return False # Content available for everyone
1980 return age_limit < content_limit
1983 def is_html(first_bytes):
1984 """ Detect whether a file contains HTML by examining its first bytes. """
1987 (b'\xef\xbb\xbf', 'utf-8'),
1988 (b'\x00\x00\xfe\xff', 'utf-32-be'),
1989 (b'\xff\xfe\x00\x00', 'utf-32-le'),
1990 (b'\xff\xfe', 'utf-16-le'),
1991 (b'\xfe\xff', 'utf-16-be'),
1993 for bom, enc in BOMS:
1994 if first_bytes.startswith(bom):
1995 s = first_bytes[len(bom):].decode(enc, 'replace')
1998 s = first_bytes.decode('utf-8', 'replace')
2000 return re.match(r'^\s*<', s)
2003 def determine_protocol(info_dict):
2004 protocol = info_dict.get('protocol')
2005 if protocol is not None:
2008 url = info_dict['url']
2009 if url.startswith('rtmp'):
2011 elif url.startswith('mms'):
2013 elif url.startswith('rtsp'):
2016 ext = determine_ext(url)
2022 return compat_urllib_parse_urlparse(url).scheme
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    rows = [header_row] + data
    # Width of each column = longest stringified cell in that column
    widths = [max(len(compat_str(cell)) for cell in column) for column in zip(*rows)]
    # Left-align every column but the last (which needs no padding)
    fmt = ' '.join('%-' + compat_str(w + 1) + 's' for w in widths[:-1]) + '%s'
    return '\n'.join(fmt % tuple(row) for row in rows)
2033 def _match_one(filter_part, dct):
2034 COMPARISON_OPERATORS = {
2042 operator_rex = re.compile(r'''(?x)\s*
2044 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
2046 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
2047 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
2050 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
2051 m = operator_rex.search(filter_part)
2053 op = COMPARISON_OPERATORS[m.group('op')]
2054 if m.group('strval') is not None:
2055 if m.group('op') not in ('=', '!='):
2057 'Operator %s does not support string values!' % m.group('op'))
2058 comparison_value = m.group('strval')
2061 comparison_value = int(m.group('intval'))
2063 comparison_value = parse_filesize(m.group('intval'))
2064 if comparison_value is None:
2065 comparison_value = parse_filesize(m.group('intval') + 'B')
2066 if comparison_value is None:
2068 'Invalid integer value %r in filter part %r' % (
2069 m.group('intval'), filter_part))
2070 actual_value = dct.get(m.group('key'))
2071 if actual_value is None:
2072 return m.group('none_inclusive')
2073 return op(actual_value, comparison_value)
2076 '': lambda v: v is not None,
2077 '!': lambda v: v is None,
2079 operator_rex = re.compile(r'''(?x)\s*
2080 (?P<op>%s)\s*(?P<key>[a-z_]+)
2082 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
2083 m = operator_rex.search(filter_part)
2085 op = UNARY_OPERATORS[m.group('op')]
2086 actual_value = dct.get(m.group('key'))
2087 return op(actual_value)
2089 raise ValueError('Invalid filter part %r' % filter_part)
2092 def match_str(filter_str, dct):
2093 """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
2096 _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
2099 def match_filter_func(filter_str):
2100 def _match_func(info_dict):
2101 if match_str(filter_str, info_dict):
2104 video_title = info_dict.get('title', info_dict.get('id', 'video'))
2105 return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
2109 def parse_dfxp_time_expr(time_expr):
2113 mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
2115 return float(mobj.group('time_offset'))
2117 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
2119 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
2122 def srt_subtitles_timecode(seconds):
2123 return '%02d:%02d:%02d,%03d' % (seconds / 3600, (seconds % 3600) / 60, seconds % 60, (seconds % 1) * 1000)
2126 def dfxp2srt(dfxp_data):
2127 _x = functools.partial(xpath_with_ns, ns_map={
2128 'ttml': 'http://www.w3.org/ns/ttml',
2129 'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
2130 'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',
2133 class TTMLPElementParser(object):
2136 def start(self, tag, attrib):
2137 if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
2143 def data(self, data):
2147 return self.out.strip()
2149 def parse_node(node):
2150 target = TTMLPElementParser()
2151 parser = xml.etree.ElementTree.XMLParser(target=target)
2152 parser.feed(xml.etree.ElementTree.tostring(node))
2153 return parser.close()
2155 dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
2157 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall(_x('.//ttaf1_0604:p')) or dfxp.findall('.//p')
2160 raise ValueError('Invalid dfxp/TTML subtitle')
2162 for para, index in zip(paras, itertools.count(1)):
2163 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
2164 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
2165 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
2166 if begin_time is None:
2171 end_time = begin_time + dur
2172 out.append('%d\n%s --> %s\n%s\n\n' % (
2174 srt_subtitles_timecode(begin_time),
2175 srt_subtitles_timecode(end_time),
def cli_option(params, command_option, param):
    """Return [command_option, value] when params[param] is set, else []."""
    value = params.get(param)
    if value is None:
        return []
    return [command_option, value]
2186 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
2187 param = params.get(param)
2188 assert isinstance(param, bool)
2190 return [command_option + separator + (true_value if param else false_value)]
2191 return [command_option, true_value if param else false_value]
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return [command_option] when params[param] equals *expected_value*."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
2199 def cli_configuration_args(params, param, default=[]):
2200 ex_args = params.get(param)
2203 assert isinstance(ex_args, list)
2207 class ISO639Utils(object):
2208 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
2397 def short2long(cls, code):
2398 """Convert language code from ISO 639-1 to ISO 639-2/T"""
2399 return cls._lang_map.get(code[:2])
2402 def long2short(cls, code):
2403 """Convert language code from ISO 639-2/T to ISO 639-1"""
2404 for short_name, long_name in cls._lang_map.items():
2405 if long_name == code:
2409 class ISO3166Utils(object):
2410 # From http://data.okfn.org/data/core/country-list
2412 'AF': 'Afghanistan',
2413 'AX': 'Åland Islands',
2416 'AS': 'American Samoa',
2421 'AG': 'Antigua and Barbuda',
2438 'BO': 'Bolivia, Plurinational State of',
2439 'BQ': 'Bonaire, Sint Eustatius and Saba',
2440 'BA': 'Bosnia and Herzegovina',
2442 'BV': 'Bouvet Island',
2444 'IO': 'British Indian Ocean Territory',
2445 'BN': 'Brunei Darussalam',
2447 'BF': 'Burkina Faso',
2453 'KY': 'Cayman Islands',
2454 'CF': 'Central African Republic',
2458 'CX': 'Christmas Island',
2459 'CC': 'Cocos (Keeling) Islands',
2463 'CD': 'Congo, the Democratic Republic of the',
2464 'CK': 'Cook Islands',
2466 'CI': 'Côte d\'Ivoire',
2471 'CZ': 'Czech Republic',
2475 'DO': 'Dominican Republic',
2478 'SV': 'El Salvador',
2479 'GQ': 'Equatorial Guinea',
2483 'FK': 'Falkland Islands (Malvinas)',
2484 'FO': 'Faroe Islands',
2488 'GF': 'French Guiana',
2489 'PF': 'French Polynesia',
2490 'TF': 'French Southern Territories',
2505 'GW': 'Guinea-Bissau',
2508 'HM': 'Heard Island and McDonald Islands',
2509 'VA': 'Holy See (Vatican City State)',
2516 'IR': 'Iran, Islamic Republic of',
2519 'IM': 'Isle of Man',
2529 'KP': 'Korea, Democratic People\'s Republic of',
2530 'KR': 'Korea, Republic of',
2533 'LA': 'Lao People\'s Democratic Republic',
2539 'LI': 'Liechtenstein',
2543 'MK': 'Macedonia, the Former Yugoslav Republic of',
2550 'MH': 'Marshall Islands',
2556 'FM': 'Micronesia, Federated States of',
2557 'MD': 'Moldova, Republic of',
2568 'NL': 'Netherlands',
2569 'NC': 'New Caledonia',
2570 'NZ': 'New Zealand',
2575 'NF': 'Norfolk Island',
2576 'MP': 'Northern Mariana Islands',
2581 'PS': 'Palestine, State of',
2583 'PG': 'Papua New Guinea',
2586 'PH': 'Philippines',
2590 'PR': 'Puerto Rico',
2594 'RU': 'Russian Federation',
2596 'BL': 'Saint Barthélemy',
2597 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
2598 'KN': 'Saint Kitts and Nevis',
2599 'LC': 'Saint Lucia',
2600 'MF': 'Saint Martin (French part)',
2601 'PM': 'Saint Pierre and Miquelon',
2602 'VC': 'Saint Vincent and the Grenadines',
2605 'ST': 'Sao Tome and Principe',
2606 'SA': 'Saudi Arabia',
2610 'SL': 'Sierra Leone',
2612 'SX': 'Sint Maarten (Dutch part)',
2615 'SB': 'Solomon Islands',
2617 'ZA': 'South Africa',
2618 'GS': 'South Georgia and the South Sandwich Islands',
2619 'SS': 'South Sudan',
2624 'SJ': 'Svalbard and Jan Mayen',
2627 'CH': 'Switzerland',
2628 'SY': 'Syrian Arab Republic',
2629 'TW': 'Taiwan, Province of China',
2631 'TZ': 'Tanzania, United Republic of',
2633 'TL': 'Timor-Leste',
2637 'TT': 'Trinidad and Tobago',
2640 'TM': 'Turkmenistan',
2641 'TC': 'Turks and Caicos Islands',
2645 'AE': 'United Arab Emirates',
2646 'GB': 'United Kingdom',
2647 'US': 'United States',
2648 'UM': 'United States Minor Outlying Islands',
2652 'VE': 'Venezuela, Bolivarian Republic of',
2654 'VG': 'Virgin Islands, British',
2655 'VI': 'Virgin Islands, U.S.',
2656 'WF': 'Wallis and Futuna',
2657 'EH': 'Western Sahara',
2664 def short2full(cls, code):
2665 """Convert an ISO 3166-2 country code to the corresponding full name"""
2666 return cls._country_map.get(code.upper())
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """Proxy handler that honours a per-request 'Ytdl-request-proxy'
    header in addition to the globally configured proxies."""

    def __init__(self, proxies=None):
        # Set default handlers
        for type in ('http', 'https'):
            # Keyword defaults bind the loop variables early (late-binding
            # closure fix); '__noproxy__' marks "no proxy configured".
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                    meth(r, proxy, type))
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        # Per-request override travels in an internal header
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            # NOTE(review): presumably *proxy* is replaced by req_proxy here
            # (assignment not visible in this view) -- confirm
            del req.headers['Ytdl-request-proxy']
        if proxy == '__noproxy__':
            return None  # No Proxy
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
def ohdave_rsa_encrypt(data, exponent, modulus):
    """
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    """
    # Bytes are interpreted little-endian (hence the reversal) as one integer
    message = int(binascii.hexlify(data[::-1]), 16)
    ciphertext = pow(message, exponent, modulus)
    return '%x' % ciphertext
2707 def encode_base_n(num, n, table=None):
2708 FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
2710 table = FULL_TABLE[:n]
2713 raise ValueError('base %d exceeds table length %d' % (n, len(table)))
2720 ret = table[num % n] + ret
2725 def decode_packed_codes(code):
2727 r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)",
2729 obfucasted_code, base, count, symbols = mobj.groups()
2732 symbols = symbols.split('|')
2737 base_n_count = encode_base_n(count, base)
2738 symbol_table[base_n_count] = symbols[count] or base_n_count
2741 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],