2 # -*- coding: utf-8 -*-
4 from __future__ import unicode_literals
34 import xml.etree.ElementTree
41 compat_etree_fromstring,
46 compat_socket_create_connection,
50 compat_urllib_parse_urlparse,
51 compat_urllib_request,
57 # This is not clearly defined otherwise
58 compiled_regex_type = type(re.compile(''))
61 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/44.0 (Chrome)',
62 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
63 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
64 'Accept-Encoding': 'gzip, deflate',
65 'Accept-Language': 'en-us,en;q=0.5',
71 ENGLISH_MONTH_NAMES = [
72 'January', 'February', 'March', 'April', 'May', 'June',
73 'July', 'August', 'September', 'October', 'November', 'December']
76 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
77 'flv', 'f4v', 'f4a', 'f4b',
78 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
88 'f4f', 'f4m', 'm3u8', 'smil')
91 def preferredencoding():
92 """Get preferred encoding.
94 Returns the best encoding scheme for the system, based on
95 locale.getpreferredencoding() and some further tweaks.
98 pref = locale.getpreferredencoding()
106 def write_json_file(obj, fn):
107 """ Encode obj as JSON and write it to fn, atomically if possible """
109 fn = encodeFilename(fn)
110 if sys.version_info < (3, 0) and sys.platform != 'win32':
111 encoding = get_filesystem_encoding()
112 # os.path.basename returns a bytes object, but NamedTemporaryFile
113 # will fail if the filename contains non ascii characters unless we
114 # use a unicode object
115 path_basename = lambda f: os.path.basename(fn).decode(encoding)
116 # the same for os.path.dirname
117 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
119 path_basename = os.path.basename
120 path_dirname = os.path.dirname
124 'prefix': path_basename(fn) + '.',
125 'dir': path_dirname(fn),
129 # In Python 2.x, json.dump expects a bytestream.
130 # In Python 3.x, it writes to a character stream
131 if sys.version_info < (3, 0):
139 tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
144 if sys.platform == 'win32':
145 # Need to remove existing file on Windows, else os.rename raises
146 # WindowsError or FileExistsError.
151 os.rename(tf.name, fn)
160 if sys.version_info >= (2, 7):
161 def find_xpath_attr(node, xpath, key, val=None):
162 """ Find the xpath xpath[@key=val] """
163 assert re.match(r'^[a-zA-Z_-]+$', key)
164 expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
165 return node.find(expr)
167 def find_xpath_attr(node, xpath, key, val=None):
168 # Here comes the crazy part: In 2.6, if the xpath is a unicode,
169 # .//node does not match if a node is a direct child of . !
170 if isinstance(xpath, compat_str):
171 xpath = xpath.encode('ascii')
173 for f in node.findall(xpath):
174 if key not in f.attrib:
176 if val is None or f.attrib.get(key) == val:
180 # On python2.6 the xml.etree.ElementTree.Element methods don't support
181 # the namespace parameter
184 def xpath_with_ns(path, ns_map):
185 components = [c.split(':') for c in path.split('/')]
189 replaced.append(c[0])
192 replaced.append('{%s}%s' % (ns_map[ns], tag))
193 return '/'.join(replaced)
196 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
197 def _find_xpath(xpath):
198 if sys.version_info < (2, 7): # Crazy 2.6
199 xpath = xpath.encode('ascii')
200 return node.find(xpath)
202 if isinstance(xpath, (str, compat_str)):
203 n = _find_xpath(xpath)
211 if default is not NO_DEFAULT:
214 name = xpath if name is None else name
215 raise ExtractorError('Could not find XML element %s' % name)
221 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
222 n = xpath_element(node, xpath, name, fatal=fatal, default=default)
223 if n is None or n == default:
226 if default is not NO_DEFAULT:
229 name = xpath if name is None else name
230 raise ExtractorError('Could not find XML element\'s text %s' % name)
236 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
237 n = find_xpath_attr(node, xpath, key)
239 if default is not NO_DEFAULT:
242 name = '%s[@%s]' % (xpath, key) if name is None else name
243 raise ExtractorError('Could not find XML attribute %s' % name)
def get_element_by_id(id, html):
    """Fetch the content of the HTML tag whose id attribute equals *id*.

    Thin convenience wrapper around get_element_by_attribute().
    """
    return get_element_by_attribute('id', id, html)
254 def get_element_by_attribute(attribute, value, html):
255 """Return the content of the tag with the specified attribute in the passed HTML document"""
257 m = re.search(r'''(?xs)
259 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
261 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
265 ''' % (re.escape(attribute), re.escape(value)), html)
269 res = m.group('content')
271 if res.startswith('"') or res.startswith("'"):
274 return unescapeHTML(res)
276 class HTMLAttributeParser(compat_HTMLParser):
277 """Trivial HTML parser to gather the attributes for a single element"""
280 compat_HTMLParser.__init__(self)
282 def handle_starttag(self, tag, attrs):
283 self.attrs = dict(attrs)
285 def extract_attributes(html_element):
286 """Given a string for an HTML element such as
288 a="foo" B="bar" c="&98;az" d=boz
289 empty= noval entity="&"
292 Decode and return a dictionary of attributes.
294 'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
295 'empty': '', 'noval': None, 'entity': '&',
296 'sq': '"', 'dq': '\''
298 NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
299 but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
301 parser = HTMLAttributeParser()
302 parser.feed(html_element)
306 def clean_html(html):
307 """Clean an HTML snippet into a readable string"""
309 if html is None: # Convenience for sanitizing descriptions etc.
313 html = html.replace('\n', ' ')
314 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
315 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
317 html = re.sub('<.*?>', '', html)
318 # Replace html entities
319 html = unescapeHTML(html)
323 def sanitize_open(filename, open_mode):
324 """Try to open the given filename, and slightly tweak it if this fails.
326 Attempts to open the given filename. If this fails, it tries to change
327 the filename slightly, step by step, until it's either able to open it
328 or it fails and raises a final exception, like the standard open()
331 It returns the tuple (stream, definitive_file_name).
335 if sys.platform == 'win32':
337 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
338 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
339 stream = open(encodeFilename(filename), open_mode)
340 return (stream, filename)
341 except (IOError, OSError) as err:
342 if err.errno in (errno.EACCES,):
345 # In case of error, try to remove win32 forbidden chars
346 alt_filename = sanitize_path(filename)
347 if alt_filename == filename:
350 # An exception here should be caught in the caller
351 stream = open(encodeFilename(alt_filename), open_mode)
352 return (stream, alt_filename)
355 def timeconvert(timestr):
356 """Convert RFC 2822 defined time string into system timestamp"""
358 timetuple = email.utils.parsedate_tz(timestr)
359 if timetuple is not None:
360 timestamp = email.utils.mktime_tz(timetuple)
364 def sanitize_filename(s, restricted=False, is_id=False):
365 """Sanitizes a string so it could be used as part of a filename.
366 If restricted is set, use a stricter subset of allowed characters.
367 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
369 def replace_insane(char):
370 if char == '?' or ord(char) < 32 or ord(char) == 127:
373 return '' if restricted else '\''
375 return '_-' if restricted else ' -'
376 elif char in '\\/|*<>':
378 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
380 if restricted and ord(char) > 127:
385 s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
386 result = ''.join(map(replace_insane, s))
388 while '__' in result:
389 result = result.replace('__', '_')
390 result = result.strip('_')
391 # Common case of "Foreign band name - English song title"
392 if restricted and result.startswith('-_'):
394 if result.startswith('-'):
395 result = '_' + result[len('-'):]
396 result = result.lstrip('.')
402 def sanitize_path(s):
403 """Sanitizes and normalizes path on Windows"""
404 if sys.platform != 'win32':
406 drive_or_unc, _ = os.path.splitdrive(s)
407 if sys.version_info < (2, 7) and not drive_or_unc:
408 drive_or_unc, _ = os.path.splitunc(s)
409 norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
413 path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part)
414 for path_part in norm_path]
416 sanitized_path.insert(0, drive_or_unc + os.path.sep)
417 return os.path.join(*sanitized_path)
420 # Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
421 # unwanted failures due to missing protocol
def sanitized_Request(url, *args, **kwargs):
    """Build a urllib Request, upgrading protocol-relative URLs.

    URLs beginning with '//' get an explicit 'http:' scheme prepended in
    order to mitigate failures caused by a missing protocol.
    """
    if url.startswith('//'):
        url = 'http:%s' % url
    return compat_urllib_request.Request(url, *args, **kwargs)
427 def orderedSet(iterable):
428 """ Remove all duplicates from the input iterable """
436 def _htmlentity_transform(entity):
437 """Transforms an HTML entity to a character."""
438 # Known non-numeric HTML entity
439 if entity in compat_html_entities.name2codepoint:
440 return compat_chr(compat_html_entities.name2codepoint[entity])
442 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
444 numstr = mobj.group(1)
445 if numstr.startswith('x'):
447 numstr = '0%s' % numstr
450 # See https://github.com/rg3/youtube-dl/issues/7518
452 return compat_chr(int(numstr, base))
456 # Unknown entity in name, return its literal representation
457 return '&%s;' % entity
463 assert type(s) == compat_str
466 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
469 def get_subprocess_encoding():
470 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
471 # For subprocess calls, encode with locale encoding
472 # Refer to http://stackoverflow.com/a/9951851/35070
473 encoding = preferredencoding()
475 encoding = sys.getfilesystemencoding()
481 def encodeFilename(s, for_subprocess=False):
483 @param s The name of the file
486 assert type(s) == compat_str
488 # Python 3 has a Unicode API
489 if sys.version_info >= (3, 0):
492 # Pass '' directly to use Unicode APIs on Windows 2000 and up
493 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
494 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
495 if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
498 return s.encode(get_subprocess_encoding(), 'ignore')
501 def decodeFilename(b, for_subprocess=False):
503 if sys.version_info >= (3, 0):
506 if not isinstance(b, bytes):
509 return b.decode(get_subprocess_encoding(), 'ignore')
def encodeArgument(s):
    """Encode a command-line argument for handing to a subprocess.

    Byte strings coming from legacy call sites are first decoded (they are
    expected to be plain ASCII); the text is then encoded via
    encodeFilename() with for_subprocess=True.
    """
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    if isinstance(s, compat_str):
        return encodeFilename(s, True)
    return encodeFilename(s.decode('ascii'), True)
def decodeArgument(b):
    """Decode a command-line argument (inverse of encodeArgument)."""
    return decodeFilename(b, True)
525 def decodeOption(optval):
528 if isinstance(optval, bytes):
529 optval = optval.decode(preferredencoding())
531 assert isinstance(optval, compat_str)
535 def formatSeconds(secs):
537 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
539 return '%d:%02d' % (secs // 60, secs % 60)
544 def make_HTTPS_handler(params, **kwargs):
545 opts_no_check_certificate = params.get('nocheckcertificate', False)
546 if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9
547 context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
548 if opts_no_check_certificate:
549 context.check_hostname = False
550 context.verify_mode = ssl.CERT_NONE
552 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
555 # (create_default_context present but HTTPSHandler has no context=)
558 if sys.version_info < (3, 2):
559 return YoutubeDLHTTPSHandler(params, **kwargs)
561 context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
562 context.verify_mode = (ssl.CERT_NONE
563 if opts_no_check_certificate
564 else ssl.CERT_REQUIRED)
565 context.set_default_verify_paths()
566 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
569 def bug_reports_message():
570 if ytdl_is_updateable():
571 update_cmd = 'type youtube-dl -U to update'
573 update_cmd = 'see https://yt-dl.org/update on how to update'
574 msg = '; please report this issue on https://yt-dl.org/bug .'
575 msg += ' Make sure you are using the latest version; %s.' % update_cmd
576 msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
580 class ExtractorError(Exception):
581 """Error during info extraction."""
583 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
584 """ tb, if given, is the original traceback (so that it can be printed out).
585 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
588 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
590 if video_id is not None:
591 msg = video_id + ': ' + msg
593 msg += ' (caused by %r)' % cause
595 msg += bug_reports_message()
596 super(ExtractorError, self).__init__(msg)
599 self.exc_info = sys.exc_info() # preserve original exception
601 self.video_id = video_id
603 def format_traceback(self):
604 if self.traceback is None:
606 return ''.join(traceback.format_tb(self.traceback))
class UnsupportedError(ExtractorError):
    """Raised for URLs that no extractor knows how to handle."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
class RegexNotFoundError(ExtractorError):
    """Raised when a mandatory regular expression fails to match."""
621 class DownloadError(Exception):
622 """Download Error exception.
624 This exception may be thrown by FileDownloader objects if they are not
625 configured to continue on errors. They will contain the appropriate
629 def __init__(self, msg, exc_info=None):
630 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
631 super(DownloadError, self).__init__(msg)
632 self.exc_info = exc_info
635 class SameFileError(Exception):
636 """Same File exception.
638 This exception will be thrown by FileDownloader objects if they detect
639 multiple files would have to be downloaded to the same file on disk.
644 class PostProcessingError(Exception):
645 """Post Processing exception.
647 This exception may be raised by PostProcessor's .run() method to
648 indicate an error in the postprocessing task.
651 def __init__(self, msg):
655 class MaxDownloadsReached(Exception):
656 """ --max-downloads limit has been reached. """
660 class UnavailableVideoError(Exception):
661 """Unavailable Format exception.
663 This exception will be thrown when a video is requested
664 in a format that is not available for that video.
669 class ContentTooShortError(Exception):
670 """Content Too Short exception.
672 This exception may be raised by FileDownloader objects when a file they
673 download is too small for what the server announced first, indicating
674 the connection was probably interrupted.
677 def __init__(self, downloaded, expected):
679 self.downloaded = downloaded
680 self.expected = expected
683 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
684 # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
685 # expected HTTP responses to meet HTTP/1.0 or later (see also
686 # https://github.com/rg3/youtube-dl/issues/6727)
687 if sys.version_info < (3, 0):
688 kwargs[b'strict'] = True
689 hc = http_class(*args, **kwargs)
690 source_address = ydl_handler._params.get('source_address')
691 if source_address is not None:
692 sa = (source_address, 0)
693 if hasattr(hc, 'source_address'): # Python 2.7+
694 hc.source_address = sa
696 def _hc_connect(self, *args, **kwargs):
697 sock = compat_socket_create_connection(
698 (self.host, self.port), self.timeout, sa)
700 self.sock = ssl.wrap_socket(
701 sock, self.key_file, self.cert_file,
702 ssl_version=ssl.PROTOCOL_TLSv1)
705 hc.connect = functools.partial(_hc_connect, hc)
def handle_youtubedl_headers(headers):
    """Strip the internal 'Youtubedl-no-compression' pseudo header.

    When that marker is present, drop it together with any Accept-Encoding
    header (matched case-insensitively) so the request is sent without
    compression; the mapping is copied before modification.  Otherwise
    *headers* is returned unchanged.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers
    # Copy while filtering out Accept-Encoding (dict() constructor form is
    # kept for Python 2.6 compatibility, as used elsewhere in this file).
    cleaned = dict((name, value) for name, value in headers.items()
                   if name.lower() != 'accept-encoding')
    del cleaned['Youtubedl-no-compression']
    return cleaned
720 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
721 """Handler for HTTP requests and responses.
723 This class, when installed with an OpenerDirector, automatically adds
724 the standard headers to every HTTP request and handles gzipped and
725 deflated responses from web servers. If compression is to be avoided in
726 a particular request, the original request in the program code only has
727 to include the HTTP header "Youtubedl-no-compression", which will be
728 removed before making the real request.
730 Part of this code was copied from:
732 http://techknack.net/python-urllib2-handlers/
734 Andrew Rowls, the author of that code, agreed to release it to the
738 def __init__(self, params, *args, **kwargs):
739 compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
740 self._params = params
742 def http_open(self, req):
743 return self.do_open(functools.partial(
744 _create_http_connection, self, compat_http_client.HTTPConnection, False),
750 return zlib.decompress(data, -zlib.MAX_WBITS)
752 return zlib.decompress(data)
755 def addinfourl_wrapper(stream, headers, url, code):
756 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
757 return compat_urllib_request.addinfourl(stream, headers, url, code)
758 ret = compat_urllib_request.addinfourl(stream, headers, url)
762 def http_request(self, req):
763 # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
764 # always respected by websites, some tend to give out URLs with non percent-encoded
765 # non-ASCII characters (see telemb.py, ard.py [#3412])
766 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
767 # To work around aforementioned issue we will replace request's original URL with
768 # percent-encoded one
769 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
770 # the code of this workaround has been moved here from YoutubeDL.urlopen()
771 url = req.get_full_url()
772 url_escaped = escape_url(url)
774 # Substitute URL if any change after escaping
775 if url != url_escaped:
776 req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
778 url_escaped, data=req.data, headers=req.headers,
779 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
780 new_req.timeout = req.timeout
783 for h, v in std_headers.items():
784 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
785 # The dict keys are capitalized because of this bug by urllib
786 if h.capitalize() not in req.headers:
789 req.headers = handle_youtubedl_headers(req.headers)
791 if sys.version_info < (2, 7) and '#' in req.get_full_url():
792 # Python 2.6 is brain-dead when it comes to fragments
793 req._Request__original = req._Request__original.partition('#')[0]
794 req._Request__r_type = req._Request__r_type.partition('#')[0]
798 def http_response(self, req, resp):
801 if resp.headers.get('Content-encoding', '') == 'gzip':
802 content = resp.read()
803 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
805 uncompressed = io.BytesIO(gz.read())
806 except IOError as original_ioerror:
807 # There may be junk add the end of the file
808 # See http://stackoverflow.com/q/4928560/35070 for details
809 for i in range(1, 1024):
811 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
812 uncompressed = io.BytesIO(gz.read())
817 raise original_ioerror
818 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
819 resp.msg = old_resp.msg
820 del resp.headers['Content-encoding']
822 if resp.headers.get('Content-encoding', '') == 'deflate':
823 gz = io.BytesIO(self.deflate(resp.read()))
824 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
825 resp.msg = old_resp.msg
826 del resp.headers['Content-encoding']
827 # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
828 # https://github.com/rg3/youtube-dl/issues/6457).
829 if 300 <= resp.code < 400:
830 location = resp.headers.get('Location')
832 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
833 if sys.version_info >= (3, 0):
834 location = location.encode('iso-8859-1').decode('utf-8')
835 location_escaped = escape_url(location)
836 if location != location_escaped:
837 del resp.headers['Location']
838 resp.headers['Location'] = location_escaped
841 https_request = http_request
842 https_response = http_response
845 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
846 def __init__(self, params, https_conn_class=None, *args, **kwargs):
847 compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
848 self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
849 self._params = params
851 def https_open(self, req):
853 if hasattr(self, '_context'): # python > 2.6
854 kwargs['context'] = self._context
855 if hasattr(self, '_check_hostname'): # python 3.x
856 kwargs['check_hostname'] = self._check_hostname
857 return self.do_open(functools.partial(
858 _create_http_connection, self, self._https_conn_class, True),
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor that routes HTTPS traffic through the same
    cookie hooks as plain HTTP."""

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on the next HTTP request in row if there are
        # non-ASCII characters in the Set-Cookie HTTP header of the last
        # response (see https://github.com/rg3/youtube-dl/issues/6769).
        # Percent-encoding the Set-Cookie header before HTTPCookieProcessor
        # starts processing it would at least prevent crashing; that
        # workaround is currently kept disabled:
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
886 def parse_iso8601(date_str, delimiter='T', timezone=None):
887 """ Return a UNIX timestamp from the given date """
892 date_str = re.sub(r'\.[0-9]+', '', date_str)
896 r'(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
899 timezone = datetime.timedelta()
901 date_str = date_str[:-len(m.group(0))]
902 if not m.group('sign'):
903 timezone = datetime.timedelta()
905 sign = 1 if m.group('sign') == '+' else -1
906 timezone = datetime.timedelta(
907 hours=sign * int(m.group('hours')),
908 minutes=sign * int(m.group('minutes')))
910 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
911 dt = datetime.datetime.strptime(date_str, date_format) - timezone
912 return calendar.timegm(dt.timetuple())
917 def unified_strdate(date_str, day_first=True):
918 """Return a string with the date in the format YYYYMMDD"""
924 date_str = date_str.replace(',', ' ')
925 # %z (UTC offset) is only supported in python>=3.2
926 if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
927 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
928 # Remove AM/PM + timezone
929 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
931 format_expressions = [
944 '%Y-%m-%d %H:%M:%S.%f',
947 '%Y-%m-%dT%H:%M:%SZ',
948 '%Y-%m-%dT%H:%M:%S.%fZ',
949 '%Y-%m-%dT%H:%M:%S.%f0Z',
951 '%Y-%m-%dT%H:%M:%S.%f',
955 format_expressions.extend([
963 format_expressions.extend([
970 for expression in format_expressions:
972 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
975 if upload_date is None:
976 timetuple = email.utils.parsedate_tz(date_str)
978 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
979 if upload_date is not None:
980 return compat_str(upload_date)
983 def determine_ext(url, default_ext='unknown_video'):
986 guess = url.partition('?')[0].rpartition('.')[2]
987 if re.match(r'^[A-Za-z0-9]+$', guess):
989 # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
990 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
991 return guess.rstrip('/')
def subtitles_filename(filename, sub_lang, sub_format):
    """Derive the subtitle file name for a media file.

    The media extension is stripped and '<language>.<format>' is appended,
    e.g. 'clip.mp4' -> 'clip.en.vtt'.
    """
    base = filename.rsplit('.', 1)[0]
    return '.'.join((base, sub_lang, sub_format))
1000 def date_from_str(date_str):
1002 Return a datetime object from a string in the format YYYYMMDD or
1003 (now|today)[+-][0-9](day|week|month|year)(s)?"""
1004 today = datetime.date.today()
1005 if date_str in ('now', 'today'):
1007 if date_str == 'yesterday':
1008 return today - datetime.timedelta(days=1)
1009 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
1010 if match is not None:
1011 sign = match.group('sign')
1012 time = int(match.group('time'))
1015 unit = match.group('unit')
1016 # A bad approximation?
1020 elif unit == 'year':
1024 delta = datetime.timedelta(**{unit: time})
1025 return today + delta
1026 return datetime.datetime.strptime(date_str, '%Y%m%d').date()
1029 def hyphenate_date(date_str):
1031 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1032 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1033 if match is not None:
1034 return '-'.join(match.groups())
1039 class DateRange(object):
1040 """Represents a time interval between two dates"""
1042 def __init__(self, start=None, end=None):
1043 """start and end must be strings in the format accepted by date"""
1044 if start is not None:
1045 self.start = date_from_str(start)
1047 self.start = datetime.datetime.min.date()
1049 self.end = date_from_str(end)
1051 self.end = datetime.datetime.max.date()
1052 if self.start > self.end:
1053 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
1057 """Returns a range that only contains the given day"""
1058 return cls(day, day)
1060 def __contains__(self, date):
1061 """Check if the date is in the range"""
1062 if not isinstance(date, datetime.date):
1063 date = date_from_str(date)
1064 return self.start <= date <= self.end
1067 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
1070 def platform_name():
1071 """ Returns the platform name as a compat_str """
1072 res = platform.platform()
1073 if isinstance(res, bytes):
1074 res = res.decode(preferredencoding())
1076 assert isinstance(res, compat_str)
1080 def _windows_write_string(s, out):
1081 """ Returns True if the string was written using special methods,
1082 False if it has yet to be written out."""
1083 # Adapted from http://stackoverflow.com/a/3259271/35070
1086 import ctypes.wintypes
1094 fileno = out.fileno()
1095 except AttributeError:
1096 # If the output stream doesn't have a fileno, it's virtual
1098 except io.UnsupportedOperation:
1099 # Some strange Windows pseudo files?
1101 if fileno not in WIN_OUTPUT_IDS:
1104 GetStdHandle = ctypes.WINFUNCTYPE(
1105 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
1106 (b'GetStdHandle', ctypes.windll.kernel32))
1107 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
1109 WriteConsoleW = ctypes.WINFUNCTYPE(
1110 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
1111 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
1112 ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
1113 written = ctypes.wintypes.DWORD(0)
1115 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
1116 FILE_TYPE_CHAR = 0x0002
1117 FILE_TYPE_REMOTE = 0x8000
1118 GetConsoleMode = ctypes.WINFUNCTYPE(
1119 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
1120 ctypes.POINTER(ctypes.wintypes.DWORD))(
1121 (b'GetConsoleMode', ctypes.windll.kernel32))
1122 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
1124 def not_a_console(handle):
1125 if handle == INVALID_HANDLE_VALUE or handle is None:
1127 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
1128 GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
1130 if not_a_console(h):
1133 def next_nonbmp_pos(s):
1135 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
1136 except StopIteration:
1140 count = min(next_nonbmp_pos(s), 1024)
1142 ret = WriteConsoleW(
1143 h, s, count if count else 2, ctypes.byref(written), None)
1145 raise OSError('Failed to write string')
1146 if not count: # We just wrote a non-BMP character
1147 assert written.value == 2
1150 assert written.value > 0
1151 s = s[written.value:]
1155 def write_string(s, out=None, encoding=None):
1158 assert type(s) == compat_str
1160 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
1161 if _windows_write_string(s, out):
1164 if ('b' in getattr(out, 'mode', '') or
1165 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
1166 byt = s.encode(encoding or preferredencoding(), 'ignore')
1168 elif hasattr(out, 'buffer'):
1169 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1170 byt = s.encode(enc, 'ignore')
1171 out.buffer.write(byt)
1177 def bytes_to_intlist(bs):
1180 if isinstance(bs[0], int): # Python 3
1183 return [ord(c) for c in bs]
1186 def intlist_to_bytes(xs):
1189 return struct_pack('%dB' % len(xs), *xs)
1192 # Cross-platform file locking
1193 if sys.platform == 'win32':
1194 import ctypes.wintypes
1197 class OVERLAPPED(ctypes.Structure):
1199 ('Internal', ctypes.wintypes.LPVOID),
1200 ('InternalHigh', ctypes.wintypes.LPVOID),
1201 ('Offset', ctypes.wintypes.DWORD),
1202 ('OffsetHigh', ctypes.wintypes.DWORD),
1203 ('hEvent', ctypes.wintypes.HANDLE),
1206 kernel32 = ctypes.windll.kernel32
1207 LockFileEx = kernel32.LockFileEx
1208 LockFileEx.argtypes = [
1209 ctypes.wintypes.HANDLE, # hFile
1210 ctypes.wintypes.DWORD, # dwFlags
1211 ctypes.wintypes.DWORD, # dwReserved
1212 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1213 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1214 ctypes.POINTER(OVERLAPPED) # Overlapped
1216 LockFileEx.restype = ctypes.wintypes.BOOL
1217 UnlockFileEx = kernel32.UnlockFileEx
1218 UnlockFileEx.argtypes = [
1219 ctypes.wintypes.HANDLE, # hFile
1220 ctypes.wintypes.DWORD, # dwReserved
1221 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1222 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1223 ctypes.POINTER(OVERLAPPED) # Overlapped
1225 UnlockFileEx.restype = ctypes.wintypes.BOOL
1226 whole_low = 0xffffffff
1227 whole_high = 0x7fffffff
1229 def _lock_file(f, exclusive):
1230 overlapped = OVERLAPPED()
1231 overlapped.Offset = 0
1232 overlapped.OffsetHigh = 0
1233 overlapped.hEvent = 0
1234 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1235 handle = msvcrt.get_osfhandle(f.fileno())
1236 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
1237 whole_low, whole_high, f._lock_file_overlapped_p):
1238 raise OSError('Locking file failed: %r' % ctypes.FormatError())
def _unlock_file(f):
    """Release the byte range locked on f by _lock_file (Windows path)."""
    # The OVERLAPPED pointer stashed by _lock_file identifies the exact
    # region that must be unlocked.
    assert f._lock_file_overlapped_p
    os_handle = msvcrt.get_osfhandle(f.fileno())
    ok = UnlockFileEx(os_handle, 0,
                      whole_low, whole_high, f._lock_file_overlapped_p)
    if not ok:
        raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1250 def _lock_file(f, exclusive):
1251 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1253 def _unlock_file(f):
1254 fcntl.flock(f, fcntl.LOCK_UN)
class locked_file(object):
    """File wrapper that holds an advisory lock for the life of a `with` block.

    A shared lock is taken for read mode, an exclusive one for write/append.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        # Remembered so __enter__ can pick shared vs. exclusive locking.
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except IOError:
            # Locking failed: don't leak the open file descriptor.
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), falling back to 'utf-8' when unset."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
def shell_quote(args):
    """Quote each argument for safe display as a single shell command line."""
    quoted_args = []
    encoding = get_filesystem_encoding()
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(pipes.quote(a))
    return ' '.join(quoted_args)
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    sdata = compat_urllib_parse.urlencode(payload)
    return '#'.join((url, sdata))
def unsmuggle_url(smug_url, default=None):
    """Extract data smuggled by smuggle_url(); returns (url, data).

    When no smuggled payload is present, returns (smug_url, default).
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    data = json.loads(jsond)
    return url, data
def format_bytes(bytes):
    """Format a byte count as a human-readable string, e.g. 1536 -> '1.50KiB'.

    Accepts ints, floats or numeric strings; returns 'N/A' for None.
    """
    if bytes is None:
        return 'N/A'
    if isinstance(bytes, str):
        bytes = float(bytes)
    if bytes == 0.0:
        # math.log(0) would raise ValueError; zero bytes is plain 'B'.
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
    suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
def parse_filesize(s):
    """Parse a human-readable file size (e.g. '5 MiB') into a byte count.

    NOTE(review): the _UNIT_TABLE (suffix -> multiplier) and the guard /
    re.search framing lines are elided in this view; no-match behaviour
    cannot be confirmed from here.
    """
    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
    # A decimal comma is normalised to a dot before float() conversion.
    num_str = m.group('num').replace(',', '.')
    mult = _UNIT_TABLE[m.group('unit')]
    return int(float(num_str) * mult)
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """
    try:
        return ENGLISH_MONTH_NAMES.index(name) + 1
    except ValueError:
        # Lookup-style contract: unknown names yield None, not an exception.
        return None
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
    abbreviation """
    try:
        return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
    except ValueError:
        # Consistent with month_by_name: unknown abbreviations yield None.
        return None
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML, leaving existing entities alone."""
    return re.sub(
        # Negative lookahead keeps already-escaped entities untouched.
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
        '&amp;',
        xml_str)
def setproctitle(title):
    """Best-effort: set the process title via prctl(PR_SET_NAME) on glibc."""
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        # Not a glibc system; silently give up.
        return
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
    try:
        # 15 == PR_SET_NAME
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
def remove_start(s, start):
    """Return *s* with prefix *start* removed; *s* unchanged when absent."""
    if s.startswith(start):
        return s[len(start):]
    # Missing fallback in the original meant None was returned on no match.
    return s
def remove_end(s, end):
    """Return *s* with suffix *end* removed; *s* unchanged when absent/empty."""
    # Guard on `end` as well: s[:-0] would wrongly produce ''.
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
def remove_quotes(s):
    """Strip one pair of matching single or double quotes around *s*."""
    if s is None or len(s) < 2:
        return s
    for quote in ('"', "'", ):
        if s[0] == quote and s[-1] == quote:
            return s[1:-1]
    # No matching quote pair: return unchanged.
    return s
def url_basename(url):
    """Return the last path component of *url* ('' when the path is empty)."""
    url_path = compat_urlparse.urlparse(url).path
    parts = url_path.strip('/').split('/')
    return parts[-1]
class HEADRequest(compat_urllib_request.Request):
    """Request subclass that forces the HTTP verb to HEAD."""

    def get_method(self):
        return 'HEAD'
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert *v* to int, scaled by invscale // scale.

    Optionally reads attribute *get_attr* from v first. Returns *default*
    for None or empty-string input.
    """
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    if v == '':
        v = None
    return default if v is None else (int(v) * invscale // scale)
def str_or_none(v, default=None):
    """Return compat_str(v), or *default* when v is None."""
    if v is None:
        return default
    return compat_str(v)
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if int_str is None:
        return None
    # Strip thousands separators and stray '+' before converting.
    int_str = re.sub(r'[,\.\+]', '', int_str)
    return int(int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert *v* to float scaled by invscale / scale; *default* on None."""
    if v is None:
        return default
    return float(v) * invscale / scale
def parse_duration(s):
    """Parse a duration string ('3 min', '1:02:03', '10.5s', ...) to seconds.

    NOTE(review): the guard/`re.match` framing lines and regex wrapper are
    elided in this view; only the pattern alternatives and the accumulation
    are visible.
    """
    if not isinstance(s, compat_basestring):
            # Standalone minutes or hours ('10 min', '2 hours').
            (?P<only_mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*|
            (?P<only_hours>[0-9.]+)\s*(?:hours?)|
        # 'H:MM'-style with units spelled out, hours before minutes.
        \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?\.?|minutes?)\s*|
                # Optional days, hours, minutes, then seconds (+ms fraction).
                (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
                (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
            (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
            (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
    if m.group('only_mins'):
        return float_or_none(m.group('only_mins'), invscale=60)
    if m.group('only_hours'):
        return float_or_none(m.group('only_hours'), invscale=60 * 60)
        res += int(m.group('secs'))
    if m.group('mins_reversed'):
        res += int(m.group('mins_reversed')) * 60
        res += int(m.group('mins')) * 60
    if m.group('hours'):
        res += int(m.group('hours')) * 60 * 60
    if m.group('hours_reversed'):
        res += int(m.group('hours_reversed')) * 60 * 60
        res += int(m.group('days')) * 24 * 60 * 60
        res += float(m.group('ms'))
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert *ext* before the real extension: 'a.mp4' -> 'a.<ext>.mp4'.

    If *expected_real_ext* is given and does not match, *ext* is appended
    after the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    return (
        '{0}.{1}{2}'.format(name, ext, real_ext)
        if not expected_real_ext or real_ext[1:] == expected_real_ext
        else '{0}.{1}'.format(filename, ext))
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the filename's extension with *ext*.

    If *expected_real_ext* is given and does not match, *ext* is appended
    to the whole filename instead of replacing the existing extension.
    """
    name, real_ext = os.path.splitext(filename)
    return '{0}.{1}'.format(
        name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
        ext)
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    # NOTE: args is never mutated, so the mutable default is safe here.
    try:
        subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        return False
    return exe
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    try:
        out, _ = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
    except OSError:
        # Executable missing or not runnable.
        return False
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version number from *output* via *version_re*.

    Returns *unrecognized* when no version can be found.
    """
    assert isinstance(output, compat_str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
    if m:
        return m.group(1)
    else:
        return unrecognized
class PagedList(object):
    """Base class for lazily-paged result lists; subclasses define getslice()."""

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())
class OnDemandPagedList(PagedList):
    """PagedList that fetches pages lazily via pagefunc(pagenum)."""

    def __init__(self, pagefunc, pagesize, use_cache=False):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache
        if use_cache:
            self._cache = {}

    def getslice(self, start=0, end=None):
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = None
            if self._use_cache:
                page_results = self._cache.get(pagenum)
            if page_results is None:
                page_results = list(self._pagefunc(pagenum))
            if self._use_cache:
                self._cache[pagenum] = page_results

            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
class InAdvancePagedList(PagedList):
    """PagedList whose total page count is known in advance."""

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        res = []
        start_page = start // self._pagesize
        end_page = (
            self._pagecount if end is None else (end // self._pagesize + 1))
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                # Only the first fetched page may need a leading skip.
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
def uppercase_escape(s):
    """Decode literal '\\UXXXXXXXX' escape sequences found in *s*."""
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: unicode_escape(m.group(0))[0],
        s)
def lowercase_escape(s):
    """Decode literal '\\uXXXX' escape sequences found in *s*."""
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\u[0-9a-fA-F]{4}',
        lambda m: unicode_escape(m.group(0))[0],
        s)
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    needs_encoding = sys.version_info < (3, 0) and isinstance(s, compat_str)
    if needs_encoding:
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    url_parsed = compat_urllib_parse_urlparse(url)
    return url_parsed._replace(
        path=escape_rfc3986(url_parsed.path),
        params=escape_rfc3986(url_parsed.params),
        query=escape_rfc3986(url_parsed.query),
        fragment=escape_rfc3986(url_parsed.fragment)
    ).geturl()
try:
    # Probe: on broken Python 2.6/2.7 builds this raises for unicode specs.
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.pack(spec, *args)

    def struct_unpack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.unpack(spec, *args)
else:
    struct_pack = struct.pack
    struct_unpack = struct.unpack
def read_batch_urls(batch_fd):
    """Read a batch file of URLs, skipping comments/BOM; closes *batch_fd*."""
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = '\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        url = url.strip()
        # Lines starting with these characters are treated as comments.
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
def encode_dict(d, encoding='utf-8'):
    """Encode all string keys and values of *d* to bytes with *encoding*."""
    def encode(v):
        return v.encode(encoding) if isinstance(v, compat_basestring) else v
    return dict((encode(k), encode(v)) for k, v in d.items())
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Look up the first usable value in *d* for one key or a list of keys.

    None values (and, with skip_false_values, falsy ones) are skipped.
    """
    if isinstance(key_or_keys, (list, tuple)):
        for key in key_or_keys:
            if key not in d or d[key] is None or skip_false_values and not d[key]:
                continue
            return d[key]
        return default
    return d.get(key_or_keys, default)
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return *string* as compat_str, decoding with *encoding* when needed."""
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
def parse_age_limit(s):
    """Parse an age limit like '18' or '18+', falling back to US ratings."""
    if s is None:
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    return int(m.group('age')) if m else US_RATINGS.get(s)
def strip_jsonp(code):
    """Strip a JSONP callback wrapper, returning the bare JSON payload."""
    return re.sub(
        r'(?s)^[a-zA-Z0-9_.]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
def js_to_json(code):
    """Best-effort conversion of a JavaScript object literal to JSON text."""
    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        if v.startswith('"'):
            # Unescape \' inside double-quoted strings.
            v = re.sub(r"\\'", "'", v[1:-1])
        elif v.startswith("'"):
            # Re-quote single-quoted strings, fixing the escapes JSON needs.
            v = v[1:-1]
            v = re.sub(r"\\\\|\\'|\"", lambda m: {
                '\\\\': '\\\\',
                "\\'": "'",
                '"': '\\"',
            }[m.group(0)], v)
        return '"%s"' % v

    res = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
        [a-zA-Z_][.a-zA-Z_0-9]*
        ''', fix_kv, code)
    # Drop trailing commas before closing ] or }.
    res = re.sub(r',(\s*[\]}])', lambda m: m.group(1), res)
    return res
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        try:
            return quality_ids.index(qid)
        except ValueError:
            # Unknown quality ranks below every known one.
            return -1
    return q
# Default output filename template: "<title>-<id>.<ext>".
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) > length:
        # Truncate so the total length, including the ellipses, is *length*.
        return s[:length - len(ELLIPSES)] + ELLIPSES
    return s
def version_tuple(v):
    """Split a dotted/dashed version string into a tuple of ints."""
    return tuple(map(int, re.split(r'[-.]', v)))
def is_outdated_version(version, limit, assume_new=True):
    """Return True when *version* is older than *limit*.

    Unparsable or missing versions yield `not assume_new`.
    """
    if not version:
        return not assume_new
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return not assume_new
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter
    loader = globals().get('__loader__')
    if isinstance(loader, zipimporter):
        return True
    return hasattr(sys, 'frozen')
def args_to_str(args):
    # Get a short string representation for a subprocess command
    quoted = [shlex_quote(arg) for arg in args]
    return ' '.join(quoted)
def error_to_compat_str(err):
    """Return the message of *err* as a text string on both Python 2 and 3."""
    err_str = str(err)
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    if sys.version_info[0] < 3:
        err_str = err_str.decode(preferredencoding())
    return err_str
def mimetype2ext(mt):
    """Map a MIME type to a file extension.

    NOTE(review): the special-case table framing and the final return are
    elided in this view; only two mapping entries are visible.
    """
    # The subtype (after the '/') is the basis for the extension.
    _, _, res = mt.rpartition('/')
    'smptett+xml': 'tt',
    'x-mp4-fragmented': 'mp4',
def urlhandle_detect_ext(url_handle):
    """Guess a file extension for a URL response handle.

    NOTE(review): the try/if framing lines are elided in this view.
    """
    # Python 3: headers behaves like a mapping.
    getheader = lambda h: url_handle.headers[h]
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader

    # Prefer a filename advertised via Content-Disposition, if present.
    cd = getheader('Content-Disposition')
    m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
    e = determine_ext(m.group('filename'), default_ext=None)
    # Fall back to mapping the Content-Type MIME type.
    return mimetype2ext(getheader('Content-Type'))
def encode_data_uri(data, mime_type):
    """Build a base64 data: URI from raw bytes and a MIME type."""
    encoded = base64.b64encode(data).decode('ascii')
    return 'data:{0};base64,{1}'.format(mime_type, encoded)
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    if age_limit is None:  # No limit set
        return False
    if content_limit is None:
        return False  # Content available for everyone
    return age_limit < content_limit
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            s = first_bytes[len(bom):].decode(enc, 'replace')
            break
    else:
        # No BOM found: assume UTF-8.
        s = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', s)
def determine_protocol(info_dict):
    """Determine the download protocol of a format dict from its URL."""
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    if url.startswith('rtmp'):
        return 'rtmp'
    elif url.startswith('mms'):
        return 'mms'
    elif url.startswith('rtsp'):
        return 'rtsp'

    ext = determine_ext(url)
    if ext == 'm3u8':
        return 'm3u8'
    elif ext == 'f4m':
        return 'f4m'

    return compat_urllib_parse_urlparse(url).scheme
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    table = [header_row] + data
    max_lens = []
    for col in zip(*table):
        max_lens.append(max(len(compat_str(v)) for v in col))
    pieces = ['%-' + compat_str(width + 1) + 's' for width in max_lens[:-1]]
    format_str = ' '.join(pieces) + '%s'
    return '\n'.join(format_str % tuple(row) for row in table)
def _match_one(filter_part, dct):
    """Evaluate one '--match-filter' clause (e.g. 'duration > 60') against dct.

    NOTE(review): several framing lines (dict literal entries, 'if m:'
    guards and 'raise ValueError(' openers) are elided in this view.
    """
    COMPARISON_OPERATORS = {
    # Pattern: key, comparison operator, optional '?' (missing value passes),
    # then either a numeric value (with optional size suffix) or a string.
    operator_rex = re.compile(r'''(?x)\s*
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
        (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    op = COMPARISON_OPERATORS[m.group('op')]
    if m.group('strval') is not None:
        # Only equality / inequality make sense for string comparisons.
        if m.group('op') not in ('=', '!='):
            'Operator %s does not support string values!' % m.group('op'))
        comparison_value = m.group('strval')
        comparison_value = int(m.group('intval'))
        # Not a plain integer: try parsing as a file size ('500KiB', '1M').
        comparison_value = parse_filesize(m.group('intval'))
        if comparison_value is None:
            comparison_value = parse_filesize(m.group('intval') + 'B')
        if comparison_value is None:
            'Invalid integer value %r in filter part %r' % (
                m.group('intval'), filter_part))
    actual_value = dct.get(m.group('key'))
    if actual_value is None:
        # With the '?' suffix, a missing/None value passes the comparison.
        return m.group('none_inclusive')
    return op(actual_value, comparison_value)

    # Unary operators: '' tests presence, '!' tests absence of a key.
    '': lambda v: v is not None,
    '!': lambda v: v is None,
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    op = UNARY_OPERATORS[m.group('op')]
    actual_value = dct.get(m.group('key'))
    return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
    # Clauses joined with '&' must all hold.
    return all(
        _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
def match_filter_func(filter_str):
    """Build a match-filter callback: returns None to accept an info dict,
    or a human-readable skip message otherwise."""
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        else:
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP/TTML time expression ('12.3s' or 'HH:MM:SS.mmm') to seconds."""
    if not time_expr:
        return

    mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if mobj:
        return float(mobj.group('time_offset'))

    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if mobj:
        return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
def srt_subtitles_timecode(seconds):
    """Format a duration in seconds as an SRT timecode 'HH:MM:SS,mmm'."""
    hours = seconds / 3600
    minutes = (seconds % 3600) / 60
    secs = seconds % 60
    millis = (seconds % 1) * 1000
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
def dfxp2srt(dfxp_data):
    """Convert DFXP/TTML subtitle markup into SRT text.

    NOTE(review): several framing lines (list/guard statements, the 'out'
    accumulator setup and the final return) are elided in this view.
    """
    # Helper to qualify tag names in both TTML namespaces.
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',

    class TTMLPElementParser(object):
        def start(self, tag, attrib):
            # <br/> elements become newlines in the extracted text.
            if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):

        def data(self, data):

            return self.out.strip()

    def parse_node(node):
        # Serialize the node and re-feed it through TTMLPElementParser to
        # flatten it to plain text.
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
    # Paragraphs may live in either TTML namespace or none at all.
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p')
        raise ValueError('Invalid dfxp/TTML subtitle')

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
        # When no explicit end is given, derive it from begin + dur.
        end_time = begin_time + dur
        # SRT cue: index, "start --> end" timecodes, then the text.
        out.append('%d\n%s --> %s\n%s\n\n' % (
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
def cli_option(params, command_option, param):
    """Return [command_option, value] when params[param] is set, else []."""
    value = params.get(param)
    if value is None:
        return []
    return [command_option, value]
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Render a boolean param as CLI args, joined by *separator* if given."""
    param = params.get(param)
    assert isinstance(param, bool)
    if separator:
        return [command_option + separator + (true_value if param else false_value)]
    return [command_option, true_value if param else false_value]
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return [command_option] when params[param] equals *expected_value*."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
def cli_configuration_args(params, param, default=[]):
    """Return the list of extra CLI args stored in params, or *default*."""
    ex_args = params.get(param)
    if ex_args is None:
        return default
    assert isinstance(ex_args, list)
    return ex_args
class ISO639Utils(object):
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    # NOTE(review): the _lang_map table (ISO 639-1 -> 639-2/T) and the
    # @classmethod decorators are elided in this view; both methods read
    # cls._lang_map, and long2short's return line is elided as well.

    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        return cls._lang_map.get(code[:2])

    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        for short_name, long_name in cls._lang_map.items():
            if long_name == code:
2354 class ISO3166Utils(object):
2355 # From http://data.okfn.org/data/core/country-list
2357 'AF': 'Afghanistan',
2358 'AX': 'Ã…land Islands',
2361 'AS': 'American Samoa',
2366 'AG': 'Antigua and Barbuda',
2383 'BO': 'Bolivia, Plurinational State of',
2384 'BQ': 'Bonaire, Sint Eustatius and Saba',
2385 'BA': 'Bosnia and Herzegovina',
2387 'BV': 'Bouvet Island',
2389 'IO': 'British Indian Ocean Territory',
2390 'BN': 'Brunei Darussalam',
2392 'BF': 'Burkina Faso',
2398 'KY': 'Cayman Islands',
2399 'CF': 'Central African Republic',
2403 'CX': 'Christmas Island',
2404 'CC': 'Cocos (Keeling) Islands',
2408 'CD': 'Congo, the Democratic Republic of the',
2409 'CK': 'Cook Islands',
2411 'CI': 'Côte d\'Ivoire',
2416 'CZ': 'Czech Republic',
2420 'DO': 'Dominican Republic',
2423 'SV': 'El Salvador',
2424 'GQ': 'Equatorial Guinea',
2428 'FK': 'Falkland Islands (Malvinas)',
2429 'FO': 'Faroe Islands',
2433 'GF': 'French Guiana',
2434 'PF': 'French Polynesia',
2435 'TF': 'French Southern Territories',
2450 'GW': 'Guinea-Bissau',
2453 'HM': 'Heard Island and McDonald Islands',
2454 'VA': 'Holy See (Vatican City State)',
2461 'IR': 'Iran, Islamic Republic of',
2464 'IM': 'Isle of Man',
2474 'KP': 'Korea, Democratic People\'s Republic of',
2475 'KR': 'Korea, Republic of',
2478 'LA': 'Lao People\'s Democratic Republic',
2484 'LI': 'Liechtenstein',
2488 'MK': 'Macedonia, the Former Yugoslav Republic of',
2495 'MH': 'Marshall Islands',
2501 'FM': 'Micronesia, Federated States of',
2502 'MD': 'Moldova, Republic of',
2513 'NL': 'Netherlands',
2514 'NC': 'New Caledonia',
2515 'NZ': 'New Zealand',
2520 'NF': 'Norfolk Island',
2521 'MP': 'Northern Mariana Islands',
2526 'PS': 'Palestine, State of',
2528 'PG': 'Papua New Guinea',
2531 'PH': 'Philippines',
2535 'PR': 'Puerto Rico',
2539 'RU': 'Russian Federation',
2541 'BL': 'Saint Barthélemy',
2542 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
2543 'KN': 'Saint Kitts and Nevis',
2544 'LC': 'Saint Lucia',
2545 'MF': 'Saint Martin (French part)',
2546 'PM': 'Saint Pierre and Miquelon',
2547 'VC': 'Saint Vincent and the Grenadines',
2550 'ST': 'Sao Tome and Principe',
2551 'SA': 'Saudi Arabia',
2555 'SL': 'Sierra Leone',
2557 'SX': 'Sint Maarten (Dutch part)',
2560 'SB': 'Solomon Islands',
2562 'ZA': 'South Africa',
2563 'GS': 'South Georgia and the South Sandwich Islands',
2564 'SS': 'South Sudan',
2569 'SJ': 'Svalbard and Jan Mayen',
2572 'CH': 'Switzerland',
2573 'SY': 'Syrian Arab Republic',
2574 'TW': 'Taiwan, Province of China',
2576 'TZ': 'Tanzania, United Republic of',
2578 'TL': 'Timor-Leste',
2582 'TT': 'Trinidad and Tobago',
2585 'TM': 'Turkmenistan',
2586 'TC': 'Turks and Caicos Islands',
2590 'AE': 'United Arab Emirates',
2591 'GB': 'United Kingdom',
2592 'US': 'United States',
2593 'UM': 'United States Minor Outlying Islands',
2597 'VE': 'Venezuela, Bolivarian Republic of',
2599 'VG': 'Virgin Islands, British',
2600 'VI': 'Virgin Islands, U.S.',
2601 'WF': 'Wallis and Futuna',
2602 'EH': 'Western Sahara',
2609 def short2full(cls, code):
2610 """Convert an ISO 3166-2 country code to the corresponding full name"""
2611 return cls._country_map.get(code.upper())
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler that honours a per-request 'Ytdl-request-proxy' header."""

    def __init__(self, proxies=None):
        # Set default handlers
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            # Use the per-request proxy and strip the internal header.
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
def ohdave_rsa_encrypt(data, exponent, modulus):
    """
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    """
    # Bytes are reversed (little-endian) before interpretation as an integer.
    payload = int(binascii.hexlify(data[::-1]), 16)
    encrypted = pow(payload, exponent, modulus)
    return '%x' % encrypted
def encode_base_n(num, n, table=None):
    """Encode a non-negative integer *num* in base *n* using *table* digits."""
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    ret = ''
    while num:
        ret = table[num % n] + ret
        num = num // n
    return ret
2670 def decode_packed_codes(code):
2672 r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)",
2674 obfucasted_code, base, count, symbols = mobj.groups()
2677 symbols = symbols.split('|')
2682 base_n_count = encode_base_n(count, base)
2683 symbol_table[base_n_count] = symbols[count] or base_n_count
2686 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],