2 # -*- coding: utf-8 -*-
4 from __future__ import unicode_literals
33 import xml.etree.ElementTree
39 compat_etree_fromstring,
44 compat_socket_create_connection,
48 compat_urllib_parse_urlparse,
49 compat_urllib_request,
# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))

# Default HTTP headers sent with every request.
# NOTE(review): the enclosing `std_headers = {` opener/closer lines are not
# visible in this chunk.
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',

# Locale-independent English month names, used by the date helpers below.
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    # NOTE(review): the try/except validation and the return statement of
    # this function are not visible in this chunk.
    pref = locale.getpreferredencoding()
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        path_basename = lambda f: os.path.basename(fn).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
        # NOTE(review): the `else:` introducing these two assignments is not
        # visible in this chunk.
        path_basename = os.path.basename
        path_dirname = os.path.dirname

        # NOTE(review): the `args = {` dict opener around these entries is
        # not visible in this chunk.
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    if sys.platform == 'win32':
        # Need to remove existing file on Windows, else os.rename raises
        # WindowsError or FileExistsError.
    # Atomic replace of the destination with the fully-written temp file.
    os.rename(tf.name, fn)
# Two implementations of find_xpath_attr: ElementTree on >= 2.7 supports
# attribute predicates in xpath expressions; older versions need a manual scan.
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] """
        assert re.match(r'^[a-zA-Z_-]+$', key)
            # NOTE(review): this assert is guarded by an `if val:` not
            # visible in this chunk.
            assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
        expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
        return node.find(expr)
    # NOTE(review): the `else:` branch header for this fallback definition is
    # not visible in this chunk.
    def find_xpath_attr(node, xpath, key, val=None):
        # Here comes the crazy part: In 2.6, if the xpath is a unicode,
        # .//node does not match if a node is a direct child of . !
        if isinstance(xpath, compat_str):
            xpath = xpath.encode('ascii')

        for f in node.findall(xpath):
            if key not in f.attrib:
            if val is None or f.attrib.get(key) == val:
# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    # Expand `ns:tag` components of an xpath into `{uri}tag` using ns_map.
    components = [c.split(':') for c in path.split('/')]
            # NOTE(review): the loop and branch headers around these two
            # appends are not visible in this chunk.
            replaced.append(c[0])
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    # Find a single element; on failure return `default` if given, raise if
    # `fatal`, otherwise (presumably) return None — tail not visible here.
    def _find_xpath(xpath):
        if sys.version_info < (2, 7):  # Crazy 2.6
            xpath = xpath.encode('ascii')
        return node.find(xpath)

    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)

        if default is not NO_DEFAULT:
            # NOTE(review): the `return default` / `elif fatal:` lines
            # around here are not visible in this chunk.
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    # Like xpath_element but returns the element's text content.
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:

        if default is not NO_DEFAULT:
            # NOTE(review): surrounding return/elif lines are not visible
            # in this chunk.
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    # Find the element matching `xpath` carrying attribute `key` and
    # (presumably) return that attribute's value — tail not visible here.
    n = find_xpath_attr(node, xpath, key)
        if default is not NO_DEFAULT:
            # NOTE(review): surrounding return/elif lines are not visible
            # in this chunk.
            name = '%s[@%s]' % (xpath, key) if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # Thin convenience wrapper around the generic attribute-based lookup.
    return get_element_by_attribute("id", id, html)
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document"""

    m = re.search(r'''(?xs)
        (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
        (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
    ''' % (re.escape(attribute), re.escape(value)), html)
    # NOTE(review): several lines of this verbose regex (the opening tag,
    # the attribute match and the content group) are not visible here.

    res = m.group('content')

    # Strip surrounding quotes from the captured content, if any.
    if res.startswith('"') or res.startswith("'"):

    return unescapeHTML(res)
def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.

    # Newlines are collapsed; <br> and paragraph boundaries become '\n'.
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip any remaining tags (non-greedy, so nested text survives).
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
            # '-' means stdout; on Windows stdout must be switched to binary
            # mode first.  NOTE(review): the enclosing try/`if filename == '-':`
            # lines are not visible in this chunk.
            if sys.platform == 'win32':
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    # NOTE(review): the `timestamp = None` initialisation and the final
    # return are not visible in this chunk.
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        # Per-character replacement policy; several branch headers and
        # returns are not visible in this chunk.
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return '' if restricted else '\''
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
        if restricted and ord(char) > 127:

    # Handle timestamps: 12:34:56 -> 12_34_56 so ':' handling doesn't split them.
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(map(replace_insane, s))
        # Collapse duplicate underscores and trim leading/trailing ones.
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
def sanitize_path(s):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform != 'win32':
    drive_or_unc, _ = os.path.splitdrive(s)
    if sys.version_info < (2, 7) and not drive_or_unc:
        # splitunc is only needed (and only exists) on old Pythons.
        drive_or_unc, _ = os.path.splitunc(s)
    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
        # Replace characters forbidden in Windows path components with '#';
        # '.' and '..' components are preserved as-is.
        path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    return os.path.join(*sanitized_path)
# Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
# unwanted failures due to missing protocol
def sanitized_Request(url, *args, **kwargs):
    """Build a compat_urllib_request.Request, defaulting '//host/...' URLs to http."""
    if url.startswith('//'):
        url = 'http:%s' % url
    return compat_urllib_request.Request(url, *args, **kwargs)
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    # NOTE(review): the body of this function (accumulator loop and return)
    # is not visible in this chunk.
def _htmlentity_transform(entity):
    """Transforms an HTML entity to a character."""
    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Numeric character references: decimal (#123) or hex (#x7B).
    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            # Turn 'x7B' into '0x7B' so int() can parse it with base 16.
            numstr = '0%s' % numstr
        # See https://github.com/rg3/youtube-dl/issues/7518
            return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity
    # Body of unescapeHTML — its `def` line is not visible in this chunk.
    # Replaces `&name;` entities via _htmlentity_transform.
    assert type(s) == compat_str

        r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
def get_subprocess_encoding():
    # Pick the encoding used when talking to subprocesses; the final
    # fallback/return lines are not visible in this chunk.
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
        encoding = sys.getfilesystemencoding()
def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    """
    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):

    # Pass '' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:

    # Fallback: byte-encode with the subprocess encoding, dropping
    # unencodable characters.
    return s.encode(get_subprocess_encoding(), 'ignore')
def decodeFilename(b, for_subprocess=False):
    # Inverse of encodeFilename; the early-return lines for Python 3 and
    # for already-decoded input are not visible in this chunk.
    if sys.version_info >= (3, 0):

    if not isinstance(b, bytes):

    return b.decode(get_subprocess_encoding(), 'ignore')
def encodeArgument(s):
    """Encode a subprocess argument; legacy byte strings are decoded as ASCII first."""
    if isinstance(s, compat_str):
        return encodeFilename(s, True)
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    return encodeFilename(s.decode('ascii'), True)
def decodeArgument(b):
    # Symmetric to encodeArgument: decode with for_subprocess=True.
    return decodeFilename(b, True)
def decodeOption(optval):
    # Normalise a command-line option value to text; the None-guard and
    # final return are not visible in this chunk.
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
def formatSeconds(secs):
    # Format a duration as H:MM:SS or M:SS; the branch conditions selecting
    # between these returns are not visible in this chunk.
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
        return '%d:%02d' % (secs // 60, secs % 60)
def make_HTTPS_handler(params, **kwargs):
    # Build a YoutubeDLHTTPSHandler honouring the 'nocheckcertificate' option.
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
            # NOTE(review): the try/except around this return is not
            # visible in this chunk.
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
            # (create_default_context present but HTTPSHandler has no context=)

    if sys.version_info < (3, 2):
        return YoutubeDLHTTPSHandler(params, **kwargs)
        # Python 3.2/3.3: build an SSLContext by hand.
        context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
def bug_reports_message():
    # Build the standard "please report this" suffix appended to errors;
    # the else-branch header and final return are not visible in this chunk.
    if ytdl_is_updateable():
        update_cmd = 'type youtube-dl -U to update'
        update_cmd = 'see https://yt-dl.org/update on how to update'
    msg = '; please report this issue on https://yt-dl.org/bug .'
    msg += ' Make sure you are using the latest version; %s.' % update_cmd
    msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """
        # Network-related errors are treated as expected; the conditional
        # bodies around the following lines are not fully visible here.
        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
        if video_id is not None:
            msg = video_id + ': ' + msg
            msg += ' (caused by %r)' % cause
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)
        self.exc_info = sys.exc_info()  # preserve original exception
        self.video_id = video_id

    def format_traceback(self):
        # Render the stored traceback, if any, as a printable string.
        if self.traceback is None:
        return ''.join(traceback.format_tb(self.traceback))
class UnsupportedError(ExtractorError):
    """Raised when no extractor can handle the given URL (always 'expected')."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    # Marker subclass only — no extra behavior over ExtractorError.
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        # Original (exc_type, exc_value, traceback) triple, if provided.
        self.exc_info = exc_info
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        # NOTE(review): the constructor body (storing msg) is not visible
        # in this chunk.
# Control-flow exception: raised to stop processing once the limit is hit.
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Byte counts: how much was actually received vs. Content-Length.
        self.downloaded = downloaded
        self.expected = expected
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    # Build an HTTP(S) connection, optionally binding to a source address.
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/rg3/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs[b'strict'] = True
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')
    if source_address is not None:
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        # NOTE(review): the `else:` branch wrapping this fallback connect
        # implementation (for Python 2.6) is not visible in this chunk.
        def _hc_connect(self, *args, **kwargs):
            sock = compat_socket_create_connection(
                (self.host, self.port), self.timeout, sa)
                self.sock = ssl.wrap_socket(
                    sock, self.key_file, self.cert_file,
                    ssl_version=ssl.PROTOCOL_TLSv1)
            hc.connect = functools.partial(_hc_connect, hc)
def handle_youtubedl_headers(headers):
    """Strip internal youtube-dl control headers before a real request is made.

    When the marker header 'Youtubedl-no-compression' is present, a new dict
    is returned with that marker and any 'Accept-Encoding' header (matched
    case-insensitively) removed; otherwise the original mapping is returned
    unchanged (same object).
    """
    result = headers

    if 'Youtubedl-no-compression' in result:
        result = {k: v for k, v in result.items() if k.lower() != 'accept-encoding'}
        del result['Youtubedl-no-compression']

    return result
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        # Route connection creation through _create_http_connection so the
        # source_address workaround applies.
        return self.do_open(functools.partial(
            _create_http_connection, self, compat_http_client.HTTPConnection, False),

            # NOTE(review): these two returns belong to a `deflate` helper
            # whose def/try lines are not visible in this chunk: first try
            # raw-deflate, then zlib-wrapped.
            return zlib.decompress(data, -zlib.MAX_WBITS)
            return zlib.decompress(data)

    def addinfourl_wrapper(stream, headers, url, code):
        # Compatibility shim: older addinfourl has no `code` argument.
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
                url_escaped, data=req.data, headers=req.headers,
                origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
            new_req.timeout = req.timeout

        # Fill in any std_headers the caller did not set explicitly.
        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

    def http_response(self, req, resp):
        # Transparently decompress gzip/deflate bodies and re-escape the
        # Location header of redirects.
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk add the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/rg3/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped

    https_request = http_request
    https_response = http_response
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler that supports a custom connection class and SSL context."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        # Forward the SSL context / hostname checking options that the base
        # handler stored, when the running Python supports them.
        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname
        return self.do_open(functools.partial(
            _create_http_connection, self, self._https_conn_class, True),
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor wrapper (keeps the standard behavior)."""

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """

    # Drop fractional seconds — strptime below has no %f in its format.
    date_str = re.sub(r'\.[0-9]+', '', date_str)

        # NOTE(review): the re.search call and branch headers around this
        # timezone-suffix parsing are not visible in this chunk.
            r'(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
            timezone = datetime.timedelta()
            date_str = date_str[:-len(m.group(0))]
            if not m.group('sign'):
                timezone = datetime.timedelta()
                sign = 1 if m.group('sign') == '+' else -1
                timezone = datetime.timedelta(
                    hours=sign * int(m.group('hours')),
                    minutes=sign * int(m.group('minutes')))
    date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    dt = datetime.datetime.strptime(date_str, date_format) - timezone
    return calendar.timegm(dt.timetuple())
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""

    # Normalise separators before trying the format expressions below.
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
        date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # NOTE(review): only part of this format list (and the day_first
    # branches extending it) is visible in this chunk.
    format_expressions = [
        '%b %dst %Y %I:%M%p',
        '%b %dnd %Y %I:%M%p',
        '%b %dth %Y %I:%M%p',
        '%Y-%m-%d %H:%M:%S.%f',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S.%f',
        format_expressions.extend([
        format_expressions.extend([
    for expression in format_expressions:
            # First expression that parses wins (inside a try not visible here).
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
    if upload_date is None:
        # Last resort: RFC 2822-style parsing.
        timetuple = email.utils.parsedate_tz(date_str)
            upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    if upload_date is not None:
        return compat_str(upload_date)
def determine_ext(url, default_ext='unknown_video'):
    # Guess a file extension from a URL; query string is ignored.
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
    # Known media extensions even when followed by a trailing slash.
    elif guess.rstrip('/') in (
            'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
            'flv', 'f4v', 'f4a', 'f4b',
            'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
            'mkv', 'mka', 'mk3d',
            'f4f', 'f4m', 'm3u8', 'smil'):
        return guess.rstrip('/')
def subtitles_filename(filename, sub_lang, sub_format):
    """Derive a subtitle file name from a media file name: base.<lang>.<format>."""
    base = filename.rsplit('.', 1)[0]
    return '.'.join((base, sub_lang, sub_format))
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    # Relative dates like "today-2weeks".
    match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        unit = match.group('unit')
        # A bad aproximation?
        delta = datetime.timedelta(**{unit: time})
    # Absolute YYYYMMDD fallback.
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is not None:
        return '-'.join(match.groups())
    # NOTE(review): the non-matching fallback return is not visible in
    # this chunk.
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        if start is not None:
            self.start = date_from_str(start)
            # Unbounded start/end default to the datetime min/max dates; the
            # else/if headers around these lines are not visible here.
            self.start = datetime.datetime.min.date()
            self.end = date_from_str(end)
            self.end = datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

        # NOTE(review): the @classmethod/def header of `day` is not visible
        # in this chunk.
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

        # __str__ body (its def line is not visible in this chunk).
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
def platform_name():
    """ Returns the platform name as a compat_str """
    res = platform.platform()
    if isinstance(res, bytes):
        res = res.decode(preferredencoding())

    assert isinstance(res, compat_str)
    # NOTE(review): the final `return res` is not visible in this chunk.
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes.wintypes

        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
    if fileno not in WIN_OUTPUT_IDS:

    # Resolve the console handle for this fd via the Win32 API.
    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b"GetStdHandle", ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b"GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # A handle is only a real console if it is a character device for
        # which GetConsoleMode succeeds.
        if handle == INVALID_HANDLE_VALUE or handle is None:
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):

    def next_nonbmp_pos(s):
        # Index of the first character outside the Basic Multilingual Plane.
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:

        # Write in chunks, stopping before non-BMP characters which must be
        # written as a UTF-16 surrogate pair (count 2).
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            assert written.value > 0
            s = s[written.value:]
def write_string(s, out=None, encoding=None):
    # Write text to `out`, handling Windows consoles and byte streams.
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):

    if ('b' in getattr(out, 'mode', '') or
            sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        byt = s.encode(encoding or preferredencoding(), 'ignore')
    elif hasattr(out, 'buffer'):
        # Text stream with an underlying binary buffer: encode explicitly.
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        byt = s.encode(enc, 'ignore')
        out.buffer.write(byt)
def bytes_to_intlist(bs):
    # Convert a bytes/str object to a list of integer byte values; the
    # empty-input guard and Python 3 return are not visible in this chunk.
    if isinstance(bs[0], int):  # Python 3
        return [ord(c) for c in bs]
def intlist_to_bytes(xs):
    # Inverse of bytes_to_intlist: pack each int as one unsigned byte.
    return struct_pack('%dB' % len(xs), *xs)
# Cross-platform file locking
if sys.platform == 'win32':
    import ctypes.wintypes

    class OVERLAPPED(ctypes.Structure):
        # Win32 OVERLAPPED structure used by LockFileEx/UnlockFileEx.
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Lock the whole file (low/high 32-bit halves of the byte count).
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # 0x2 = LOCKFILE_EXCLUSIVE_LOCK
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

    # NOTE(review): the `else:` header (POSIX branch using fcntl) is not
    # visible in this chunk.
    def _lock_file(f, exclusive):
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        fcntl.flock(f, fcntl.LOCK_UN)
class locked_file(object):
    """File wrapper that holds a cross-platform lock while open (context manager)."""

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)

    def __enter__(self):
        # Readers take a shared lock, writers an exclusive one.
        exclusive = self.mode != 'r'
            _lock_file(self.f, exclusive)

    def __exit__(self, etype, value, traceback):
            _unlock_file(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
def get_filesystem_encoding():
    """Return the filesystem encoding, falling back to UTF-8 when unknown."""
    reported = sys.getfilesystemencoding()
    if reported is None:
        return 'utf-8'
    return reported
def shell_quote(args):
    # Render an argv-style list as one shell-escaped command-line string.
    encoding = get_filesystem_encoding()
    # NOTE(review): the accumulator init ('quoted_args = []') and the
    # 'for a in args:' loop header appear elided from this view.
    if isinstance(a, bytes):
        # We may get a filename encoded with 'encodeFilename'
        a = a.decode(encoding)
    quoted_args.append(pipes.quote(a))
    return ' '.join(quoted_args)
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # Serialize the payload and tuck it into the fragment, where servers
    # never see it.
    payload = json.dumps(data)
    fragment = compat_urllib_parse.urlencode({'__youtubedl_smuggle': payload})
    return '{0}#{1}'.format(url, fragment)
def unsmuggle_url(smug_url, default=None):
    """Split a URL produced by smuggle_url back into (url, data).

    Returns (smug_url, default) unchanged when no smuggled payload is
    present in the fragment.
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    data = json.loads(jsond)
    # Fix: the visible code never returned the decoded pair.
    return url, data
def format_bytes(bytes):
    """Format a byte count as a human-readable string, e.g. 1024 -> '1.00KiB'.

    Accepts ints, floats, numeric strings, or None ('N/A').
    """
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    # Fix: guard zero — math.log(0) raises ValueError.
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
    suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
def parse_filesize(s):
    """Parse a human file size ('5 MiB', '1,5KB', ...) into bytes (int).

    Returns None for None input or unrecognized strings.
    """
    if s is None:
        return None

    # The lower-case forms are of course incorrect and inofficial,
    # but we support those too
    _UNIT_TABLE = {
        'B': 1, 'b': 1,
        'KiB': 1024, 'KB': 1000, 'kB': 1024, 'Kb': 1000,
        'MiB': 1024 ** 2, 'MB': 1000 ** 2, 'mB': 1024 ** 2, 'Mb': 1000 ** 2,
        'GiB': 1024 ** 3, 'GB': 1000 ** 3, 'gB': 1024 ** 3, 'Gb': 1000 ** 3,
        'TiB': 1024 ** 4, 'TB': 1000 ** 4, 'tB': 1024 ** 4, 'Tb': 1000 ** 4,
        'PiB': 1024 ** 5, 'PB': 1000 ** 5, 'pB': 1024 ** 5, 'Pb': 1000 ** 5,
        'EiB': 1024 ** 6, 'EB': 1000 ** 6, 'eB': 1024 ** 6, 'Eb': 1000 ** 6,
        'ZiB': 1024 ** 7, 'ZB': 1000 ** 7, 'zB': 1024 ** 7, 'Zb': 1000 ** 7,
        'YiB': 1024 ** 8, 'YB': 1000 ** 8, 'yB': 1024 ** 8, 'Yb': 1000 ** 8,
    }

    units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
    if not m:
        return None

    # Accept a European decimal comma as well as a dot.
    num_str = m.group('num').replace(',', '.')
    mult = _UNIT_TABLE[m.group('unit')]
    return int(float(num_str) * mult)
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """
    try:
        return ENGLISH_MONTH_NAMES.index(name) + 1
    except ValueError:
        # Fix: unknown names must map to None instead of letting
        # list.index's ValueError escape to the caller.
        return None
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviation (e.g. 'Jan' -> 1) """
    try:
        return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
    except ValueError:
        # Fix: unknown abbreviations map to None instead of raising.
        return None
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    # Leave already-valid entities (&amp; &lt; &gt; &apos; &quot; and
    # numeric/hex character references) untouched via the lookahead.
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
        '&amp;',
        xml_str)
def setproctitle(title):
    # Rename the current process as shown by ps/top (Linux-only, via
    # libc prctl).
    assert isinstance(title, compat_str)
    # NOTE(review): the 'try:' wrapping LoadLibrary appears elided from
    # this view (the except below has no visible matching try).
    libc = ctypes.cdll.LoadLibrary("libc.so.6")
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
    # 15 is PR_SET_NAME on Linux.
    libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
def remove_start(s, start):
    """Strip the prefix `start` from `s` if present, else return s unchanged."""
    if s.startswith(start):
        return s[len(start):]
    # Fix: the visible code fell off the end (returned None) when the
    # prefix was absent.
    return s
def remove_end(s, end):
    """Strip the suffix `end` from `s` if present, else return s unchanged."""
    # Guard on a non-empty suffix: s[:-0] would wrongly yield ''. The
    # visible code also sliced unconditionally, mangling non-suffix input.
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
def remove_quotes(s):
    """Strip one matching pair of surrounding single or double quotes."""
    if s is None or len(s) < 2:
        return s
    for quote in ('"', "'", ):
        if s[0] == quote and s[-1] == quote:
            return s[1:-1]
    # Fix: the visible code had no fallthrough returns — unquoted (or
    # mismatched-quote) strings must come back unchanged.
    return s
def url_basename(url):
    # Last non-empty component of the URL's path ('' when the path is
    # empty or all slashes). Query and fragment are excluded by .path.
    path = compat_urlparse.urlparse(url).path
    return path.rstrip('/').rpartition('/')[2]
class HEADRequest(compat_urllib_request.Request):
    # A Request whose HTTP verb is HEAD instead of the default GET/POST.
    def get_method(self):
        # Fix: the visible method body had no return (would yield None
        # and break urllib's method dispatch).
        return 'HEAD'
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Coerce v to int (optionally via attribute get_attr, scaled by
    invscale/scale); return `default` when v (or the attribute) is None."""
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    # Fix: the visible code lacked the None guard and applied getattr
    # unconditionally.
    if v is None:
        return default
    return int(v) * invscale // scale
def str_or_none(v, default=None):
    # None maps to the default; everything else is coerced to compat_str.
    if v is None:
        return default
    return compat_str(v)
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    # Fix: guard None input and actually return the parsed value — the
    # visible code stripped separators but never converted or returned.
    if int_str is None:
        return None
    int_str = re.sub(r'[,\.\+]', '', int_str)
    return int(int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Coerce v to float scaled by invscale/scale; `default` when v is None."""
    # Fix: the visible code lacked the None guard, so float(None) would raise.
    if v is None:
        return default
    return float(v) * invscale / scale
def parse_duration(s):
    # Parse a free-form duration string ('12:34', '3 min 10s', '1.5 hours',
    # '1d 2h 3m 4.5s', ...) into seconds; non-strings yield None.
    if not isinstance(s, compat_basestring):
        # NOTE(review): 'return None' and the re.match(...) call that binds
        # m to the big pattern below appear elided from this view.

    # Alternatives of the duration regex: bare minutes, bare hours,
    # reversed '1 h 30 m' style, then full D:H:M:S forms with optional ms.
    (?P<only_mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*|
    (?P<only_hours>[0-9.]+)\s*(?:hours?)|
    \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?\.?|minutes?)\s*|
    (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
    (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
    (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
    # NOTE(review): the 'if not m: return None' guard and 'res = 0'
    # accumulator init appear elided from this view.
    if m.group('only_mins'):
        return float_or_none(m.group('only_mins'), invscale=60)
    if m.group('only_hours'):
        return float_or_none(m.group('only_hours'), invscale=60 * 60)
    # NOTE(review): 'if m.group('secs'):' guard appears elided.
    res += int(m.group('secs'))
    if m.group('mins_reversed'):
        res += int(m.group('mins_reversed')) * 60
    # NOTE(review): 'if m.group('mins'):' guard appears elided.
    res += int(m.group('mins')) * 60
    if m.group('hours'):
        res += int(m.group('hours')) * 60 * 60
    if m.group('hours_reversed'):
        res += int(m.group('hours_reversed')) * 60 * 60
    # NOTE(review): 'if m.group('days'):' guard appears elided.
    res += int(m.group('days')) * 24 * 60 * 60
    # NOTE(review): 'if m.group('ms'):' guard and final 'return res' elided.
    res += float(m.group('ms'))
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert `ext` before the filename's real extension ('a.mp4' ->
    'a.temp.mp4'); when `expected_real_ext` is given and does not match,
    append `ext` after the whole name instead."""
    name, real_ext = os.path.splitext(filename)
    # Fix: the visible code built the string as a bare expression with
    # no 'return (' — it had no effect.
    return (
        '{0}.{1}{2}'.format(name, ext, real_ext)
        if not expected_real_ext or real_ext[1:] == expected_real_ext
        else '{0}.{1}'.format(filename, ext))
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the filename's extension with `ext`; when
    `expected_real_ext` is given and does not match the current one,
    append `ext` to the full name instead of replacing."""
    name, real_ext = os.path.splitext(filename)
    # Fix: the visible format call was missing its final 'ext' argument.
    return '{0}.{1}'.format(
        name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
        ext)
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    try:
        subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        # Fix: a missing binary raises OSError from Popen; report False
        # instead of crashing.
        return False
    return exe
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    # NOTE(review): the 'try:' around Popen (returning False on OSError
    # when the binary is missing) appears elided from this view.
    out, _ = subprocess.Popen(
        [encodeArgument(exe)] + args,
        stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from an executable's --version output;
    return `unrecognized` when no version can be found."""
    assert isinstance(output, compat_str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
    # Fix: the visible code never examined the match or returned a value.
    if m:
        return m.group(1)
    else:
        return unrecognized
class PagedList(object):
    # Base class for lazily paged result lists; subclasses implement
    # getslice(start, end).

    # This is only useful for tests
    # NOTE(review): the 'def __len__(self):' header appears elided from
    # this view — the return below belongs to it.
    return len(self.getslice())
class OnDemandPagedList(PagedList):
    # Pages are fetched lazily, one pagefunc(pagenum) call at a time.
    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        # NOTE(review): the 'res = []' accumulator init appears elided.
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                # NOTE(review): 'continue' appears elided from this view.
            page_results = list(self._pagefunc(pagenum))
            # Offset of the first wanted element within this page.
            # NOTE(review): the 'startv = (' assignment head appears elided.
            start % self._pagesize
            if firstid <= start < nextfirstid
            # Offset one past the last wanted element within this page.
            # NOTE(review): the 'endv = (' assignment head appears elided.
            ((end - 1) % self._pagesize) + 1
            if (end is not None and firstid <= end <= nextfirstid)
            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                # NOTE(review): 'break' appears elided from this view.

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                # NOTE(review): 'break' and the final 'return res' appear elided.
class InAdvancePagedList(PagedList):
    # The total page count is known up front, so the needed page range is
    # computed directly instead of probing page by page.
    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        # NOTE(review): the 'res = []' accumulator init appears elided.
        start_page = start // self._pagesize
        # NOTE(review): the 'end_page = min(' head appears elided — the
        # expression below is its argument.
        self._pagecount if end is None else (end // self._pagesize + 1))
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            # NOTE(review): the 'if skip_elems:' guard (and resetting it
            # after the first page) appears elided from this view.
            page = page[skip_elems:]
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                # NOTE(review): the 'else:' truncation branch is partially
                # elided; appending the page and the final return are too.
                    page = page[:only_more]
def uppercase_escape(s):
    """Decode literal '\\UXXXXXXXX' escapes in s into the characters
    they denote, leaving everything else untouched."""
    unicode_escape = codecs.getdecoder('unicode_escape')
    # Fix: the visible code was missing the 'return re.sub(' head and the
    # subject string, so nothing was ever substituted or returned.
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: unicode_escape(m.group(0))[0],
        s)
def lowercase_escape(s):
    """Decode literal '\\uXXXX' escapes in s into the characters they
    denote, leaving everything else untouched."""
    unicode_escape = codecs.getdecoder('unicode_escape')
    # Fix: same as uppercase_escape — restore the re.sub call and return.
    return re.sub(
        r'\\u[0-9a-fA-F]{4}',
        lambda m: unicode_escape(m.group(0))[0],
        s)
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # Python 2 percent-encoding needs a byte string; encode unicode first.
    if sys.version_info < (3, 0) and isinstance(s, compat_str):
        s = s.encode('utf-8')
    # Characters already legal in URLs stay untouched.
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe_chars)
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    url_parsed = compat_urllib_parse_urlparse(url)
    # Fix: without .geturl() the visible code returned a ParseResult
    # namedtuple instead of the re-assembled URL string.
    return url_parsed._replace(
        path=escape_rfc3986(url_parsed.path),
        params=escape_rfc3986(url_parsed.params),
        query=escape_rfc3986(url_parsed.query),
        fragment=escape_rfc3986(url_parsed.fragment)
    ).geturl()
# Feature probe: very old Python 2.x struct.pack rejects unicode format
# strings with TypeError.
# NOTE(review): the enclosing 'try:' line appears elided from this view,
# as do the 'except TypeError:' / 'else:' branch headers below.
struct.pack('!I', 0)
# In Python 2.x, json.dump expects a bytestream.
# In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
def struct_pack(spec, *args):
    # Work around it by encoding a unicode spec to ASCII bytes first.
    if isinstance(spec, compat_str):
        spec = spec.encode('ascii')
    return struct.pack(spec, *args)

def struct_unpack(spec, *args):
    # Same workaround for unpacking.
    if isinstance(spec, compat_str):
        spec = spec.encode('ascii')
    return struct.unpack(spec, *args)

# Modern Pythons take unicode specs natively; alias the stdlib functions.
struct_pack = struct.pack
struct_unpack = struct.unpack
def read_batch_urls(batch_fd):
    # Read a batch file of URLs, skipping comment lines and stripping a
    # possible UTF-8 BOM, and close the file afterwards.
    # NOTE(review): the 'def fixup(url):' header appears elided from this
    # view — the lines below are its body.
    if not isinstance(url, compat_str):
        url = url.decode('utf-8', 'replace')
    # The UTF-8 BOM bytes as they appear after a naive text decode.
    BOM_UTF8 = '\xef\xbb\xbf'
    if url.startswith(BOM_UTF8):
        url = url[len(BOM_UTF8):]
    # NOTE(review): 'url = url.strip()' before this check appears elided.
    if url.startswith(('#', ';', ']')):
        # NOTE(review): returning a falsy value for comment lines elided.

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    # URL-encode the given mapping and return ASCII bytes suitable as a
    # POST body.
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
def encode_dict(d, encoding='utf-8'):
    """Return a copy of d with every string key/value encoded to bytes."""
    # Fix: the visible code had a dangling 'return' — the inner helper's
    # 'def encode(v):' header was missing.
    def encode(v):
        return v.encode(encoding) if isinstance(v, compat_basestring) else v
    return dict((encode(k), encode(v)) for k, v in d.items())
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    # Pass compat_str instances through untouched; decode anything else
    # (byte strings) to compat_str with the given encoding.
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
def parse_age_limit(s):
    """Parse an age limit like '18' or '16+' (or a US rating key) into an
    int; None input yields None."""
    # Fix: without this guard, re.match(None) raises TypeError.
    if s is None:
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    return int(m.group('age')) if m else US_RATINGS.get(s, None)
def strip_jsonp(code):
    """Strip a JSONP wrapper ('callback({...});') down to the bare JSON."""
    # Fix: the visible code was missing the 'return re.sub(' head.
    return re.sub(
        r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
def js_to_json(code):
    # Best-effort translation of JavaScript object literals into strict
    # JSON (quote style, escapes, trailing commas, bare identifiers).
    # NOTE(review): the inner 'def fix_kv(m):' header and its 'v = m.group(0)'
    # prologue appear elided from this view.
    if v in ('true', 'false', 'null'):
        # NOTE(review): 'return v' appears elided.
    if v.startswith('"'):
        v = re.sub(r"\\'", "'", v[1:-1])
    elif v.startswith("'"):
        # NOTE(review): stripping the quotes before the escape rewrite
        # appears elided from this view.
    v = re.sub(r"\\\\|\\'|\"", lambda m: {
        # NOTE(review): the escape-translation mapping entries and the
        # closing '}[m.group(0)], v)' appear elided.
    # Quote/identifier tokenizer; fix_kv rewrites each token.
    res = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
        [a-zA-Z_][.a-zA-Z_0-9]*
    # NOTE(review): the pattern close "''', fix_kv, code)" appears elided.
    # Drop trailing commas before ] or }.
    res = re.sub(r',(\s*[\]}])', lambda m: m.group(1), res)
    # NOTE(review): the final 'return res' appears elided from this view.
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    # Fix: the visible code had a dangling 'return quality_ids.index(qid)'
    # with no inner function, and .index would raise for unknown ids.
    def q(qid):
        try:
            return quality_ids.index(qid)
        except ValueError:
            return -1
    return q
# Default output filename template: "<title>-<id>.<ext>".
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    # Fix: restore the None guard, the ELLIPSES constant the visible
    # return referenced, the length check, and the passthrough return.
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) > length:
        return s[:length - len(ELLIPSES)] + ELLIPSES
    return s
def version_tuple(v):
    # Split a version string on dots and dashes, e.g. '1.2-3' -> (1, 2, 3).
    parts = re.split(r'[-.]', v)
    return tuple(map(int, parts))
def is_outdated_version(version, limit, assume_new=True):
    """Compare two dotted version strings; unparsable or missing input
    falls back to `not assume_new`."""
    # Fix: the visible code had three bare returns with no guards — restore
    # the empty-input guard and the try/except around the comparison.
    if not version:
        return not assume_new
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return not assume_new
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # -U works when running from the release zip or a frozen executable.
    loader = globals().get('__loader__')
    return isinstance(loader, zipimporter) or hasattr(sys, 'frozen')
def args_to_str(args):
    # Get a short string representation for a subprocess command
    return ' '.join(map(shlex_quote, args))
def error_to_compat_str(err):
    """Render an exception as a text string, decoding on Python 2."""
    # Fix: the visible code never assigned err_str nor returned it.
    err_str = str(err)
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    if sys.version_info[0] < 3:
        err_str = err_str.decode(preferredencoding())
    return err_str
def mimetype2ext(mt):
    """Map a MIME type to a file extension, defaulting to the subtype."""
    _, _, res = mt.rpartition('/')
    # Fix: the visible dict entry was dangling with no 'return {' / '}.get'.
    # NOTE(review): further subtype mappings appear elided from this view;
    # only the visible entry is reproduced here.
    return {
        'x-mp4-fragmented': 'mp4',
    }.get(res, res)
def urlhandle_detect_ext(url_handle):
    # Guess a file extension for a urllib response: prefer the filename
    # from Content-Disposition, then fall back to the Content-Type.
    # NOTE(review): the 'try:' line preceding this appears elided.
    getheader = lambda h: url_handle.headers[h]
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader

    cd = getheader('Content-Disposition')
    # NOTE(review): the 'if cd:' guard appears elided from this view.
    m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
    # NOTE(review): the 'if m:' guard appears elided.
    e = determine_ext(m.group('filename'), default_ext=None)
    # NOTE(review): 'if e: return e' appears elided.

    return mimetype2ext(getheader('Content-Type'))
def encode_data_uri(data, mime_type):
    # Build an RFC 2397 data URI: base64 payload prefixed with its MIME type.
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    if age_limit is None:  # No limit set
        # Fix: the visible code fell through here with no return.
        return False
    if content_limit is None:
        return False  # Content available for everyone
    return age_limit < content_limit
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    # Fix: restore the list assignment, loop break and the decode fallback
    # ('else' on the for) that were dangling in the visible code.
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            s = first_bytes[len(bom):].decode(enc, 'replace')
            break
    else:
        # No BOM: assume UTF-8 with lossy replacement.
        s = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', s)
def determine_protocol(info_dict):
    # Derive the download protocol: an explicit 'protocol' key wins, then
    # URL scheme heuristics, finally the plain parsed scheme.
    protocol = info_dict.get('protocol')
    if protocol is not None:
        # NOTE(review): 'return protocol' appears elided from this view.

    url = info_dict['url']
    if url.startswith('rtmp'):
        # NOTE(review): "return 'rtmp'" appears elided.
    elif url.startswith('mms'):
        # NOTE(review): "return 'mms'" appears elided.
    elif url.startswith('rtsp'):
        # NOTE(review): "return 'rtsp'" appears elided.

    ext = determine_ext(url)
    # NOTE(review): the m3u8/f4m extension handling appears elided.

    return compat_urllib_parse_urlparse(url).scheme
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    rows = [header_row] + data
    # Widest cell per column decides that column's padding.
    widths = [max(len(compat_str(cell)) for cell in column) for column in zip(*rows)]
    head_formats = ['%-' + compat_str(width + 1) + 's' for width in widths[:-1]]
    # The last column is left unpadded.
    line_format = ' '.join(head_formats) + '%s'
    return '\n'.join(line_format % tuple(row) for row in rows)
def _match_one(filter_part, dct):
    # Evaluate one '&'-separated clause of a --match-filter expression
    # against the given info dict.
    COMPARISON_OPERATORS = {
        # NOTE(review): the operator table entries (e.g. '<', '<=', '>',
        # '>=', '=', '!=') appear elided from this view.
    operator_rex = re.compile(r'''(?x)\s*
        # NOTE(review): the '(?P<key>...)' group line appears elided.
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
        (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    # NOTE(review): the 'if m:' guard appears elided from this view.
    op = COMPARISON_OPERATORS[m.group('op')]
    if m.group('strval') is not None:
        if m.group('op') not in ('=', '!='):
            # NOTE(review): the 'raise ValueError(' head appears elided.
            'Operator %s does not support string values!' % m.group('op'))
        comparison_value = m.group('strval')
    # NOTE(review): the 'else:' and 'try:' around int() appear elided —
    # numeric values fall back to file-size parsing ('50k', '10MiB').
        comparison_value = int(m.group('intval'))
        comparison_value = parse_filesize(m.group('intval'))
        if comparison_value is None:
            comparison_value = parse_filesize(m.group('intval') + 'B')
        if comparison_value is None:
            # NOTE(review): the 'raise ValueError(' head appears elided.
            'Invalid integer value %r in filter part %r' % (
                m.group('intval'), filter_part))
    actual_value = dct.get(m.group('key'))
    if actual_value is None:
        # A missing key passes only when the '?' suffix was given.
        return m.group('none_inclusive')
    return op(actual_value, comparison_value)

    # Unary existence tests: 'key' (is set) and '!key' (is unset).
    # NOTE(review): the 'UNARY_OPERATORS = {' opener appears elided.
        '': lambda v: v is not None,
        '!': lambda v: v is None,
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    # NOTE(review): the 'if m:' guard appears elided from this view.
    op = UNARY_OPERATORS[m.group('op')]
    actual_value = dct.get(m.group('key'))
    return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
    # Fix: the visible code was missing the 'return all(' head, so the
    # generator expression was a no-op.
    return all(
        _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
def match_filter_func(filter_str):
    """Build a --match-filter callback: returns None when the info dict
    passes the filter, or a human-readable skip message otherwise."""
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            # Fix: the pass branch must explicitly return None (no skip).
            return None
        else:
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    # Fix: the closure itself was never returned in the visible code.
    return _match_func
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP/TTML time expression ('12.5s' or 'HH:MM:SS[.f]') into
    seconds; empty/unrecognized input yields None."""
    # Fix: restore the empty-input guard and the 'if mobj:' guards the
    # visible returns depended on.
    if not time_expr:
        return

    mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if mobj:
        return float(mobj.group('time_offset'))

    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if mobj:
        return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
def srt_subtitles_timecode(seconds):
    # Format seconds as an SRT timestamp: HH:MM:SS,mmm.
    hours, remainder = divmod(seconds, 3600)
    minutes, secs = divmod(remainder, 60)
    millis = (seconds % 1) * 1000
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
def dfxp2srt(dfxp_data):
    # Convert DFXP/TTML subtitle XML into SRT text.
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
    # NOTE(review): the closing '})' appears elided from this view.

    def parse_node(node):
        # Flatten a subtitle node: keep text, map <br> to newlines,
        # recurse into <span>, and dump any other child as raw XML.
        str_or_empty = functools.partial(str_or_none, default='')

        out = str_or_empty(node.text)

        # NOTE(review): the 'for child in node:' loop header appears elided.
        if child.tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
            out += '\n' + str_or_empty(child.tail)
        elif child.tag in (_x('ttml:span'), _x('ttaf1:span'), 'span'):
            out += str_or_empty(parse_node(child))
        # NOTE(review): the 'else:' branch header appears elided.
            out += str_or_empty(xml.etree.ElementTree.tostring(child))
        # NOTE(review): the final 'return out' appears elided.

    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
    # NOTE(review): the 'out = []' accumulator init appears elided.
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p')
    # NOTE(review): the 'if not paras:' guard appears elided.
        raise ValueError('Invalid dfxp/TTML subtitle')

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            # NOTE(review): 'continue' appears elided from this view.
        # NOTE(review): the 'if end_time is None:' / 'if dur is None:'
        # guards around this fallback appear elided.
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            # NOTE(review): the 'index,' argument line appears elided.
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            # NOTE(review): 'parse_node(para)))' and the final join/return
            # appear elided from this view.
def cli_option(params, command_option, param):
    # Emit [option, value] when the parameter is set, nothing otherwise.
    value = params.get(param)
    if value is None:
        return []
    return [command_option, value]
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Render a boolean param as either ['--opt', 'true'] or, with a
    separator, the single token '--opt=true'."""
    param = params.get(param)
    assert isinstance(param, bool)
    # Fix: the visible code had both returns unconditionally — the first
    # one must apply only when a separator was requested.
    if separator:
        return [command_option + separator + (true_value if param else false_value)]
    return [command_option, true_value if param else false_value]
def cli_valueless_option(params, command_option, param, expected_value=True):
    # Emit the bare flag only when the parameter equals the expected value.
    value = params.get(param)
    if value == expected_value:
        return [command_option]
    return []
def cli_configuration_args(params, param, default=[]):
    """Fetch a list-valued extra-args parameter, falling back to `default`
    when unset or empty-string."""
    ex_args = params.get(param)
    # Fix: the visible code neither handled the unset case nor returned
    # the validated list.
    if ex_args in (None, ''):
        return default
    assert isinstance(ex_args, list)
    return ex_args
class ISO639Utils(object):
    """Conversions between ISO 639-1 (2-letter) and ISO 639-2/T (3-letter)
    language codes."""
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    # NOTE(review): the '_lang_map = {...}' table and the @classmethod
    # decorators on the two methods below appear elided from this view.

    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        # Only the first two characters are significant for the lookup.
        return cls._lang_map.get(code[:2])

    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        # Reverse lookup by scanning the map; returns None when not found.
        for short_name, long_name in cls._lang_map.items():
            if long_name == code:
                return short_name
class ISO3166Utils(object):
    """Map ISO 3166-1 alpha-2 country codes to full country names."""
    # From http://data.okfn.org/data/core/country-list
    # NOTE(review): the '_country_map = {' opener, many entries, and the
    # closing '}' appear elided from this view.
        'AF': 'Afghanistan',
        # NOTE(review): 'Ã…land' looks like UTF-8 mojibake for 'Åland'
        # (other entries like 'Côte d'Ivoire' decode correctly) — needs a
        # code-level fix, not done here since it is a runtime string.
        'AX': 'Ã…land Islands',
        'AS': 'American Samoa',
        'AG': 'Antigua and Barbuda',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BV': 'Bouvet Island',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BF': 'Burkina Faso',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CI': 'Côte d\'Ivoire',
        'CZ': 'Czech Republic',
        'DO': 'Dominican Republic',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GW': 'Guinea-Bissau',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'IR': 'Iran, Islamic Republic of',
        'IM': 'Isle of Man',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'LA': 'Lao People\'s Democratic Republic',
        'LI': 'Liechtenstein',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MH': 'Marshall Islands',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'PS': 'Palestine, State of',
        'PG': 'Papua New Guinea',
        'PH': 'Philippines',
        'PR': 'Puerto Rico',
        'RU': 'Russian Federation',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SL': 'Sierra Leone',
        'SX': 'Sint Maarten (Dutch part)',
        'SB': 'Solomon Islands',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'SJ': 'Svalbard and Jan Mayen',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TZ': 'Tanzania, United Republic of',
        'TL': 'Timor-Leste',
        'TT': 'Trinidad and Tobago',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',

    # NOTE(review): the @classmethod decorator appears elided from this view.
    def short2full(cls, code):
        """Convert an ISO 3166-2 country code to the corresponding full name"""
        # Case-insensitive lookup; None when the code is unknown.
        return cls._country_map.get(code.upper())
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    # ProxyHandler variant that lets individual requests override the
    # proxy via a 'Ytdl-request-proxy' header.

    def __init__(self, proxies=None):
        # Set default handlers
        for type in ('http', 'https'):
            # Default args bind the loop variables so every scheme gets
            # its own handler; '__noproxy__' marks "no proxy configured".
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            # NOTE(review): the 'proxy = req_proxy' override assignment
            # appears elided from this view — without it the header value
            # is dropped after deletion.
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)