2 # -*- coding: utf-8 -*-
4 from __future__ import unicode_literals
33 import xml.etree.ElementTree
39 compat_etree_fromstring,
44 compat_socket_create_connection,
48 compat_urllib_parse_urlparse,
49 compat_urllib_request,
55 # This is not clearly defined otherwise
56 compiled_regex_type = type(re.compile(''))
59 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)',
60 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
61 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
62 'Accept-Encoding': 'gzip, deflate',
63 'Accept-Language': 'en-us,en;q=0.5',
# Locale-independent English month names (index 0 == January); used when
# parsing dates in upload-date strings regardless of the system locale.
ENGLISH_MONTH_NAMES = [
    'January',
    'February',
    'March',
    'April',
    'May',
    'June',
    'July',
    'August',
    'September',
    'October',
    'November',
    'December',
]
74 def preferredencoding():
75 """Get preferred encoding.
77 Returns the best encoding scheme for the system, based on
78 locale.getpreferredencoding() and some further tweaks.
81 pref = locale.getpreferredencoding()
89 def write_json_file(obj, fn):
90 """ Encode obj as JSON and write it to fn, atomically if possible """
92 fn = encodeFilename(fn)
93 if sys.version_info < (3, 0) and sys.platform != 'win32':
94 encoding = get_filesystem_encoding()
95 # os.path.basename returns a bytes object, but NamedTemporaryFile
96 # will fail if the filename contains non ascii characters unless we
97 # use a unicode object
98 path_basename = lambda f: os.path.basename(fn).decode(encoding)
99 # the same for os.path.dirname
100 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
102 path_basename = os.path.basename
103 path_dirname = os.path.dirname
107 'prefix': path_basename(fn) + '.',
108 'dir': path_dirname(fn),
112 # In Python 2.x, json.dump expects a bytestream.
113 # In Python 3.x, it writes to a character stream
114 if sys.version_info < (3, 0):
122 tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
127 if sys.platform == 'win32':
128 # Need to remove existing file on Windows, else os.rename raises
129 # WindowsError or FileExistsError.
134 os.rename(tf.name, fn)
143 if sys.version_info >= (2, 7):
144 def find_xpath_attr(node, xpath, key, val=None):
145 """ Find the xpath xpath[@key=val] """
146 assert re.match(r'^[a-zA-Z_-]+$', key)
148 assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
149 expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
150 return node.find(expr)
152 def find_xpath_attr(node, xpath, key, val=None):
153 # Here comes the crazy part: In 2.6, if the xpath is a unicode,
154 # .//node does not match if a node is a direct child of . !
155 if isinstance(xpath, compat_str):
156 xpath = xpath.encode('ascii')
158 for f in node.findall(xpath):
159 if key not in f.attrib:
161 if val is None or f.attrib.get(key) == val:
165 # On python2.6 the xml.etree.ElementTree.Element methods don't support
166 # the namespace parameter
169 def xpath_with_ns(path, ns_map):
170 components = [c.split(':') for c in path.split('/')]
174 replaced.append(c[0])
177 replaced.append('{%s}%s' % (ns_map[ns], tag))
178 return '/'.join(replaced)
181 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
182 def _find_xpath(xpath):
183 if sys.version_info < (2, 7): # Crazy 2.6
184 xpath = xpath.encode('ascii')
185 return node.find(xpath)
187 if isinstance(xpath, (str, compat_str)):
188 n = _find_xpath(xpath)
196 if default is not NO_DEFAULT:
199 name = xpath if name is None else name
200 raise ExtractorError('Could not find XML element %s' % name)
206 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
207 n = xpath_element(node, xpath, name, fatal=fatal, default=default)
208 if n is None or n == default:
211 if default is not NO_DEFAULT:
214 name = xpath if name is None else name
215 raise ExtractorError('Could not find XML element\'s text %s' % name)
221 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
222 n = find_xpath_attr(node, xpath, key)
224 if default is not NO_DEFAULT:
227 name = '%s[@%s]' % (xpath, key) if name is None else name
228 raise ExtractorError('Could not find XML attribute %s' % name)
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # Thin wrapper: an id lookup is just an attribute lookup with the
    # attribute name fixed to "id".
    attribute = "id"
    return get_element_by_attribute(attribute, id, html)
239 def get_element_by_attribute(attribute, value, html):
240 """Return the content of the tag with the specified attribute in the passed HTML document"""
242 m = re.search(r'''(?xs)
244 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
246 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
250 ''' % (re.escape(attribute), re.escape(value)), html)
254 res = m.group('content')
256 if res.startswith('"') or res.startswith("'"):
259 return unescapeHTML(res)
262 def extract_attributes(attributes_str, attributes_regex=r'(?s)\s*([^\s=]+)\s*=\s*["\']([^"\']+)["\']'):
263 attributes = re.findall(attributes_regex, attributes_str)
266 for (attribute_name, attribute_value) in attributes:
267 attributes_dict[attribute_name] = attribute_value
268 return attributes_dict
271 def clean_html(html):
272 """Clean an HTML snippet into a readable string"""
274 if html is None: # Convenience for sanitizing descriptions etc.
278 html = html.replace('\n', ' ')
279 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
280 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
282 html = re.sub('<.*?>', '', html)
283 # Replace html entities
284 html = unescapeHTML(html)
288 def sanitize_open(filename, open_mode):
289 """Try to open the given filename, and slightly tweak it if this fails.
291 Attempts to open the given filename. If this fails, it tries to change
292 the filename slightly, step by step, until it's either able to open it
293 or it fails and raises a final exception, like the standard open()
296 It returns the tuple (stream, definitive_file_name).
300 if sys.platform == 'win32':
302 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
303 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
304 stream = open(encodeFilename(filename), open_mode)
305 return (stream, filename)
306 except (IOError, OSError) as err:
307 if err.errno in (errno.EACCES,):
310 # In case of error, try to remove win32 forbidden chars
311 alt_filename = sanitize_path(filename)
312 if alt_filename == filename:
315 # An exception here should be caught in the caller
316 stream = open(encodeFilename(alt_filename), open_mode)
317 return (stream, alt_filename)
320 def timeconvert(timestr):
321 """Convert RFC 2822 defined time string into system timestamp"""
323 timetuple = email.utils.parsedate_tz(timestr)
324 if timetuple is not None:
325 timestamp = email.utils.mktime_tz(timetuple)
329 def sanitize_filename(s, restricted=False, is_id=False):
330 """Sanitizes a string so it could be used as part of a filename.
331 If restricted is set, use a stricter subset of allowed characters.
332 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
334 def replace_insane(char):
335 if char == '?' or ord(char) < 32 or ord(char) == 127:
338 return '' if restricted else '\''
340 return '_-' if restricted else ' -'
341 elif char in '\\/|*<>':
343 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
345 if restricted and ord(char) > 127:
350 s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
351 result = ''.join(map(replace_insane, s))
353 while '__' in result:
354 result = result.replace('__', '_')
355 result = result.strip('_')
356 # Common case of "Foreign band name - English song title"
357 if restricted and result.startswith('-_'):
359 if result.startswith('-'):
360 result = '_' + result[len('-'):]
361 result = result.lstrip('.')
367 def sanitize_path(s):
368 """Sanitizes and normalizes path on Windows"""
369 if sys.platform != 'win32':
371 drive_or_unc, _ = os.path.splitdrive(s)
372 if sys.version_info < (2, 7) and not drive_or_unc:
373 drive_or_unc, _ = os.path.splitunc(s)
374 norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
378 path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part)
379 for path_part in norm_path]
381 sanitized_path.insert(0, drive_or_unc + os.path.sep)
382 return os.path.join(*sanitized_path)
385 def orderedSet(iterable):
386 """ Remove all duplicates from the input iterable """
394 def _htmlentity_transform(entity):
395 """Transforms an HTML entity to a character."""
396 # Known non-numeric HTML entity
397 if entity in compat_html_entities.name2codepoint:
398 return compat_chr(compat_html_entities.name2codepoint[entity])
400 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
402 numstr = mobj.group(1)
403 if numstr.startswith('x'):
405 numstr = '0%s' % numstr
408 return compat_chr(int(numstr, base))
410 # Unknown entity in name, return its literal representation
411 return ('&%s;' % entity)
417 assert type(s) == compat_str
420 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
423 def get_subprocess_encoding():
424 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
425 # For subprocess calls, encode with locale encoding
426 # Refer to http://stackoverflow.com/a/9951851/35070
427 encoding = preferredencoding()
429 encoding = sys.getfilesystemencoding()
435 def encodeFilename(s, for_subprocess=False):
437 @param s The name of the file
440 assert type(s) == compat_str
442 # Python 3 has a Unicode API
443 if sys.version_info >= (3, 0):
446 # Pass '' directly to use Unicode APIs on Windows 2000 and up
447 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
448 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
449 if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
452 return s.encode(get_subprocess_encoding(), 'ignore')
455 def decodeFilename(b, for_subprocess=False):
457 if sys.version_info >= (3, 0):
460 if not isinstance(b, bytes):
463 return b.decode(get_subprocess_encoding(), 'ignore')
def encodeArgument(s):
    """Encode a subprocess argument the same way file names are encoded.

    Byte-string input (legacy callers) is first decoded as ASCII so that a
    single text value is handed to encodeFilename().
    """
    if isinstance(s, compat_str):
        text = s
    else:
        # Legacy code that uses byte strings
        # TODO: turn this branch into an assertion once all post processors
        # have been fixed to pass text strings
        text = s.decode('ascii')
    return encodeFilename(text, True)
def decodeArgument(b):
    """Decode a subprocess argument; the inverse companion of encodeArgument()."""
    return decodeFilename(b, for_subprocess=True)
479 def decodeOption(optval):
482 if isinstance(optval, bytes):
483 optval = optval.decode(preferredencoding())
485 assert isinstance(optval, compat_str)
489 def formatSeconds(secs):
491 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
493 return '%d:%02d' % (secs // 60, secs % 60)
498 def make_HTTPS_handler(params, **kwargs):
499 opts_no_check_certificate = params.get('nocheckcertificate', False)
500 if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9
501 context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
502 if opts_no_check_certificate:
503 context.check_hostname = False
504 context.verify_mode = ssl.CERT_NONE
506 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
509 # (create_default_context present but HTTPSHandler has no context=)
512 if sys.version_info < (3, 2):
513 return YoutubeDLHTTPSHandler(params, **kwargs)
515 context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
516 context.verify_mode = (ssl.CERT_NONE
517 if opts_no_check_certificate
518 else ssl.CERT_REQUIRED)
519 context.set_default_verify_paths()
520 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
523 def bug_reports_message():
524 if ytdl_is_updateable():
525 update_cmd = 'type youtube-dl -U to update'
527 update_cmd = 'see https://yt-dl.org/update on how to update'
528 msg = '; please report this issue on https://yt-dl.org/bug .'
529 msg += ' Make sure you are using the latest version; %s.' % update_cmd
530 msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
534 class ExtractorError(Exception):
535 """Error during info extraction."""
537 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
538 """ tb, if given, is the original traceback (so that it can be printed out).
539 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
542 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
544 if video_id is not None:
545 msg = video_id + ': ' + msg
547 msg += ' (caused by %r)' % cause
549 msg += bug_reports_message()
550 super(ExtractorError, self).__init__(msg)
553 self.exc_info = sys.exc_info() # preserve original exception
555 self.video_id = video_id
557 def format_traceback(self):
558 if self.traceback is None:
560 return ''.join(traceback.format_tb(self.traceback))
class UnsupportedError(ExtractorError):
    """Raised for URLs that no extractor knows how to handle."""

    def __init__(self, url):
        # Marked expected=True: an unsupported site is a normal condition,
        # not a bug in youtube-dl.
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
class RegexNotFoundError(ExtractorError):
    """Raised when a required regular expression did not match anything."""
class DownloadError(Exception):
    """Download Error exception.

    Raised by FileDownloader objects that are not configured to continue on
    errors; carries the appropriate error message and, optionally, the
    ``sys.exc_info()`` triple of the exception that caused the trouble.
    """

    def __init__(self, msg, exc_info=None):
        """exc_info, if given, is the original exception as returned by sys.exc_info()."""
        self.exc_info = exc_info
        super(DownloadError, self).__init__(msg)
class SameFileError(Exception):
    """Raised by FileDownloader objects when several downloads would collide
    on the very same output file on disk."""
598 class PostProcessingError(Exception):
599 """Post Processing exception.
601 This exception may be raised by PostProcessor's .run() method to
602 indicate an error in the postprocessing task.
605 def __init__(self, msg):
class MaxDownloadsReached(Exception):
    """Signals that the --max-downloads limit has been reached."""
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    Raised when a video is requested in a format that is not available for
    that video."""
class ContentTooShortError(Exception):
    """Content Too Short exception.

    Raised by FileDownloader objects when a downloaded file is smaller than
    the size announced by the server, which usually indicates the connection
    was interrupted.

    Attributes:
        downloaded: number of bytes actually received
        expected:   number of bytes announced by the server
    """

    def __init__(self, downloaded, expected):
        # Pass a formatted message to the base class so str(err) is
        # informative instead of an empty string in logs and tracebacks.
        super(ContentTooShortError, self).__init__(
            'Downloaded %s bytes, expected %s bytes' % (downloaded, expected))
        # Both counts are in bytes.
        self.downloaded = downloaded
        self.expected = expected
637 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
638 # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
639 # expected HTTP responses to meet HTTP/1.0 or later (see also
640 # https://github.com/rg3/youtube-dl/issues/6727)
641 if sys.version_info < (3, 0):
642 kwargs[b'strict'] = True
643 hc = http_class(*args, **kwargs)
644 source_address = ydl_handler._params.get('source_address')
645 if source_address is not None:
646 sa = (source_address, 0)
647 if hasattr(hc, 'source_address'): # Python 2.7+
648 hc.source_address = sa
650 def _hc_connect(self, *args, **kwargs):
651 sock = compat_socket_create_connection(
652 (self.host, self.port), self.timeout, sa)
654 self.sock = ssl.wrap_socket(
655 sock, self.key_file, self.cert_file,
656 ssl_version=ssl.PROTOCOL_TLSv1)
659 hc.connect = functools.partial(_hc_connect, hc)
664 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
665 """Handler for HTTP requests and responses.
667 This class, when installed with an OpenerDirector, automatically adds
668 the standard headers to every HTTP request and handles gzipped and
669 deflated responses from web servers. If compression is to be avoided in
670 a particular request, the original request in the program code only has
671 to include the HTTP header "Youtubedl-No-Compression", which will be
672 removed before making the real request.
674 Part of this code was copied from:
676 http://techknack.net/python-urllib2-handlers/
678 Andrew Rowls, the author of that code, agreed to release it to the
682 def __init__(self, params, *args, **kwargs):
683 compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
684 self._params = params
686 def http_open(self, req):
687 return self.do_open(functools.partial(
688 _create_http_connection, self, compat_http_client.HTTPConnection, False),
694 return zlib.decompress(data, -zlib.MAX_WBITS)
696 return zlib.decompress(data)
699 def addinfourl_wrapper(stream, headers, url, code):
700 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
701 return compat_urllib_request.addinfourl(stream, headers, url, code)
702 ret = compat_urllib_request.addinfourl(stream, headers, url)
706 def http_request(self, req):
707 # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
708 # always respected by websites, some tend to give out URLs with non percent-encoded
709 # non-ASCII characters (see telemb.py, ard.py [#3412])
710 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
711 # To work around aforementioned issue we will replace request's original URL with
712 # percent-encoded one
713 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
714 # the code of this workaround has been moved here from YoutubeDL.urlopen()
715 url = req.get_full_url()
716 url_escaped = escape_url(url)
718 # Substitute URL if any change after escaping
719 if url != url_escaped:
720 req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
722 url_escaped, data=req.data, headers=req.headers,
723 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
724 new_req.timeout = req.timeout
727 for h, v in std_headers.items():
728 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
729 # The dict keys are capitalized because of this bug by urllib
730 if h.capitalize() not in req.headers:
732 if 'Youtubedl-no-compression' in req.headers:
733 if 'Accept-encoding' in req.headers:
734 del req.headers['Accept-encoding']
735 del req.headers['Youtubedl-no-compression']
737 if sys.version_info < (2, 7) and '#' in req.get_full_url():
738 # Python 2.6 is brain-dead when it comes to fragments
739 req._Request__original = req._Request__original.partition('#')[0]
740 req._Request__r_type = req._Request__r_type.partition('#')[0]
744 def http_response(self, req, resp):
747 if resp.headers.get('Content-encoding', '') == 'gzip':
748 content = resp.read()
749 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
751 uncompressed = io.BytesIO(gz.read())
752 except IOError as original_ioerror:
753 # There may be junk add the end of the file
754 # See http://stackoverflow.com/q/4928560/35070 for details
755 for i in range(1, 1024):
757 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
758 uncompressed = io.BytesIO(gz.read())
763 raise original_ioerror
764 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
765 resp.msg = old_resp.msg
767 if resp.headers.get('Content-encoding', '') == 'deflate':
768 gz = io.BytesIO(self.deflate(resp.read()))
769 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
770 resp.msg = old_resp.msg
771 # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
772 # https://github.com/rg3/youtube-dl/issues/6457).
773 if 300 <= resp.code < 400:
774 location = resp.headers.get('Location')
776 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
777 if sys.version_info >= (3, 0):
778 location = location.encode('iso-8859-1').decode('utf-8')
779 location_escaped = escape_url(location)
780 if location != location_escaped:
781 del resp.headers['Location']
782 resp.headers['Location'] = location_escaped
785 https_request = http_request
786 https_response = http_response
789 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
790 def __init__(self, params, https_conn_class=None, *args, **kwargs):
791 compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
792 self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
793 self._params = params
795 def https_open(self, req):
797 if hasattr(self, '_context'): # python > 2.6
798 kwargs['context'] = self._context
799 if hasattr(self, '_check_hostname'): # python 3.x
800 kwargs['check_hostname'] = self._check_hostname
801 return self.do_open(functools.partial(
802 _create_http_connection, self, self._https_conn_class, True),
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor that simply delegates to the stock implementation.

    Exists as a hook point for working around Python 2's handling of
    non-ASCII Set-Cookie headers.
    """

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on the next HTTP request in a row if there are
        # non-ASCII characters in the Set-Cookie header of the last response
        # (https://github.com/rg3/youtube-dl/issues/6769).  A percent-encoding
        # workaround used to escape the header before HTTPCookieProcessor saw
        # it; it is kept here, disabled, for reference:
        #
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
830 def parse_iso8601(date_str, delimiter='T', timezone=None):
831 """ Return a UNIX timestamp from the given date """
836 date_str = re.sub(r'\.[0-9]+', '', date_str)
840 r'(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
843 timezone = datetime.timedelta()
845 date_str = date_str[:-len(m.group(0))]
846 if not m.group('sign'):
847 timezone = datetime.timedelta()
849 sign = 1 if m.group('sign') == '+' else -1
850 timezone = datetime.timedelta(
851 hours=sign * int(m.group('hours')),
852 minutes=sign * int(m.group('minutes')))
854 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
855 dt = datetime.datetime.strptime(date_str, date_format) - timezone
856 return calendar.timegm(dt.timetuple())
861 def unified_strdate(date_str, day_first=True):
862 """Return a string with the date in the format YYYYMMDD"""
868 date_str = date_str.replace(',', ' ')
869 # %z (UTC offset) is only supported in python>=3.2
870 if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
871 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
872 # Remove AM/PM + timezone
873 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
875 format_expressions = [
880 '%b %dst %Y %I:%M%p',
881 '%b %dnd %Y %I:%M%p',
882 '%b %dth %Y %I:%M%p',
888 '%Y-%m-%d %H:%M:%S.%f',
891 '%Y-%m-%dT%H:%M:%SZ',
892 '%Y-%m-%dT%H:%M:%S.%fZ',
893 '%Y-%m-%dT%H:%M:%S.%f0Z',
895 '%Y-%m-%dT%H:%M:%S.%f',
899 format_expressions.extend([
907 format_expressions.extend([
914 for expression in format_expressions:
916 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
919 if upload_date is None:
920 timetuple = email.utils.parsedate_tz(date_str)
922 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
923 if upload_date is not None:
924 return compat_str(upload_date)
927 def determine_ext(url, default_ext='unknown_video'):
930 guess = url.partition('?')[0].rpartition('.')[2]
931 if re.match(r'^[A-Za-z0-9]+$', guess):
def subtitles_filename(filename, sub_lang, sub_format):
    """Derive a subtitle file name from a media file name.

    The media extension is replaced by '<language>.<format>', e.g.
    'clip.mp4' with ('en', 'vtt') becomes 'clip.en.vtt'.
    """
    base = filename.rsplit('.', 1)[0]
    return base + '.' + sub_lang + '.' + sub_format
941 def date_from_str(date_str):
943 Return a datetime object from a string in the format YYYYMMDD or
944 (now|today)[+-][0-9](day|week|month|year)(s)?"""
945 today = datetime.date.today()
946 if date_str in ('now', 'today'):
948 if date_str == 'yesterday':
949 return today - datetime.timedelta(days=1)
950 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
951 if match is not None:
952 sign = match.group('sign')
953 time = int(match.group('time'))
956 unit = match.group('unit')
957 # A bad aproximation?
965 delta = datetime.timedelta(**{unit: time})
967 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
970 def hyphenate_date(date_str):
972 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
973 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
974 if match is not None:
975 return '-'.join(match.groups())
980 class DateRange(object):
981 """Represents a time interval between two dates"""
983 def __init__(self, start=None, end=None):
984 """start and end must be strings in the format accepted by date"""
985 if start is not None:
986 self.start = date_from_str(start)
988 self.start = datetime.datetime.min.date()
990 self.end = date_from_str(end)
992 self.end = datetime.datetime.max.date()
993 if self.start > self.end:
994 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
998 """Returns a range that only contains the given day"""
1001 def __contains__(self, date):
1002 """Check if the date is in the range"""
1003 if not isinstance(date, datetime.date):
1004 date = date_from_str(date)
1005 return self.start <= date <= self.end
1008 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
1011 def platform_name():
1012 """ Returns the platform name as a compat_str """
1013 res = platform.platform()
1014 if isinstance(res, bytes):
1015 res = res.decode(preferredencoding())
1017 assert isinstance(res, compat_str)
1021 def _windows_write_string(s, out):
1022 """ Returns True if the string was written using special methods,
1023 False if it has yet to be written out."""
1024 # Adapted from http://stackoverflow.com/a/3259271/35070
1027 import ctypes.wintypes
1035 fileno = out.fileno()
1036 except AttributeError:
1037 # If the output stream doesn't have a fileno, it's virtual
1039 except io.UnsupportedOperation:
1040 # Some strange Windows pseudo files?
1042 if fileno not in WIN_OUTPUT_IDS:
1045 GetStdHandle = ctypes.WINFUNCTYPE(
1046 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
1047 (b"GetStdHandle", ctypes.windll.kernel32))
1048 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
1050 WriteConsoleW = ctypes.WINFUNCTYPE(
1051 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
1052 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
1053 ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
1054 written = ctypes.wintypes.DWORD(0)
1056 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
1057 FILE_TYPE_CHAR = 0x0002
1058 FILE_TYPE_REMOTE = 0x8000
1059 GetConsoleMode = ctypes.WINFUNCTYPE(
1060 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
1061 ctypes.POINTER(ctypes.wintypes.DWORD))(
1062 (b"GetConsoleMode", ctypes.windll.kernel32))
1063 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
1065 def not_a_console(handle):
1066 if handle == INVALID_HANDLE_VALUE or handle is None:
1068 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
1069 GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
1071 if not_a_console(h):
1074 def next_nonbmp_pos(s):
1076 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
1077 except StopIteration:
1081 count = min(next_nonbmp_pos(s), 1024)
1083 ret = WriteConsoleW(
1084 h, s, count if count else 2, ctypes.byref(written), None)
1086 raise OSError('Failed to write string')
1087 if not count: # We just wrote a non-BMP character
1088 assert written.value == 2
1091 assert written.value > 0
1092 s = s[written.value:]
1096 def write_string(s, out=None, encoding=None):
1099 assert type(s) == compat_str
1101 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
1102 if _windows_write_string(s, out):
1105 if ('b' in getattr(out, 'mode', '') or
1106 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
1107 byt = s.encode(encoding or preferredencoding(), 'ignore')
1109 elif hasattr(out, 'buffer'):
1110 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1111 byt = s.encode(enc, 'ignore')
1112 out.buffer.write(byt)
1118 def bytes_to_intlist(bs):
1121 if isinstance(bs[0], int): # Python 3
1124 return [ord(c) for c in bs]
1127 def intlist_to_bytes(xs):
1130 return struct_pack('%dB' % len(xs), *xs)
1133 # Cross-platform file locking
1134 if sys.platform == 'win32':
1135 import ctypes.wintypes
1138 class OVERLAPPED(ctypes.Structure):
1140 ('Internal', ctypes.wintypes.LPVOID),
1141 ('InternalHigh', ctypes.wintypes.LPVOID),
1142 ('Offset', ctypes.wintypes.DWORD),
1143 ('OffsetHigh', ctypes.wintypes.DWORD),
1144 ('hEvent', ctypes.wintypes.HANDLE),
1147 kernel32 = ctypes.windll.kernel32
1148 LockFileEx = kernel32.LockFileEx
1149 LockFileEx.argtypes = [
1150 ctypes.wintypes.HANDLE, # hFile
1151 ctypes.wintypes.DWORD, # dwFlags
1152 ctypes.wintypes.DWORD, # dwReserved
1153 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1154 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1155 ctypes.POINTER(OVERLAPPED) # Overlapped
1157 LockFileEx.restype = ctypes.wintypes.BOOL
1158 UnlockFileEx = kernel32.UnlockFileEx
1159 UnlockFileEx.argtypes = [
1160 ctypes.wintypes.HANDLE, # hFile
1161 ctypes.wintypes.DWORD, # dwReserved
1162 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1163 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1164 ctypes.POINTER(OVERLAPPED) # Overlapped
1166 UnlockFileEx.restype = ctypes.wintypes.BOOL
1167 whole_low = 0xffffffff
1168 whole_high = 0x7fffffff
1170 def _lock_file(f, exclusive):
1171 overlapped = OVERLAPPED()
1172 overlapped.Offset = 0
1173 overlapped.OffsetHigh = 0
1174 overlapped.hEvent = 0
1175 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1176 handle = msvcrt.get_osfhandle(f.fileno())
1177 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
1178 whole_low, whole_high, f._lock_file_overlapped_p):
1179 raise OSError('Locking file failed: %r' % ctypes.FormatError())
1181 def _unlock_file(f):
1182 assert f._lock_file_overlapped_p
1183 handle = msvcrt.get_osfhandle(f.fileno())
1184 if not UnlockFileEx(handle, 0,
1185 whole_low, whole_high, f._lock_file_overlapped_p):
1186 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1191 def _lock_file(f, exclusive):
1192 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1194 def _unlock_file(f):
1195 fcntl.flock(f, fcntl.LOCK_UN)
1198 class locked_file(object):
1199 def __init__(self, filename, mode, encoding=None):
1200 assert mode in ['r', 'a', 'w']
1201 self.f = io.open(filename, mode, encoding=encoding)
1204 def __enter__(self):
1205 exclusive = self.mode != 'r'
1207 _lock_file(self.f, exclusive)
1213 def __exit__(self, etype, value, traceback):
1215 _unlock_file(self.f)
1222 def write(self, *args):
1223 return self.f.write(*args)
1225 def read(self, *args):
1226 return self.f.read(*args)
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), substituting 'utf-8' when the
    interpreter reports no encoding at all."""
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        return 'utf-8'
    return encoding
1234 def shell_quote(args):
1236 encoding = get_filesystem_encoding()
1238 if isinstance(a, bytes):
1239 # We may get a filename encoded with 'encodeFilename'
1240 a = a.decode(encoding)
1241 quoted_args.append(pipes.quote(a))
1242 return ' '.join(quoted_args)
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """

    # The payload travels JSON-encoded inside the URL fragment, where
    # servers never see it; unsmuggle_url() reverses this.
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    sdata = compat_urllib_parse.urlencode(payload)
    return '#'.join((url, sdata))
# Inverse of smuggle_url(): recover the data hidden in the URL fragment.
# Returns (smug_url, default) unchanged when nothing was smuggled.
1253 def unsmuggle_url(smug_url, default=None):
1254 if '#__youtubedl_smuggle' not in smug_url:
1255 return smug_url, default
1256 url, _, sdata = smug_url.rpartition('#')
1257 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
1258 data = json.loads(jsond)
# Render a byte count with binary-prefixed units, e.g. 1536 -> '1.50KiB'.
# (Guards for None / zero input live in the elided lines.)
1262 def format_bytes(bytes):
1265 if type(bytes) is str:
1266 bytes = float(bytes)
# log base 1024 picks the largest unit with a mantissa >= 1.
1270 exponent = int(math.log(bytes, 1024.0))
1271 suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
1272 converted = float(bytes) / float(1024 ** exponent)
1273 return '%.2f%s' % (converted, suffix)
# Parse a human-readable filesize string such as '5.5 MiB' into bytes.
# NOTE(review): _UNIT_TABLE (unit name -> multiplier) is defined in the
# elided lines; both '.' and ',' are accepted as the decimal separator.
1276 def parse_filesize(s):
1280 # The lower-case forms are of course incorrect and inofficial,
1281 # but we support those too
1319 units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
1321 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
1325 num_str = m.group('num').replace(',', '.')
1326 mult = _UNIT_TABLE[m.group('unit')]
1327 return int(float(num_str) * mult)
# Look up a full English month name; 1-based (January == 1).  The
# not-found handling (try/except around .index) is in elided lines.
1330 def month_by_name(name):
1331 """ Return the number of a month by (locale-independently) English name """
1334 return ENGLISH_MONTH_NAMES.index(name) + 1
# Same as month_by_name() but keyed on the three-letter abbreviation
# ('Jan' == 1); error handling lives in elided lines.
1339 def month_by_abbreviation(abbrev):
1340 """ Return the number of a month by (locale-independently) English
1344 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
# Escape bare '&' characters in sloppy XML, leaving already-valid entities
# (&amp;, &lt;, numeric references, ...) untouched via negative lookahead.
1349 def fix_xml_ampersands(xml_str):
1350 """Replace all the '&' by '&amp;' in XML"""
1352 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
# Best-effort: rename the current process via glibc prctl(PR_SET_NAME).
# Silently does nothing when libc or prctl is unavailable.
1357 def setproctitle(title):
1358 assert isinstance(title, compat_str)
1360 libc = ctypes.cdll.LoadLibrary("libc.so.6")
1363 title_bytes = title.encode('utf-8')
1364 buf = ctypes.create_string_buffer(len(title_bytes))
1365 buf.value = title_bytes
# 15 is PR_SET_NAME from <linux/prctl.h>.
1367 libc.prctl(15, buf, 0, 0, 0)
1368 except AttributeError:
1369 return # Strange libc, just skip this
def remove_start(s, start):
    """Return `s` with the prefix `start` removed.

    If `s` does not begin with `start`, return it unchanged.  (The visible
    code fell through without a return, yielding None for non-matching
    input; the explicit fallback fixes that.)
    """
    if s.startswith(start):
        return s[len(start):]
    return s
def remove_end(s, end):
    """Return `s` with the suffix `end` removed.

    If `s` does not end with `end`, return it unchanged.  An empty `end`
    is a no-op: every string "endswith" '', and s[:-0] would wrongly
    collapse the whole string to '', so it is guarded explicitly.
    """
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
def url_basename(url):
    """Return the final path component of `url` (query/fragment excluded)."""
    parsed_path = compat_urlparse.urlparse(url).path
    segments = parsed_path.strip('/').split('/')
    return segments[-1]
class HEADRequest(compat_urllib_request.Request):
    """A Request subclass that always issues an HTTP HEAD request.

    urllib calls get_method() to pick the verb; the visible body was
    missing its return (implicitly None), so the verb is restored here.
    """
    def get_method(self):
        return 'HEAD'
# Lenient int coercion: optionally dereference get_attr on v first, then
# return int(v) * invscale // scale.  The None/''-handling and the
# try/except that yields `default` on failure are in elided lines.
1394 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
1397 v = getattr(v, get_attr, None)
1403 return int(v) * invscale // scale
def str_or_none(v, default=None):
    """Return `v` coerced to compat_str, or `default` when `v` is None."""
    if v is None:
        return default
    return compat_str(v)
# Strip thousands separators (',', '.') and '+' before converting; the
# None guard and the final int() conversion are in elided lines.
1412 def str_to_int(int_str):
1413 """ A more relaxed version of int_or_none """
1416 int_str = re.sub(r'[,\.\+]', '', int_str)
# Float analogue of int_or_none: float(v) * invscale / scale, with the
# None guard and the failure -> `default` handling in elided lines.
1420 def float_or_none(v, scale=1, invscale=1, default=None):
1424 return float(v) * invscale / scale
# Parse free-form duration strings ('1:23:45', '3 min 4 sec', '90s', ...)
# into a number of seconds.  The big alternation regex (partially elided)
# captures days/hours/mins/secs plus several "only X" and reversed forms.
1429 def parse_duration(s):
1430 if not isinstance(s, compat_basestring):
1438 (?P<only_mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*|
1439 (?P<only_hours>[0-9.]+)\s*(?:hours?)|
1441 \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?\.?|minutes?)\s*|
1444 (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
1445 (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
1447 (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
1449 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
# "only minutes"/"only hours" forms are scaled straight to seconds.
1454 if m.group('only_mins'):
1455 return float_or_none(m.group('only_mins'), invscale=60)
1456 if m.group('only_hours'):
1457 return float_or_none(m.group('only_hours'), invscale=60 * 60)
# Otherwise accumulate each captured component into `res` (seconds).
1459 res += int(m.group('secs'))
1460 if m.group('mins_reversed'):
1461 res += int(m.group('mins_reversed')) * 60
1463 res += int(m.group('mins')) * 60
1464 if m.group('hours'):
1465 res += int(m.group('hours')) * 60 * 60
1466 if m.group('hours_reversed'):
1467 res += int(m.group('hours_reversed')) * 60 * 60
1469 res += int(m.group('days')) * 24 * 60 * 60
1471 res += float(m.group('ms'))
# Insert `ext` before the real extension: 'a.mp4' + 'temp' -> 'a.temp.mp4'.
# When expected_real_ext is given and does not match, `ext` is appended to
# the whole filename instead.  (The opening 'return (' is elided.)
1475 def prepend_extension(filename, ext, expected_real_ext=None):
1476 name, real_ext = os.path.splitext(filename)
1478 '{0}.{1}{2}'.format(name, ext, real_ext)
1479 if not expected_real_ext or real_ext[1:] == expected_real_ext
1480 else '{0}.{1}'.format(filename, ext))
# Swap the filename's extension for `ext`; if expected_real_ext is given
# and the current extension differs, `ext` is appended instead of replacing.
1483 def replace_extension(filename, ext, expected_real_ext=None):
1484 name, real_ext = os.path.splitext(filename)
1485 return '{0}.{1}'.format(
1486 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
# Probe for a binary by simply running it and discarding the output; the
# try/except that returns False when the spawn fails is in elided lines.
1490 def check_executable(exe, args=[]):
1491 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1492 args can be a list of arguments for a short output (like -version) """
1494 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
# Run `exe` with version-query args and parse the version out of its
# combined stdout/stderr; the except-OSError -> False path is elided.
1500 def get_exe_version(exe, args=['--version'],
1501 version_re=None, unrecognized='present'):
1502 """ Returns the version of the specified executable,
1503 or False if the executable is not present """
1505 out, _ = subprocess.Popen(
1506 [encodeArgument(exe)] + args,
1507 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
1510 if isinstance(out, bytes): # Python 2.x
1511 out = out.decode('ascii', 'ignore')
1512 return detect_exe_version(out, version_re, unrecognized)
# Extract a version string from tool output with version_re (defaulting to
# a generic 'version X.Y' pattern); the match/else-unrecognized return
# lines are elided.
1515 def detect_exe_version(output, version_re=None, unrecognized='present'):
1516 assert isinstance(output, compat_str)
1517 if version_re is None:
1518 version_re = r'version\s+([-0-9._a-zA-Z]+)'
1519 m = re.search(version_re, output)
# Abstract base for lazily-paged result lists; subclasses implement
# getslice(start, end).  (The __len__ def line is elided here.)
1526 class PagedList(object):
1528 # This is only useful for tests
1529 return len(self.getslice())
# PagedList that fetches pages one at a time via pagefunc(pagenum),
# trimming the first and last fetched pages to the requested [start, end)
# window and stopping early when a short (non-full) page is seen.
1532 class OnDemandPagedList(PagedList):
1533 def __init__(self, pagefunc, pagesize):
1534 self._pagefunc = pagefunc
1535 self._pagesize = pagesize
1537 def getslice(self, start=0, end=None):
1539 for pagenum in itertools.count(start // self._pagesize):
1540 firstid = pagenum * self._pagesize
1541 nextfirstid = pagenum * self._pagesize + self._pagesize
1542 if start >= nextfirstid:
1545 page_results = list(self._pagefunc(pagenum))
# startv/endv: in-page slice bounds for the boundary pages.
1548 start % self._pagesize
1549 if firstid <= start < nextfirstid
1553 ((end - 1) % self._pagesize) + 1
1554 if (end is not None and firstid <= end <= nextfirstid)
1557 if startv != 0 or endv is not None:
1558 page_results = page_results[startv:endv]
1559 res.extend(page_results)
1561 # A little optimization - if current page is not "full", ie. does
1562 # not contain page_size videos then we can assume that this page
1563 # is the last one - there are no more ids on further pages -
1564 # i.e. no need to query again.
1565 if len(page_results) + startv < self._pagesize:
1568 # If we got the whole page, but the next page is not interesting,
1569 # break out early as well
1570 if end == nextfirstid:
# PagedList variant for sources where the total page count is known up
# front: iterates exactly the page range covering [start, end), trimming
# the first page (skip_elems) and capping the total with only_more.
1575 class InAdvancePagedList(PagedList):
1576 def __init__(self, pagefunc, pagecount, pagesize):
1577 self._pagefunc = pagefunc
1578 self._pagecount = pagecount
1579 self._pagesize = pagesize
1581 def getslice(self, start=0, end=None):
1583 start_page = start // self._pagesize
1585 self._pagecount if end is None else (end // self._pagesize + 1))
1586 skip_elems = start - start_page * self._pagesize
1587 only_more = None if end is None else end - start
1588 for pagenum in range(start_page, end_page):
1589 page = list(self._pagefunc(pagenum))
1591 page = page[skip_elems:]
1593 if only_more is not None:
1594 if len(page) < only_more:
1595 only_more -= len(page)
1597 page = page[:only_more]
# Decode \UXXXXXXXX (32-bit) escape sequences embedded in `s`; the
# surrounding re.sub(...) call lines are elided.
1604 def uppercase_escape(s):
1605 unicode_escape = codecs.getdecoder('unicode_escape')
1607 r'\\U[0-9a-fA-F]{8}',
1608 lambda m: unicode_escape(m.group(0))[0],
# Decode \uXXXX (16-bit) escape sequences embedded in `s`; the surrounding
# re.sub(...) call lines are elided.
1612 def lowercase_escape(s):
1613 unicode_escape = codecs.getdecoder('unicode_escape')
1615 r'\\u[0-9a-fA-F]{4}',
1616 lambda m: unicode_escape(m.group(0))[0],
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # quote() leaves every RFC 3986 reserved/unreserved character intact.
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    needs_bytes = sys.version_info < (3, 0) and isinstance(s, compat_str)
    if needs_bytes:
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, safe_chars)
# Apply escape_rfc3986() to each URL component individually, leaving the
# scheme/netloc intact.  (The trailing ').geturl()' line is elided.)
1627 def escape_url(url):
1628 """Escape URL as suggested by RFC 3986"""
1629 url_parsed = compat_urllib_parse_urlparse(url)
1630 return url_parsed._replace(
1631 path=escape_rfc3986(url_parsed.path),
1632 params=escape_rfc3986(url_parsed.params),
1633 query=escape_rfc3986(url_parsed.query),
1634 fragment=escape_rfc3986(url_parsed.fragment)
# Feature probe (wrapped in an elided try/except): on interpreters where
# struct rejects unicode format strings, install shims that encode the
# spec to bytes; otherwise alias the stdlib functions directly.
1638 struct.pack('!I', 0)
1640 # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
1641 def struct_pack(spec, *args):
1642 if isinstance(spec, compat_str):
1643 spec = spec.encode('ascii')
1644 return struct.pack(spec, *args)
1646 def struct_unpack(spec, *args):
1647 if isinstance(spec, compat_str):
1648 spec = spec.encode('ascii')
1649 return struct.unpack(spec, *args)
1651 struct_pack = struct.pack
1652 struct_unpack = struct.unpack
# Read a batch file of URLs: decode bytes, strip a UTF-8 BOM, and drop
# comment lines.  (The 'def fixup(url):' header line is elided.)
1655 def read_batch_urls(batch_fd):
1657 if not isinstance(url, compat_str):
1658 url = url.decode('utf-8', 'replace')
1659 BOM_UTF8 = '\xef\xbb\xbf'
1660 if url.startswith(BOM_UTF8):
1661 url = url[len(BOM_UTF8):]
# '#', ';' and ']' all start comment lines in batch files.
1663 if url.startswith(('#', ';', ']')):
1667 with contextlib.closing(batch_fd) as fd:
1668 return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode the given data and return it as ASCII bytes for POSTing."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
def encode_dict(d, encoding='utf-8'):
    """Return a copy of `d` with every key and value encoded to bytes."""
    encoded = {}
    for key, value in d.items():
        encoded[key.encode(encoding)] = value.encode(encoding)
    return encoded
# Turn an age string like '18+' into an int, falling back to the
# US_RATINGS table (defined elsewhere); the None guard is elided.
1688 def parse_age_limit(s):
1691 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
1692 return int(m.group('age')) if m else US_RATINGS.get(s, None)
# Strip a JSONP wrapper callback(...), leaving the bare JSON payload;
# the 'return re.sub(' opening line is elided.
1695 def strip_jsonp(code):
1697 r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
# Convert JavaScript object literal syntax to strict JSON: normalize
# single-quoted strings, quote bare identifiers, and drop trailing commas.
# The inner fix-up callback's def line and parts of its body are elided.
1700 def js_to_json(code):
1703 if v in ('true', 'false', 'null'):
1705 if v.startswith('"'):
1706 v = re.sub(r"\\'", "'", v[1:-1])
1707 elif v.startswith("'"):
1709 v = re.sub(r"\\\\|\\'|\"", lambda m: {
# Tokenizer: double-quoted string | single-quoted string | bare identifier.
1716 res = re.sub(r'''(?x)
1717 "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
1718 '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
1719 [a-zA-Z_][.a-zA-Z_0-9]*
# Remove trailing commas before a closing bracket/brace.
1721 res = re.sub(r',(\s*[\]}])', lambda m: m.group(1), res)
# Returns a comparator q(qid) mapping a quality id to its rank in
# quality_ids; the inner def and its try/except (-1 on miss) are elided.
1725 def qualities(quality_ids):
1726 """ Get a numeric quality value out of a list of possible values """
1729 return quality_ids.index(qid)
# Default output filename template: "<title>-<id>.<ext>".
1735 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
# Truncate `s` to at most `length` characters, ending in '...'; the None
# guard, ELLIPSES constant and length check are in elided lines.
1738 def limit_length(s, length):
1739 """ Add ellipses to overly long strings """
1744 return s[:length - len(ELLIPSES)] + ELLIPSES
def version_tuple(v):
    """Split a dotted/dashed version string into a tuple of ints, e.g.
    '2015.01.23' -> (2015, 1, 23)."""
    parts = re.split(r'[-.]', v)
    return tuple(map(int, parts))
# Compare version strings numerically; on any failure (empty limit,
# unparsable version -- the guarding lines are elided) fall back to
# `not assume_new`.
1752 def is_outdated_version(version, limit, assume_new=True):
1754 return not assume_new
1756 return version_tuple(version) < version_tuple(limit)
1758 return not assume_new
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter
    # Updatable when running from the bundled zip or as a frozen executable.
    if hasattr(sys, 'frozen'):
        return True
    return isinstance(globals().get('__loader__'), zipimporter)
def args_to_str(args):
    # Get a short string representation for a subprocess command
    quoted = [shlex_quote(a) for a in args]
    return ' '.join(quoted)
# Map a MIME type to a file extension using its subtype; the dict literal
# around the 'x-mp4-fragmented' entry and the .get(res, res) fallback are
# in elided lines.
1773 def mimetype2ext(mt):
1774 _, _, res = mt.rpartition('/')
1778 'x-mp4-fragmented': 'mp4',
# Guess the file extension for a urllib response: prefer the filename in
# Content-Disposition, fall back to the Content-Type MIME mapping.
# Several guard lines (try/if) are elided.
1783 def urlhandle_detect_ext(url_handle):
# Python 3 exposes headers as a mapping; Python 2 needs .info().getheader.
1786 getheader = lambda h: url_handle.headers[h]
1787 except AttributeError: # Python < 3
1788 getheader = url_handle.info().getheader
1790 cd = getheader('Content-Disposition')
1792 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
1794 e = determine_ext(m.group('filename'), default_ext=None)
1798 return mimetype2ext(getheader('Content-Type'))
def encode_data_uri(data, mime_type):
    """Build an RFC 2397 data: URI embedding `data` (bytes) as base64."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:{0};base64,{1}'.format(mime_type, payload)
# Decide whether content rated `content_limit` is blocked for a viewer
# restricted to `age_limit`; the 'return False' under the first guard is
# in an elided line.
1805 def age_restricted(content_limit, age_limit):
1806 """ Returns True iff the content should be blocked """
1808 if age_limit is None: # No limit set
1810 if content_limit is None:
1811 return False # Content available for everyone
1812 return age_limit < content_limit
# Sniff HTML by decoding the leading bytes (honouring a BOM when present)
# and checking for an opening '<'.  The BOMS list delimiters and the
# loop's break/else lines are elided.
1815 def is_html(first_bytes):
1816 """ Detect whether a file contains HTML by examining its first bytes. """
1819 (b'\xef\xbb\xbf', 'utf-8'),
1820 (b'\x00\x00\xfe\xff', 'utf-32-be'),
1821 (b'\xff\xfe\x00\x00', 'utf-32-le'),
1822 (b'\xff\xfe', 'utf-16-le'),
1823 (b'\xfe\xff', 'utf-16-be'),
1825 for bom, enc in BOMS:
1826 if first_bytes.startswith(bom):
1827 s = first_bytes[len(bom):].decode(enc, 'replace')
1830 s = first_bytes.decode('utf-8', 'replace')
1832 return re.match(r'^\s*<', s)
# Work out the download protocol for an info dict: explicit 'protocol'
# field first, then URL-scheme heuristics (rtmp/mms/rtsp, elided m3u8/f4m
# extension checks), finally the parsed URL scheme.
1835 def determine_protocol(info_dict):
1836 protocol = info_dict.get('protocol')
1837 if protocol is not None:
1840 url = info_dict['url']
1841 if url.startswith('rtmp'):
1843 elif url.startswith('mms'):
1845 elif url.startswith('rtsp'):
1848 ext = determine_ext(url)
1854 return compat_urllib_parse_urlparse(url).scheme
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    rows = [header_row] + data
    # Width of each column = longest cell in that column.
    widths = [max(len(compat_str(cell)) for cell in column)
              for column in zip(*rows)]
    # Left-align every column but the last, one space of padding between.
    col_specs = ['%-' + compat_str(width + 1) + 's' for width in widths[:-1]]
    format_str = ' '.join(col_specs) + '%s'
    return '\n'.join(format_str % tuple(row) for row in rows)
# Evaluate one clause of the --match-filter mini-language against dct:
# first try binary comparisons (key OP value, with '?' making missing keys
# pass), then unary presence tests ('key' / '!key').  The operator tables
# and several regex/guard lines are elided.
1865 def _match_one(filter_part, dct):
1866 COMPARISON_OPERATORS = {
1874 operator_rex = re.compile(r'''(?x)\s*
1876 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
1878 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
1879 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
1882 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
1883 m = operator_rex.search(filter_part)
1885 op = COMPARISON_OPERATORS[m.group('op')]
# String values only make sense with equality operators.
1886 if m.group('strval') is not None:
1887 if m.group('op') not in ('=', '!='):
1889 'Operator %s does not support string values!' % m.group('op'))
1890 comparison_value = m.group('strval')
# Numeric values: plain int first, then filesize suffixes ('500KiB',
# and bare '500K' retried with an implicit 'B').
1893 comparison_value = int(m.group('intval'))
1895 comparison_value = parse_filesize(m.group('intval'))
1896 if comparison_value is None:
1897 comparison_value = parse_filesize(m.group('intval') + 'B')
1898 if comparison_value is None:
1900 'Invalid integer value %r in filter part %r' % (
1901 m.group('intval'), filter_part))
1902 actual_value = dct.get(m.group('key'))
1903 if actual_value is None:
1904 return m.group('none_inclusive')
1905 return op(actual_value, comparison_value)
# Unary presence operators: '' -> key must exist, '!' -> key must not.
1908 '': lambda v: v is not None,
1909 '!': lambda v: v is None,
1911 operator_rex = re.compile(r'''(?x)\s*
1912 (?P<op>%s)\s*(?P<key>[a-z_]+)
1914 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
1915 m = operator_rex.search(filter_part)
1917 op = UNARY_OPERATORS[m.group('op')]
1918 actual_value = dct.get(m.group('key'))
1919 return op(actual_value)
1921 raise ValueError('Invalid filter part %r' % filter_part)
# AND together every '&'-separated clause via _match_one; the 'return all('
# opening line is elided.
1924 def match_str(filter_str, dct):
1925 """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
1928 _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
# Build a YoutubeDL match_filter callback: returns None to accept a video
# and a human-readable skip message otherwise.  (The 'return None' and the
# final 'return _match_func' lines are elided.)
1931 def match_filter_func(filter_str):
1932 def _match_func(info_dict):
1933 if match_str(filter_str, info_dict):
1936 video_title = info_dict.get('title', info_dict.get('id', 'video'))
1937 return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
# Parse a TTML/DFXP time expression ('12.3s' or 'HH:MM:SS.mmm') into
# seconds; empty-input guard and the 'if mobj:' lines are elided.
1941 def parse_dfxp_time_expr(time_expr):
1945 mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
1947 return float(mobj.group('time_offset'))
1949 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:\.\d+)?)$', time_expr)
1951 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3))
def srt_subtitles_timecode(seconds):
    """Format a (possibly fractional) number of seconds as an SRT timecode,
    e.g. 3661.5 -> '01:01:01,500'."""
    hours = seconds / 3600
    minutes = (seconds % 3600) / 60
    secs = seconds % 60
    millis = (seconds % 1) * 1000
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
# Convert DFXP/TTML subtitle XML into SRT text.  Handles both the modern
# 'ttml' and legacy 'ttaf1' namespaces plus un-namespaced tags; several
# structural lines (loop guards, final join/return) are elided.
1958 def dfxp2srt(dfxp_data):
1959 _x = functools.partial(xpath_with_ns, ns_map={
1960 'ttml': 'http://www.w3.org/ns/ttml',
1961 'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
# Recursively flatten a <p> element to plain text, turning <br> into
# newlines and serializing unknown children verbatim.
1964 def parse_node(node):
1965 str_or_empty = functools.partial(str_or_none, default='')
1967 out = str_or_empty(node.text)
1970 if child.tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
1971 out += '\n' + str_or_empty(child.tail)
1972 elif child.tag in (_x('ttml:span'), _x('ttaf1:span'), 'span'):
1973 out += str_or_empty(parse_node(child))
1975 out += str_or_empty(xml.etree.ElementTree.tostring(child))
1979 dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
1981 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p')
1984 raise ValueError('Invalid dfxp/TTML subtitle')
1986 for para, index in zip(paras, itertools.count(1)):
1987 begin_time = parse_dfxp_time_expr(para.attrib['begin'])
1988 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
# When 'end' is absent, derive it from 'dur' relative to 'begin'.
1990 end_time = begin_time + parse_dfxp_time_expr(para.attrib['dur'])
1991 out.append('%d\n%s --> %s\n%s\n\n' % (
1993 srt_subtitles_timecode(begin_time),
1994 srt_subtitles_timecode(end_time),
def cli_option(params, command_option, param):
    """Return [command_option, value] when params[param] is set, else []."""
    value = params.get(param)
    if value is None:
        return []
    return [command_option, value]
# Render a boolean param as CLI arguments, either joined with `separator`
# ('--opt=true') or as two tokens; the 'if separator:' line is elided.
2005 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
2006 param = params.get(param)
2007 assert isinstance(param, bool)
2009 return [command_option + separator + (true_value if param else false_value)]
2010 return [command_option, true_value if param else false_value]
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return [command_option] when params[param] equals expected_value, else []."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
# Fetch a list-valued param of extra CLI args; the None-> default and the
# final 'return ex_args' lines are elided.
2018 def cli_configuration_args(params, param, default=[]):
2019 ex_args = params.get(param)
2022 assert isinstance(ex_args, list)
# Language-code conversions between ISO 639-1 (2-letter) and ISO 639-2/T
# (3-letter).  The large _lang_map dict and the @classmethod decorators /
# 'return short_name' line are in elided lines.
2026 class ISO639Utils(object):
2027 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
2216 def short2long(cls, code):
2217 """Convert language code from ISO 639-1 to ISO 639-2/T"""
# Only the first two characters are significant (e.g. 'en-US' -> 'en').
2218 return cls._lang_map.get(code[:2])
2221 def long2short(cls, code):
2222 """Convert language code from ISO 639-2/T to ISO 639-1"""
2223 for short_name, long_name in cls._lang_map.items():
2224 if long_name == code:
2228 class ISO3166Utils(object):
2229 # From http://data.okfn.org/data/core/country-list
2231 'AF': 'Afghanistan',
2232 'AX': 'Ã…land Islands',
2235 'AS': 'American Samoa',
2240 'AG': 'Antigua and Barbuda',
2257 'BO': 'Bolivia, Plurinational State of',
2258 'BQ': 'Bonaire, Sint Eustatius and Saba',
2259 'BA': 'Bosnia and Herzegovina',
2261 'BV': 'Bouvet Island',
2263 'IO': 'British Indian Ocean Territory',
2264 'BN': 'Brunei Darussalam',
2266 'BF': 'Burkina Faso',
2272 'KY': 'Cayman Islands',
2273 'CF': 'Central African Republic',
2277 'CX': 'Christmas Island',
2278 'CC': 'Cocos (Keeling) Islands',
2282 'CD': 'Congo, the Democratic Republic of the',
2283 'CK': 'Cook Islands',
2285 'CI': 'Côte d\'Ivoire',
2290 'CZ': 'Czech Republic',
2294 'DO': 'Dominican Republic',
2297 'SV': 'El Salvador',
2298 'GQ': 'Equatorial Guinea',
2302 'FK': 'Falkland Islands (Malvinas)',
2303 'FO': 'Faroe Islands',
2307 'GF': 'French Guiana',
2308 'PF': 'French Polynesia',
2309 'TF': 'French Southern Territories',
2324 'GW': 'Guinea-Bissau',
2327 'HM': 'Heard Island and McDonald Islands',
2328 'VA': 'Holy See (Vatican City State)',
2335 'IR': 'Iran, Islamic Republic of',
2338 'IM': 'Isle of Man',
2348 'KP': 'Korea, Democratic People\'s Republic of',
2349 'KR': 'Korea, Republic of',
2352 'LA': 'Lao People\'s Democratic Republic',
2358 'LI': 'Liechtenstein',
2362 'MK': 'Macedonia, the Former Yugoslav Republic of',
2369 'MH': 'Marshall Islands',
2375 'FM': 'Micronesia, Federated States of',
2376 'MD': 'Moldova, Republic of',
2387 'NL': 'Netherlands',
2388 'NC': 'New Caledonia',
2389 'NZ': 'New Zealand',
2394 'NF': 'Norfolk Island',
2395 'MP': 'Northern Mariana Islands',
2400 'PS': 'Palestine, State of',
2402 'PG': 'Papua New Guinea',
2405 'PH': 'Philippines',
2409 'PR': 'Puerto Rico',
2413 'RU': 'Russian Federation',
2415 'BL': 'Saint Barthélemy',
2416 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
2417 'KN': 'Saint Kitts and Nevis',
2418 'LC': 'Saint Lucia',
2419 'MF': 'Saint Martin (French part)',
2420 'PM': 'Saint Pierre and Miquelon',
2421 'VC': 'Saint Vincent and the Grenadines',
2424 'ST': 'Sao Tome and Principe',
2425 'SA': 'Saudi Arabia',
2429 'SL': 'Sierra Leone',
2431 'SX': 'Sint Maarten (Dutch part)',
2434 'SB': 'Solomon Islands',
2436 'ZA': 'South Africa',
2437 'GS': 'South Georgia and the South Sandwich Islands',
2438 'SS': 'South Sudan',
2443 'SJ': 'Svalbard and Jan Mayen',
2446 'CH': 'Switzerland',
2447 'SY': 'Syrian Arab Republic',
2448 'TW': 'Taiwan, Province of China',
2450 'TZ': 'Tanzania, United Republic of',
2452 'TL': 'Timor-Leste',
2456 'TT': 'Trinidad and Tobago',
2459 'TM': 'Turkmenistan',
2460 'TC': 'Turks and Caicos Islands',
2464 'AE': 'United Arab Emirates',
2465 'GB': 'United Kingdom',
2466 'US': 'United States',
2467 'UM': 'United States Minor Outlying Islands',
2471 'VE': 'Venezuela, Bolivarian Republic of',
2473 'VG': 'Virgin Islands, British',
2474 'VI': 'Virgin Islands, U.S.',
2475 'WF': 'Wallis and Futuna',
2476 'EH': 'Western Sahara',
2483 def short2full(cls, code):
2484 """Convert an ISO 3166-2 country code to the corresponding full name"""
2485 return cls._country_map.get(code.upper())
# ProxyHandler that honours a per-request 'Ytdl-request-proxy' header,
# allowing individual requests to override (or disable, via '__noproxy__')
# the globally configured proxy.  The 'proxy = req_proxy' assignment is in
# an elided line.
2488 class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
2489 def __init__(self, proxies=None):
2490 # Set default handlers
# Register http/https openers that route through proxy_open with a
# '__noproxy__' sentinel so requests without a configured proxy still
# pass through the per-request override logic below.
2491 for type in ('http', 'https'):
2492 setattr(self, '%s_open' % type,
2493 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
2494 meth(r, proxy, type))
2495 return compat_urllib_request.ProxyHandler.__init__(self, proxies)
2497 def proxy_open(self, req, proxy, type):
2498 req_proxy = req.headers.get('Ytdl-request-proxy')
2499 if req_proxy is not None:
# The internal header must not leak to the remote server.
2501 del req.headers['Ytdl-request-proxy']
2503 if proxy == '__noproxy__':
2504 return None # No Proxy
2505 return compat_urllib_request.ProxyHandler.proxy_open(
2506 self, req, proxy, type)