2 # -*- coding: utf-8 -*-
4 from __future__ import unicode_literals
33 import xml.etree.ElementTree
39 compat_etree_fromstring,
44 compat_socket_create_connection,
48 compat_urllib_parse_urlparse,
49 compat_urllib_request,
55 # This is not clearly defined otherwise
56 compiled_regex_type = type(re.compile(''))
59 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)',
60 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
61 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
62 'Accept-Encoding': 'gzip, deflate',
63 'Accept-Language': 'en-us,en;q=0.5',
# Locale-independent English month names; month number == list index + 1.
# Consumed by month_by_name / month_by_abbreviation below.
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']
74 def preferredencoding():
75 """Get preferred encoding.
77 Returns the best encoding scheme for the system, based on
78 locale.getpreferredencoding() and some further tweaks.
81 pref = locale.getpreferredencoding()
89 def write_json_file(obj, fn):
90 """ Encode obj as JSON and write it to fn, atomically if possible """
92 fn = encodeFilename(fn)
93 if sys.version_info < (3, 0) and sys.platform != 'win32':
94 encoding = get_filesystem_encoding()
95 # os.path.basename returns a bytes object, but NamedTemporaryFile
96 # will fail if the filename contains non ascii characters unless we
97 # use a unicode object
98 path_basename = lambda f: os.path.basename(fn).decode(encoding)
99 # the same for os.path.dirname
100 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
102 path_basename = os.path.basename
103 path_dirname = os.path.dirname
107 'prefix': path_basename(fn) + '.',
108 'dir': path_dirname(fn),
112 # In Python 2.x, json.dump expects a bytestream.
113 # In Python 3.x, it writes to a character stream
114 if sys.version_info < (3, 0):
122 tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
127 if sys.platform == 'win32':
128 # Need to remove existing file on Windows, else os.rename raises
129 # WindowsError or FileExistsError.
134 os.rename(tf.name, fn)
143 if sys.version_info >= (2, 7):
144 def find_xpath_attr(node, xpath, key, val=None):
145 """ Find the xpath xpath[@key=val] """
146 assert re.match(r'^[a-zA-Z_-]+$', key)
148 assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
149 expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
150 return node.find(expr)
152 def find_xpath_attr(node, xpath, key, val=None):
153 # Here comes the crazy part: In 2.6, if the xpath is a unicode,
154 # .//node does not match if a node is a direct child of . !
155 if isinstance(xpath, compat_str):
156 xpath = xpath.encode('ascii')
158 for f in node.findall(xpath):
159 if key not in f.attrib:
161 if val is None or f.attrib.get(key) == val:
165 # On python2.6 the xml.etree.ElementTree.Element methods don't support
166 # the namespace parameter
169 def xpath_with_ns(path, ns_map):
170 components = [c.split(':') for c in path.split('/')]
174 replaced.append(c[0])
177 replaced.append('{%s}%s' % (ns_map[ns], tag))
178 return '/'.join(replaced)
181 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
182 def _find_xpath(xpath):
183 if sys.version_info < (2, 7): # Crazy 2.6
184 xpath = xpath.encode('ascii')
185 return node.find(xpath)
187 if isinstance(xpath, (str, compat_str)):
188 n = _find_xpath(xpath)
196 if default is not NO_DEFAULT:
199 name = xpath if name is None else name
200 raise ExtractorError('Could not find XML element %s' % name)
206 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
207 n = xpath_element(node, xpath, name, fatal=fatal, default=default)
208 if n is None or n == default:
211 if default is not NO_DEFAULT:
214 name = xpath if name is None else name
215 raise ExtractorError('Could not find XML element\'s text %s' % name)
221 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
222 n = find_xpath_attr(node, xpath, key)
224 if default is not NO_DEFAULT:
227 name = '%s[@%s]' % (xpath, key) if name is None else name
228 raise ExtractorError('Could not find XML attribute %s' % name)
def get_element_by_id(id, html):
    """Return the content of the tag whose id attribute equals the given
    value in the passed HTML document."""
    # Thin convenience wrapper over the generic attribute-based lookup.
    return get_element_by_attribute("id", id, html)
239 def get_element_by_attribute(attribute, value, html):
240 """Return the content of the tag with the specified attribute in the passed HTML document"""
242 m = re.search(r'''(?xs)
244 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
246 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
250 ''' % (re.escape(attribute), re.escape(value)), html)
254 res = m.group('content')
256 if res.startswith('"') or res.startswith("'"):
259 return unescapeHTML(res)
262 def clean_html(html):
263 """Clean an HTML snippet into a readable string"""
265 if html is None: # Convenience for sanitizing descriptions etc.
269 html = html.replace('\n', ' ')
270 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
271 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
273 html = re.sub('<.*?>', '', html)
274 # Replace html entities
275 html = unescapeHTML(html)
279 def sanitize_open(filename, open_mode):
280 """Try to open the given filename, and slightly tweak it if this fails.
282 Attempts to open the given filename. If this fails, it tries to change
283 the filename slightly, step by step, until it's either able to open it
284 or it fails and raises a final exception, like the standard open()
287 It returns the tuple (stream, definitive_file_name).
291 if sys.platform == 'win32':
293 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
294 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
295 stream = open(encodeFilename(filename), open_mode)
296 return (stream, filename)
297 except (IOError, OSError) as err:
298 if err.errno in (errno.EACCES,):
301 # In case of error, try to remove win32 forbidden chars
302 alt_filename = sanitize_path(filename)
303 if alt_filename == filename:
306 # An exception here should be caught in the caller
307 stream = open(encodeFilename(alt_filename), open_mode)
308 return (stream, alt_filename)
def timeconvert(timestr):
    """Convert an RFC 2822 defined time string into a system (UNIX) timestamp.

    Returns None when the string cannot be parsed.
    """
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is None:
        # Unparseable date string
        return None
    return email.utils.mktime_tz(timetuple)
320 def sanitize_filename(s, restricted=False, is_id=False):
321 """Sanitizes a string so it could be used as part of a filename.
322 If restricted is set, use a stricter subset of allowed characters.
323 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
325 def replace_insane(char):
326 if char == '?' or ord(char) < 32 or ord(char) == 127:
329 return '' if restricted else '\''
331 return '_-' if restricted else ' -'
332 elif char in '\\/|*<>':
334 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
336 if restricted and ord(char) > 127:
341 s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
342 result = ''.join(map(replace_insane, s))
344 while '__' in result:
345 result = result.replace('__', '_')
346 result = result.strip('_')
347 # Common case of "Foreign band name - English song title"
348 if restricted and result.startswith('-_'):
350 if result.startswith('-'):
351 result = '_' + result[len('-'):]
352 result = result.lstrip('.')
358 def sanitize_path(s):
359 """Sanitizes and normalizes path on Windows"""
360 if sys.platform != 'win32':
362 drive_or_unc, _ = os.path.splitdrive(s)
363 if sys.version_info < (2, 7) and not drive_or_unc:
364 drive_or_unc, _ = os.path.splitunc(s)
365 norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
369 path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part)
370 for path_part in norm_path]
372 sanitized_path.insert(0, drive_or_unc + os.path.sep)
373 return os.path.join(*sanitized_path)
# Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
# unwanted failures due to missing protocol
def sanitized_Request(url, *args, **kwargs):
    # Protocol-relative URLs ('//host/path') get an explicit http: scheme.
    if url.startswith('//'):
        url = 'http:%s' % url
    return compat_urllib_request.Request(url, *args, **kwargs)
383 def orderedSet(iterable):
384 """ Remove all duplicates from the input iterable """
392 def _htmlentity_transform(entity):
393 """Transforms an HTML entity to a character."""
394 # Known non-numeric HTML entity
395 if entity in compat_html_entities.name2codepoint:
396 return compat_chr(compat_html_entities.name2codepoint[entity])
398 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
400 numstr = mobj.group(1)
401 if numstr.startswith('x'):
403 numstr = '0%s' % numstr
406 # See https://github.com/rg3/youtube-dl/issues/7518
408 return compat_chr(int(numstr, base))
412 # Unknown entity in name, return its literal representation
413 return '&%s;' % entity
419 assert type(s) == compat_str
422 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
425 def get_subprocess_encoding():
426 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
427 # For subprocess calls, encode with locale encoding
428 # Refer to http://stackoverflow.com/a/9951851/35070
429 encoding = preferredencoding()
431 encoding = sys.getfilesystemencoding()
437 def encodeFilename(s, for_subprocess=False):
439 @param s The name of the file
442 assert type(s) == compat_str
444 # Python 3 has a Unicode API
445 if sys.version_info >= (3, 0):
448 # Pass '' directly to use Unicode APIs on Windows 2000 and up
449 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
450 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
451 if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
454 return s.encode(get_subprocess_encoding(), 'ignore')
457 def decodeFilename(b, for_subprocess=False):
459 if sys.version_info >= (3, 0):
462 if not isinstance(b, bytes):
465 return b.decode(get_subprocess_encoding(), 'ignore')
def encodeArgument(s):
    """Encode a command-line argument for subprocess use via encodeFilename.

    Byte strings coming from legacy code paths are decoded to text first.
    """
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, True)
def decodeArgument(b):
    """Decode a subprocess argument back to text (counterpart of encodeArgument)."""
    return decodeFilename(b, True)
def decodeOption(optval):
    """Decode a command-line option value to text.

    None is passed through; byte strings are decoded with the preferred
    locale encoding. The result is asserted to be a text string.
    """
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval
def formatSeconds(secs):
    """Format a duration in seconds as [H:]MM:SS (or plain seconds below a minute)."""
    if secs > 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs > 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
500 def make_HTTPS_handler(params, **kwargs):
501 opts_no_check_certificate = params.get('nocheckcertificate', False)
502 if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9
503 context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
504 if opts_no_check_certificate:
505 context.check_hostname = False
506 context.verify_mode = ssl.CERT_NONE
508 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
511 # (create_default_context present but HTTPSHandler has no context=)
514 if sys.version_info < (3, 2):
515 return YoutubeDLHTTPSHandler(params, **kwargs)
517 context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
518 context.verify_mode = (ssl.CERT_NONE
519 if opts_no_check_certificate
520 else ssl.CERT_REQUIRED)
521 context.set_default_verify_paths()
522 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
def bug_reports_message():
    """Return the standard 'please report this issue' message suffix.

    The update hint differs depending on whether this build of youtube-dl
    can update itself.
    """
    if ytdl_is_updateable():
        update_cmd = 'type youtube-dl -U to update'
    else:
        update_cmd = 'see https://yt-dl.org/update on how to update'
    msg = '; please report this issue on https://yt-dl.org/bug .'
    msg += ' Make sure you are using the latest version; %s.' % update_cmd
    msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
    return msg
536 class ExtractorError(Exception):
537 """Error during info extraction."""
539 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
540 """ tb, if given, is the original traceback (so that it can be printed out).
541 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
544 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
546 if video_id is not None:
547 msg = video_id + ': ' + msg
549 msg += ' (caused by %r)' % cause
551 msg += bug_reports_message()
552 super(ExtractorError, self).__init__(msg)
555 self.exc_info = sys.exc_info() # preserve original exception
557 self.video_id = video_id
559 def format_traceback(self):
560 if self.traceback is None:
562 return ''.join(traceback.format_tb(self.traceback))
class UnsupportedError(ExtractorError):
    """Raised when no extractor can handle the given URL."""
    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        # expected=True: this is a normal user-facing error, not a bug.
        super(UnsupportedError, self).__init__(message, expected=True)
572 class RegexNotFoundError(ExtractorError):
573 """Error when a regex didn't match"""
577 class DownloadError(Exception):
578 """Download Error exception.
580 This exception may be thrown by FileDownloader objects if they are not
581 configured to continue on errors. They will contain the appropriate
585 def __init__(self, msg, exc_info=None):
586 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
587 super(DownloadError, self).__init__(msg)
588 self.exc_info = exc_info
591 class SameFileError(Exception):
592 """Same File exception.
594 This exception will be thrown by FileDownloader objects if they detect
595 multiple files would have to be downloaded to the same file on disk.
600 class PostProcessingError(Exception):
601 """Post Processing exception.
603 This exception may be raised by PostProcessor's .run() method to
604 indicate an error in the postprocessing task.
607 def __init__(self, msg):
611 class MaxDownloadsReached(Exception):
612 """ --max-downloads limit has been reached. """
616 class UnavailableVideoError(Exception):
617 """Unavailable Format exception.
619 This exception will be thrown when a video is requested
620 in a format that is not available for that video.
625 class ContentTooShortError(Exception):
626 """Content Too Short exception.
628 This exception may be raised by FileDownloader objects when a file they
629 download is too small for what the server announced first, indicating
630 the connection was probably interrupted.
633 def __init__(self, downloaded, expected):
635 self.downloaded = downloaded
636 self.expected = expected
639 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
640 # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
641 # expected HTTP responses to meet HTTP/1.0 or later (see also
642 # https://github.com/rg3/youtube-dl/issues/6727)
643 if sys.version_info < (3, 0):
644 kwargs[b'strict'] = True
645 hc = http_class(*args, **kwargs)
646 source_address = ydl_handler._params.get('source_address')
647 if source_address is not None:
648 sa = (source_address, 0)
649 if hasattr(hc, 'source_address'): # Python 2.7+
650 hc.source_address = sa
652 def _hc_connect(self, *args, **kwargs):
653 sock = compat_socket_create_connection(
654 (self.host, self.port), self.timeout, sa)
656 self.sock = ssl.wrap_socket(
657 sock, self.key_file, self.cert_file,
658 ssl_version=ssl.PROTOCOL_TLSv1)
661 hc.connect = functools.partial(_hc_connect, hc)
def handle_youtubedl_headers(headers):
    """Process internal youtube-dl marker headers before a real request.

    If 'Youtubedl-no-compression' is present, remove it along with any
    Accept-Encoding header (so the request is made without compression);
    otherwise return the headers unchanged.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers
    filtered_headers = dict((k, v) for k, v in headers.items()
                            if k.lower() != 'accept-encoding')
    del filtered_headers['Youtubedl-no-compression']
    return filtered_headers
675 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
676 """Handler for HTTP requests and responses.
678 This class, when installed with an OpenerDirector, automatically adds
679 the standard headers to every HTTP request and handles gzipped and
680 deflated responses from web servers. If compression is to be avoided in
681 a particular request, the original request in the program code only has
682 to include the HTTP header "Youtubedl-no-compression", which will be
683 removed before making the real request.
685 Part of this code was copied from:
687 http://techknack.net/python-urllib2-handlers/
689 Andrew Rowls, the author of that code, agreed to release it to the
693 def __init__(self, params, *args, **kwargs):
694 compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
695 self._params = params
697 def http_open(self, req):
698 return self.do_open(functools.partial(
699 _create_http_connection, self, compat_http_client.HTTPConnection, False),
705 return zlib.decompress(data, -zlib.MAX_WBITS)
707 return zlib.decompress(data)
710 def addinfourl_wrapper(stream, headers, url, code):
711 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
712 return compat_urllib_request.addinfourl(stream, headers, url, code)
713 ret = compat_urllib_request.addinfourl(stream, headers, url)
717 def http_request(self, req):
718 # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
719 # always respected by websites, some tend to give out URLs with non percent-encoded
720 # non-ASCII characters (see telemb.py, ard.py [#3412])
721 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
722 # To work around aforementioned issue we will replace request's original URL with
723 # percent-encoded one
724 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
725 # the code of this workaround has been moved here from YoutubeDL.urlopen()
726 url = req.get_full_url()
727 url_escaped = escape_url(url)
729 # Substitute URL if any change after escaping
730 if url != url_escaped:
731 req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
733 url_escaped, data=req.data, headers=req.headers,
734 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
735 new_req.timeout = req.timeout
738 for h, v in std_headers.items():
739 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
740 # The dict keys are capitalized because of this bug by urllib
741 if h.capitalize() not in req.headers:
744 req.headers = handle_youtubedl_headers(req.headers)
746 if sys.version_info < (2, 7) and '#' in req.get_full_url():
747 # Python 2.6 is brain-dead when it comes to fragments
748 req._Request__original = req._Request__original.partition('#')[0]
749 req._Request__r_type = req._Request__r_type.partition('#')[0]
753 def http_response(self, req, resp):
756 if resp.headers.get('Content-encoding', '') == 'gzip':
757 content = resp.read()
758 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
760 uncompressed = io.BytesIO(gz.read())
761 except IOError as original_ioerror:
762 # There may be junk add the end of the file
763 # See http://stackoverflow.com/q/4928560/35070 for details
764 for i in range(1, 1024):
766 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
767 uncompressed = io.BytesIO(gz.read())
772 raise original_ioerror
773 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
774 resp.msg = old_resp.msg
776 if resp.headers.get('Content-encoding', '') == 'deflate':
777 gz = io.BytesIO(self.deflate(resp.read()))
778 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
779 resp.msg = old_resp.msg
780 # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
781 # https://github.com/rg3/youtube-dl/issues/6457).
782 if 300 <= resp.code < 400:
783 location = resp.headers.get('Location')
785 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
786 if sys.version_info >= (3, 0):
787 location = location.encode('iso-8859-1').decode('utf-8')
788 location_escaped = escape_url(location)
789 if location != location_escaped:
790 del resp.headers['Location']
791 resp.headers['Location'] = location_escaped
794 https_request = http_request
795 https_response = http_response
798 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
799 def __init__(self, params, https_conn_class=None, *args, **kwargs):
800 compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
801 self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
802 self._params = params
804 def https_open(self, req):
806 if hasattr(self, '_context'): # python > 2.6
807 kwargs['context'] = self._context
808 if hasattr(self, '_check_hostname'): # python 3.x
809 kwargs['check_hostname'] = self._check_hostname
810 return self.do_open(functools.partial(
811 _create_http_connection, self, self._https_conn_class, True),
815 class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
816 def __init__(self, cookiejar=None):
817 compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)
819 def http_response(self, request, response):
820 # Python 2 will choke on next HTTP request in row if there are non-ASCII
821 # characters in Set-Cookie HTTP header of last response (see
822 # https://github.com/rg3/youtube-dl/issues/6769).
823 # In order to at least prevent crashing we will percent encode Set-Cookie
824 # header before HTTPCookieProcessor starts processing it.
825 # if sys.version_info < (3, 0) and response.headers:
826 # for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
827 # set_cookie = response.headers.get(set_cookie_header)
829 # set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
830 # if set_cookie != set_cookie_escaped:
831 # del response.headers[set_cookie_header]
832 # response.headers[set_cookie_header] = set_cookie_escaped
833 return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)
835 https_request = compat_urllib_request.HTTPCookieProcessor.http_request
836 https_response = http_response
839 def parse_iso8601(date_str, delimiter='T', timezone=None):
840 """ Return a UNIX timestamp from the given date """
845 date_str = re.sub(r'\.[0-9]+', '', date_str)
849 r'(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
852 timezone = datetime.timedelta()
854 date_str = date_str[:-len(m.group(0))]
855 if not m.group('sign'):
856 timezone = datetime.timedelta()
858 sign = 1 if m.group('sign') == '+' else -1
859 timezone = datetime.timedelta(
860 hours=sign * int(m.group('hours')),
861 minutes=sign * int(m.group('minutes')))
863 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
864 dt = datetime.datetime.strptime(date_str, date_format) - timezone
865 return calendar.timegm(dt.timetuple())
870 def unified_strdate(date_str, day_first=True):
871 """Return a string with the date in the format YYYYMMDD"""
877 date_str = date_str.replace(',', ' ')
878 # %z (UTC offset) is only supported in python>=3.2
879 if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
880 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
881 # Remove AM/PM + timezone
882 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
884 format_expressions = [
889 '%b %dst %Y %I:%M%p',
890 '%b %dnd %Y %I:%M%p',
891 '%b %dth %Y %I:%M%p',
897 '%Y-%m-%d %H:%M:%S.%f',
900 '%Y-%m-%dT%H:%M:%SZ',
901 '%Y-%m-%dT%H:%M:%S.%fZ',
902 '%Y-%m-%dT%H:%M:%S.%f0Z',
904 '%Y-%m-%dT%H:%M:%S.%f',
908 format_expressions.extend([
916 format_expressions.extend([
923 for expression in format_expressions:
925 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
928 if upload_date is None:
929 timetuple = email.utils.parsedate_tz(date_str)
931 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
932 if upload_date is not None:
933 return compat_str(upload_date)
936 def determine_ext(url, default_ext='unknown_video'):
939 guess = url.partition('?')[0].rpartition('.')[2]
940 if re.match(r'^[A-Za-z0-9]+$', guess):
942 elif guess.rstrip('/') in (
943 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
944 'flv', 'f4v', 'f4a', 'f4b',
945 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
946 'mkv', 'mka', 'mk3d',
955 'f4f', 'f4m', 'm3u8', 'smil'):
956 return guess.rstrip('/')
def subtitles_filename(filename, sub_lang, sub_format):
    """Build the subtitle file name for a media file.

    The media file's extension is replaced by '<lang>.<format>'.
    """
    base = filename.rsplit('.', 1)[0]
    return '%s.%s.%s' % (base, sub_lang, sub_format)
965 def date_from_str(date_str):
967 Return a datetime object from a string in the format YYYYMMDD or
968 (now|today)[+-][0-9](day|week|month|year)(s)?"""
969 today = datetime.date.today()
970 if date_str in ('now', 'today'):
972 if date_str == 'yesterday':
973 return today - datetime.timedelta(days=1)
974 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
975 if match is not None:
976 sign = match.group('sign')
977 time = int(match.group('time'))
980 unit = match.group('unit')
981 # A bad aproximation?
989 delta = datetime.timedelta(**{unit: time})
991 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
def hyphenate_date(date_str):
    """Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format.

    Strings not matching that format are returned unchanged.
    """
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is not None:
        return '-'.join(match.groups())
    return date_str
1004 class DateRange(object):
1005 """Represents a time interval between two dates"""
1007 def __init__(self, start=None, end=None):
1008 """start and end must be strings in the format accepted by date"""
1009 if start is not None:
1010 self.start = date_from_str(start)
1012 self.start = datetime.datetime.min.date()
1014 self.end = date_from_str(end)
1016 self.end = datetime.datetime.max.date()
1017 if self.start > self.end:
1018 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
1022 """Returns a range that only contains the given day"""
1023 return cls(day, day)
1025 def __contains__(self, date):
1026 """Check if the date is in the range"""
1027 if not isinstance(date, datetime.date):
1028 date = date_from_str(date)
1029 return self.start <= date <= self.end
1032 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
def platform_name():
    """ Returns the platform name as a compat_str """
    res = platform.platform()
    if isinstance(res, bytes):
        # Some environments return a byte string; normalize to text.
        res = res.decode(preferredencoding())

    assert isinstance(res, compat_str)
    return res
1045 def _windows_write_string(s, out):
1046 """ Returns True if the string was written using special methods,
1047 False if it has yet to be written out."""
1048 # Adapted from http://stackoverflow.com/a/3259271/35070
1051 import ctypes.wintypes
1059 fileno = out.fileno()
1060 except AttributeError:
1061 # If the output stream doesn't have a fileno, it's virtual
1063 except io.UnsupportedOperation:
1064 # Some strange Windows pseudo files?
1066 if fileno not in WIN_OUTPUT_IDS:
1069 GetStdHandle = ctypes.WINFUNCTYPE(
1070 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
1071 (b"GetStdHandle", ctypes.windll.kernel32))
1072 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
1074 WriteConsoleW = ctypes.WINFUNCTYPE(
1075 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
1076 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
1077 ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
1078 written = ctypes.wintypes.DWORD(0)
1080 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
1081 FILE_TYPE_CHAR = 0x0002
1082 FILE_TYPE_REMOTE = 0x8000
1083 GetConsoleMode = ctypes.WINFUNCTYPE(
1084 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
1085 ctypes.POINTER(ctypes.wintypes.DWORD))(
1086 (b"GetConsoleMode", ctypes.windll.kernel32))
1087 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
1089 def not_a_console(handle):
1090 if handle == INVALID_HANDLE_VALUE or handle is None:
1092 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
1093 GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
1095 if not_a_console(h):
1098 def next_nonbmp_pos(s):
1100 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
1101 except StopIteration:
1105 count = min(next_nonbmp_pos(s), 1024)
1107 ret = WriteConsoleW(
1108 h, s, count if count else 2, ctypes.byref(written), None)
1110 raise OSError('Failed to write string')
1111 if not count: # We just wrote a non-BMP character
1112 assert written.value == 2
1115 assert written.value > 0
1116 s = s[written.value:]
1120 def write_string(s, out=None, encoding=None):
1123 assert type(s) == compat_str
1125 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
1126 if _windows_write_string(s, out):
1129 if ('b' in getattr(out, 'mode', '') or
1130 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
1131 byt = s.encode(encoding or preferredencoding(), 'ignore')
1133 elif hasattr(out, 'buffer'):
1134 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1135 byt = s.encode(enc, 'ignore')
1136 out.buffer.write(byt)
def bytes_to_intlist(bs):
    """Convert a byte string to a list of integer byte values.

    Works on both Python 2 (str of chars) and Python 3 (bytes).
    """
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3
        return list(bs)
    else:
        return [ord(c) for c in bs]
def intlist_to_bytes(xs):
    """Convert a list of integer byte values back to a byte string.

    The inverse of bytes_to_intlist; an empty list yields b''.
    """
    if not xs:
        return b''
    return struct_pack('%dB' % len(xs), *xs)
1157 # Cross-platform file locking
1158 if sys.platform == 'win32':
1159 import ctypes.wintypes
1162 class OVERLAPPED(ctypes.Structure):
1164 ('Internal', ctypes.wintypes.LPVOID),
1165 ('InternalHigh', ctypes.wintypes.LPVOID),
1166 ('Offset', ctypes.wintypes.DWORD),
1167 ('OffsetHigh', ctypes.wintypes.DWORD),
1168 ('hEvent', ctypes.wintypes.HANDLE),
1171 kernel32 = ctypes.windll.kernel32
1172 LockFileEx = kernel32.LockFileEx
1173 LockFileEx.argtypes = [
1174 ctypes.wintypes.HANDLE, # hFile
1175 ctypes.wintypes.DWORD, # dwFlags
1176 ctypes.wintypes.DWORD, # dwReserved
1177 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1178 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1179 ctypes.POINTER(OVERLAPPED) # Overlapped
1181 LockFileEx.restype = ctypes.wintypes.BOOL
1182 UnlockFileEx = kernel32.UnlockFileEx
1183 UnlockFileEx.argtypes = [
1184 ctypes.wintypes.HANDLE, # hFile
1185 ctypes.wintypes.DWORD, # dwReserved
1186 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1187 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1188 ctypes.POINTER(OVERLAPPED) # Overlapped
1190 UnlockFileEx.restype = ctypes.wintypes.BOOL
1191 whole_low = 0xffffffff
1192 whole_high = 0x7fffffff
1194 def _lock_file(f, exclusive):
1195 overlapped = OVERLAPPED()
1196 overlapped.Offset = 0
1197 overlapped.OffsetHigh = 0
1198 overlapped.hEvent = 0
1199 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1200 handle = msvcrt.get_osfhandle(f.fileno())
1201 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
1202 whole_low, whole_high, f._lock_file_overlapped_p):
1203 raise OSError('Locking file failed: %r' % ctypes.FormatError())
1205 def _unlock_file(f):
1206 assert f._lock_file_overlapped_p
1207 handle = msvcrt.get_osfhandle(f.fileno())
1208 if not UnlockFileEx(handle, 0,
1209 whole_low, whole_high, f._lock_file_overlapped_p):
1210 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1215 def _lock_file(f, exclusive):
1216 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1218 def _unlock_file(f):
1219 fcntl.flock(f, fcntl.LOCK_UN)
1222 class locked_file(object):
1223 def __init__(self, filename, mode, encoding=None):
1224 assert mode in ['r', 'a', 'w']
1225 self.f = io.open(filename, mode, encoding=encoding)
1228 def __enter__(self):
1229 exclusive = self.mode != 'r'
1231 _lock_file(self.f, exclusive)
1237 def __exit__(self, etype, value, traceback):
1239 _unlock_file(self.f)
1246 def write(self, *args):
1247 return self.f.write(*args)
1249 def read(self, *args):
1250 return self.f.read(*args)
def get_filesystem_encoding():
    """Return the filesystem encoding, falling back to 'utf-8' when unknown."""
    enc = sys.getfilesystemencoding()
    return 'utf-8' if enc is None else enc
1258 def shell_quote(args):
1260 encoding = get_filesystem_encoding()
1262 if isinstance(a, bytes):
1263 # We may get a filename encoded with 'encodeFilename'
1264 a = a.decode(encoding)
1265 quoted_args.append(pipes.quote(a))
1266 return ' '.join(quoted_args)
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # Serialize the payload as JSON and hide it in the URL fragment, where
    # servers never see it; unsmuggle_url() performs the inverse.
    payload = compat_urllib_parse.urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return '%s#%s' % (url, payload)
# Inverse of smuggle_url(): extract data hidden in the '#' fragment.
1277 def unsmuggle_url(smug_url, default=None):
1278 if '#__youtubedl_smuggle' not in smug_url:
1279 return smug_url, default
# Everything after the last '#' is the urlencoded smuggled payload.
1280 url, _, sdata = smug_url.rpartition('#')
1281 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
1282 data = json.loads(jsond)
1286 def format_bytes(bytes):
1289 if type(bytes) is str:
1290 bytes = float(bytes)
1294 exponent = int(math.log(bytes, 1024.0))
1295 suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
1296 converted = float(bytes) / float(1024 ** exponent)
1297 return '%.2f%s' % (converted, suffix)
1300 def parse_filesize(s):
1304 # The lower-case forms are of course incorrect and inofficial,
1305 # but we support those too
1343 units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
1345 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
1349 num_str = m.group('num').replace(',', '.')
1350 mult = _UNIT_TABLE[m.group('unit')]
1351 return int(float(num_str) * mult)
1354 def month_by_name(name):
1355 """ Return the number of a month by (locale-independently) English name """
# list.index() is 0-based while month numbers are 1-based.
1358 return ENGLISH_MONTH_NAMES.index(name) + 1
1363 def month_by_abbreviation(abbrev):
1364 """ Return the number of a month by (locale-independently) English
# Match against the first three letters of each English month name (Jan, Feb, ...).
1368 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
1373 def fix_xml_ampersands(xml_str):
1374 """Replace all the '&' by '&amp;' in XML"""
# Negative lookahead keeps already-escaped entities and character references intact.
1376 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1381 def setproctitle(title):
1382 assert isinstance(title, compat_str)
1384 libc = ctypes.cdll.LoadLibrary("libc.so.6")
1387 title_bytes = title.encode('utf-8')
1388 buf = ctypes.create_string_buffer(len(title_bytes))
1389 buf.value = title_bytes
1391 libc.prctl(15, buf, 0, 0, 0)
1392 except AttributeError:
1393 return # Strange libc, just skip this
# Strip a leading prefix from s when present (pre-3.9 str.removeprefix).
1396 def remove_start(s, start):
1397 if s.startswith(start):
1398 return s[len(start):]
# Strip a trailing suffix from s; the endswith guard is elided from this view.
1402 def remove_end(s, end):
1404 return s[:-len(end)]
def url_basename(url):
    """Return the last path component of url (query and fragment excluded)."""
    # urlparse() already drops the query string and fragment from .path.
    components = compat_urlparse.urlparse(url).path.strip('/').split('/')
    return components[-1]
1413 class HEADRequest(compat_urllib_request.Request):
1414 def get_method(self):
1418 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
1421 v = getattr(v, get_attr, None)
1427 return int(v) * invscale // scale
def str_or_none(v, default=None):
    """Convert v to a text string, or return default when v is None."""
    if v is None:
        return default
    return compat_str(v)
1436 def str_to_int(int_str):
1437 """ A more relaxed version of int_or_none """
1440 int_str = re.sub(r'[,\.\+]', '', int_str)
1444 def float_or_none(v, scale=1, invscale=1, default=None):
1448 return float(v) * invscale / scale
1453 def parse_duration(s):
1454 if not isinstance(s, compat_basestring):
1462 (?P<only_mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*|
1463 (?P<only_hours>[0-9.]+)\s*(?:hours?)|
1465 \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?\.?|minutes?)\s*|
1468 (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
1469 (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
1471 (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
1473 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
1478 if m.group('only_mins'):
1479 return float_or_none(m.group('only_mins'), invscale=60)
1480 if m.group('only_hours'):
1481 return float_or_none(m.group('only_hours'), invscale=60 * 60)
1483 res += int(m.group('secs'))
1484 if m.group('mins_reversed'):
1485 res += int(m.group('mins_reversed')) * 60
1487 res += int(m.group('mins')) * 60
1488 if m.group('hours'):
1489 res += int(m.group('hours')) * 60 * 60
1490 if m.group('hours_reversed'):
1491 res += int(m.group('hours_reversed')) * 60 * 60
1493 res += int(m.group('days')) * 24 * 60 * 60
1495 res += float(m.group('ms'))
# Insert ext before the real extension: "x.mp4" + "temp" -> "x.temp.mp4";
# when expected_real_ext is given but does not match, append instead.
1499 def prepend_extension(filename, ext, expected_real_ext=None):
1500 name, real_ext = os.path.splitext(filename)
1502 '{0}.{1}{2}'.format(name, ext, real_ext)
# real_ext includes the leading dot, hence the [1:] comparison.
1503 if not expected_real_ext or real_ext[1:] == expected_real_ext
1504 else '{0}.{1}'.format(filename, ext))
# Swap the file extension for ext; when expected_real_ext is given and does
# not match the actual one, append the new extension instead of replacing.
1507 def replace_extension(filename, ext, expected_real_ext=None):
1508 name, real_ext = os.path.splitext(filename)
1509 return '{0}.{1}'.format(
# real_ext carries a leading dot, so compare against its tail.
1510 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
1514 def check_executable(exe, args=[]):
1515 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1516 args can be a list of arguments for a short output (like -version) """
1518 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1524 def get_exe_version(exe, args=['--version'],
1525 version_re=None, unrecognized='present'):
1526 """ Returns the version of the specified executable,
1527 or False if the executable is not present """
1529 out, _ = subprocess.Popen(
1530 [encodeArgument(exe)] + args,
1531 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
1534 if isinstance(out, bytes): # Python 2.x
1535 out = out.decode('ascii', 'ignore')
1536 return detect_exe_version(out, version_re, unrecognized)
1539 def detect_exe_version(output, version_re=None, unrecognized='present'):
1540 assert isinstance(output, compat_str)
1541 if version_re is None:
1542 version_re = r'version\s+([-0-9._a-zA-Z]+)'
1543 m = re.search(version_re, output)
1550 class PagedList(object):
1552 # This is only useful for tests
1553 return len(self.getslice())
1556 class OnDemandPagedList(PagedList):
1557 def __init__(self, pagefunc, pagesize):
1558 self._pagefunc = pagefunc
1559 self._pagesize = pagesize
1561 def getslice(self, start=0, end=None):
1563 for pagenum in itertools.count(start // self._pagesize):
1564 firstid = pagenum * self._pagesize
1565 nextfirstid = pagenum * self._pagesize + self._pagesize
1566 if start >= nextfirstid:
1569 page_results = list(self._pagefunc(pagenum))
1572 start % self._pagesize
1573 if firstid <= start < nextfirstid
1577 ((end - 1) % self._pagesize) + 1
1578 if (end is not None and firstid <= end <= nextfirstid)
1581 if startv != 0 or endv is not None:
1582 page_results = page_results[startv:endv]
1583 res.extend(page_results)
1585 # A little optimization - if current page is not "full", ie. does
1586 # not contain page_size videos then we can assume that this page
1587 # is the last one - there are no more ids on further pages -
1588 # i.e. no need to query again.
1589 if len(page_results) + startv < self._pagesize:
1592 # If we got the whole page, but the next page is not interesting,
1593 # break out early as well
1594 if end == nextfirstid:
1599 class InAdvancePagedList(PagedList):
1600 def __init__(self, pagefunc, pagecount, pagesize):
1601 self._pagefunc = pagefunc
1602 self._pagecount = pagecount
1603 self._pagesize = pagesize
1605 def getslice(self, start=0, end=None):
1607 start_page = start // self._pagesize
1609 self._pagecount if end is None else (end // self._pagesize + 1))
1610 skip_elems = start - start_page * self._pagesize
1611 only_more = None if end is None else end - start
1612 for pagenum in range(start_page, end_page):
1613 page = list(self._pagefunc(pagenum))
1615 page = page[skip_elems:]
1617 if only_more is not None:
1618 if len(page) < only_more:
1619 only_more -= len(page)
1621 page = page[:only_more]
1628 def uppercase_escape(s):
1629 unicode_escape = codecs.getdecoder('unicode_escape')
1631 r'\\U[0-9a-fA-F]{8}',
1632 lambda m: unicode_escape(m.group(0))[0],
1636 def lowercase_escape(s):
1637 unicode_escape = codecs.getdecoder('unicode_escape')
1639 r'\\u[0-9a-fA-F]{4}',
1640 lambda m: unicode_escape(m.group(0))[0],
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # On Python 2, quote() mishandles unicode input, so encode to UTF-8 first.
    if sys.version_info < (3, 0) and isinstance(s, compat_str):
        s = s.encode('utf-8')
    # Reserved/sub-delimiter characters that must survive unescaped.
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe_chars)
1651 def escape_url(url):
1652 """Escape URL as suggested by RFC 3986"""
1653 url_parsed = compat_urllib_parse_urlparse(url)
1654 return url_parsed._replace(
1655 path=escape_rfc3986(url_parsed.path),
1656 params=escape_rfc3986(url_parsed.params),
1657 query=escape_rfc3986(url_parsed.query),
1658 fragment=escape_rfc3986(url_parsed.fragment)
1662 struct.pack('!I', 0)
1664 # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
1665 def struct_pack(spec, *args):
1666 if isinstance(spec, compat_str):
1667 spec = spec.encode('ascii')
1668 return struct.pack(spec, *args)
1670 def struct_unpack(spec, *args):
1671 if isinstance(spec, compat_str):
1672 spec = spec.encode('ascii')
1673 return struct.unpack(spec, *args)
1675 struct_pack = struct.pack
1676 struct_unpack = struct.unpack
1679 def read_batch_urls(batch_fd):
1681 if not isinstance(url, compat_str):
1682 url = url.decode('utf-8', 'replace')
1683 BOM_UTF8 = '\xef\xbb\xbf'
1684 if url.startswith(BOM_UTF8):
1685 url = url[len(BOM_UTF8):]
1687 if url.startswith(('#', ';', ']')):
1691 with contextlib.closing(batch_fd) as fd:
1692 return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes for urllib."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1699 def encode_dict(d, encoding='utf-8'):
1701 return v.encode(encoding) if isinstance(v, compat_basestring) else v
1702 return dict((encode(k), encode(v)) for k, v in d.items())
1714 def parse_age_limit(s):
1717 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
1718 return int(m.group('age')) if m else US_RATINGS.get(s, None)
1721 def strip_jsonp(code):
1723 r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
1726 def js_to_json(code):
1729 if v in ('true', 'false', 'null'):
1731 if v.startswith('"'):
1732 v = re.sub(r"\\'", "'", v[1:-1])
1733 elif v.startswith("'"):
1735 v = re.sub(r"\\\\|\\'|\"", lambda m: {
1742 res = re.sub(r'''(?x)
1743 "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
1744 '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
1745 [a-zA-Z_][.a-zA-Z_0-9]*
1747 res = re.sub(r',(\s*[\]}])', lambda m: m.group(1), res)
1751 def qualities(quality_ids):
1752 """ Get a numeric quality value out of a list of possible values """
1755 return quality_ids.index(qid)
1761 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1764 def limit_length(s, length):
1765 """ Add ellipses to overly long strings """
1770 return s[:length - len(ELLIPSES)] + ELLIPSES
def version_tuple(v):
    """Split a dotted/dashed version string into a tuple of ints."""
    # Both '.' and '-' act as separators (e.g. '2015-01-10' release tags).
    return tuple(map(int, re.split(r'[-.]', v)))
1778 def is_outdated_version(version, limit, assume_new=True):
1780 return not assume_new
1782 return version_tuple(version) < version_tuple(limit)
1784 return not assume_new
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter
    # Updateable when running from a zip bundle or a frozen (py2exe) build.
    loader = globals().get('__loader__')
    if isinstance(loader, zipimporter):
        return True
    return hasattr(sys, 'frozen')
def args_to_str(args):
    # Get a short string representation for a subprocess command
    return ' '.join(map(shlex_quote, args))
1799 def mimetype2ext(mt):
1800 _, _, res = mt.rpartition('/')
1804 'x-mp4-fragmented': 'mp4',
1809 def urlhandle_detect_ext(url_handle):
1812 getheader = lambda h: url_handle.headers[h]
1813 except AttributeError: # Python < 3
1814 getheader = url_handle.info().getheader
1816 cd = getheader('Content-Disposition')
1818 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
1820 e = determine_ext(m.group('filename'), default_ext=None)
1824 return mimetype2ext(getheader('Content-Type'))
def encode_data_uri(data, mime_type):
    """Return data (bytes) as an RFC 2397 data: URI with a base64 payload."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:{0};base64,{1}'.format(mime_type, payload)
1831 def age_restricted(content_limit, age_limit):
1832 """ Returns True iff the content should be blocked """
1834 if age_limit is None: # No limit set
1836 if content_limit is None:
1837 return False # Content available for everyone
# Block only when the viewer's allowed age is below the content's rating.
1838 return age_limit < content_limit
1841 def is_html(first_bytes):
1842 """ Detect whether a file contains HTML by examining its first bytes. """
1845 (b'\xef\xbb\xbf', 'utf-8'),
1846 (b'\x00\x00\xfe\xff', 'utf-32-be'),
1847 (b'\xff\xfe\x00\x00', 'utf-32-le'),
1848 (b'\xff\xfe', 'utf-16-le'),
1849 (b'\xfe\xff', 'utf-16-be'),
1851 for bom, enc in BOMS:
1852 if first_bytes.startswith(bom):
1853 s = first_bytes[len(bom):].decode(enc, 'replace')
1856 s = first_bytes.decode('utf-8', 'replace')
1858 return re.match(r'^\s*<', s)
1861 def determine_protocol(info_dict):
1862 protocol = info_dict.get('protocol')
1863 if protocol is not None:
1866 url = info_dict['url']
1867 if url.startswith('rtmp'):
1869 elif url.startswith('mms'):
1871 elif url.startswith('rtsp'):
1874 ext = determine_ext(url)
1880 return compat_urllib_parse_urlparse(url).scheme
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    rows = [header_row] + data
    # Widest cell per column determines that column's padding.
    widths = [max(len(compat_str(cell)) for cell in column)
              for column in zip(*rows)]
    # Left-align every column but let the last one run free.
    fmt = ' '.join('%-' + compat_str(w + 1) + 's' for w in widths[:-1]) + '%s'
    return '\n'.join(fmt % tuple(row) for row in rows)
1891 def _match_one(filter_part, dct):
1892 COMPARISON_OPERATORS = {
1900 operator_rex = re.compile(r'''(?x)\s*
1902 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
1904 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
1905 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
1908 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
1909 m = operator_rex.search(filter_part)
1911 op = COMPARISON_OPERATORS[m.group('op')]
1912 if m.group('strval') is not None:
1913 if m.group('op') not in ('=', '!='):
1915 'Operator %s does not support string values!' % m.group('op'))
1916 comparison_value = m.group('strval')
1919 comparison_value = int(m.group('intval'))
1921 comparison_value = parse_filesize(m.group('intval'))
1922 if comparison_value is None:
1923 comparison_value = parse_filesize(m.group('intval') + 'B')
1924 if comparison_value is None:
1926 'Invalid integer value %r in filter part %r' % (
1927 m.group('intval'), filter_part))
1928 actual_value = dct.get(m.group('key'))
1929 if actual_value is None:
1930 return m.group('none_inclusive')
1931 return op(actual_value, comparison_value)
1934 '': lambda v: v is not None,
1935 '!': lambda v: v is None,
1937 operator_rex = re.compile(r'''(?x)\s*
1938 (?P<op>%s)\s*(?P<key>[a-z_]+)
1940 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
1941 m = operator_rex.search(filter_part)
1943 op = UNARY_OPERATORS[m.group('op')]
1944 actual_value = dct.get(m.group('key'))
1945 return op(actual_value)
1947 raise ValueError('Invalid filter part %r' % filter_part)
1950 def match_str(filter_str, dct):
1951 """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
1954 _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
# Build a callback for --match-filter: the inner function returns None to
# accept a video, or a human-readable message explaining why it is skipped.
1957 def match_filter_func(filter_str):
1958 def _match_func(info_dict):
1959 if match_str(filter_str, info_dict):
1962 video_title = info_dict.get('title', info_dict.get('id', 'video'))
1963 return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
1967 def parse_dfxp_time_expr(time_expr):
1971 mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
1973 return float(mobj.group('time_offset'))
1975 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:\.\d+)?)$', time_expr)
1977 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3))
def srt_subtitles_timecode(seconds):
    """Format a duration in seconds as an SRT timecode (HH:MM:SS,mmm)."""
    hours = seconds / 3600
    minutes = (seconds % 3600) / 60
    secs = seconds % 60
    # %d truncates, so the fractional part becomes whole milliseconds.
    millis = (seconds % 1) * 1000
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
1984 def dfxp2srt(dfxp_data):
1985 _x = functools.partial(xpath_with_ns, ns_map={
1986 'ttml': 'http://www.w3.org/ns/ttml',
1987 'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
1990 def parse_node(node):
1991 str_or_empty = functools.partial(str_or_none, default='')
1993 out = str_or_empty(node.text)
1996 if child.tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
1997 out += '\n' + str_or_empty(child.tail)
1998 elif child.tag in (_x('ttml:span'), _x('ttaf1:span'), 'span'):
1999 out += str_or_empty(parse_node(child))
2001 out += str_or_empty(xml.etree.ElementTree.tostring(child))
2005 dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
2007 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p')
2010 raise ValueError('Invalid dfxp/TTML subtitle')
2012 for para, index in zip(paras, itertools.count(1)):
2013 begin_time = parse_dfxp_time_expr(para.attrib['begin'])
2014 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
2016 end_time = begin_time + parse_dfxp_time_expr(para.attrib['dur'])
2017 out.append('%d\n%s --> %s\n%s\n\n' % (
2019 srt_subtitles_timecode(begin_time),
2020 srt_subtitles_timecode(end_time),
def cli_option(params, command_option, param):
    """Return [command_option, value] when params[param] is set, else []."""
    value = params.get(param)
    if value is None:
        return []
    return [command_option, value]
# Render a boolean param as CLI argv: with a separator one fused token
# ("opt=value"), otherwise two tokens ("opt", "value").
2031 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
2032 param = params.get(param)
# The option must be present and strictly boolean; missing keys are a bug here.
2033 assert isinstance(param, bool)
2035 return [command_option + separator + (true_value if param else false_value)]
2036 return [command_option, true_value if param else false_value]
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Emit the bare flag when params[param] equals expected_value, else []."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
# Fetch pre-split extra CLI arguments from params for an external tool.
# NOTE(review): the mutable default [] is shared across calls — harmless only
# as long as callers never mutate the returned default.
2044 def cli_configuration_args(params, param, default=[]):
2045 ex_args = params.get(param)
# Extra args must already be a list of argv strings suitable for subprocess.
2048 assert isinstance(ex_args, list)
2052 class ISO639Utils(object):
2053 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
2242 def short2long(cls, code):
2243 """Convert language code from ISO 639-1 to ISO 639-2/T"""
# Only the first two characters matter for a 639-1 code; unknown -> None.
2244 return cls._lang_map.get(code[:2])
2247 def long2short(cls, code):
2248 """Convert language code from ISO 639-2/T to ISO 639-1"""
# Linear reverse lookup over the short->long mapping.
2249 for short_name, long_name in cls._lang_map.items():
2250 if long_name == code:
2254 class ISO3166Utils(object):
2255 # From http://data.okfn.org/data/core/country-list
2257 'AF': 'Afghanistan',
2258 'AX': 'Ã…land Islands',
2261 'AS': 'American Samoa',
2266 'AG': 'Antigua and Barbuda',
2283 'BO': 'Bolivia, Plurinational State of',
2284 'BQ': 'Bonaire, Sint Eustatius and Saba',
2285 'BA': 'Bosnia and Herzegovina',
2287 'BV': 'Bouvet Island',
2289 'IO': 'British Indian Ocean Territory',
2290 'BN': 'Brunei Darussalam',
2292 'BF': 'Burkina Faso',
2298 'KY': 'Cayman Islands',
2299 'CF': 'Central African Republic',
2303 'CX': 'Christmas Island',
2304 'CC': 'Cocos (Keeling) Islands',
2308 'CD': 'Congo, the Democratic Republic of the',
2309 'CK': 'Cook Islands',
2311 'CI': 'Côte d\'Ivoire',
2316 'CZ': 'Czech Republic',
2320 'DO': 'Dominican Republic',
2323 'SV': 'El Salvador',
2324 'GQ': 'Equatorial Guinea',
2328 'FK': 'Falkland Islands (Malvinas)',
2329 'FO': 'Faroe Islands',
2333 'GF': 'French Guiana',
2334 'PF': 'French Polynesia',
2335 'TF': 'French Southern Territories',
2350 'GW': 'Guinea-Bissau',
2353 'HM': 'Heard Island and McDonald Islands',
2354 'VA': 'Holy See (Vatican City State)',
2361 'IR': 'Iran, Islamic Republic of',
2364 'IM': 'Isle of Man',
2374 'KP': 'Korea, Democratic People\'s Republic of',
2375 'KR': 'Korea, Republic of',
2378 'LA': 'Lao People\'s Democratic Republic',
2384 'LI': 'Liechtenstein',
2388 'MK': 'Macedonia, the Former Yugoslav Republic of',
2395 'MH': 'Marshall Islands',
2401 'FM': 'Micronesia, Federated States of',
2402 'MD': 'Moldova, Republic of',
2413 'NL': 'Netherlands',
2414 'NC': 'New Caledonia',
2415 'NZ': 'New Zealand',
2420 'NF': 'Norfolk Island',
2421 'MP': 'Northern Mariana Islands',
2426 'PS': 'Palestine, State of',
2428 'PG': 'Papua New Guinea',
2431 'PH': 'Philippines',
2435 'PR': 'Puerto Rico',
2439 'RU': 'Russian Federation',
2441 'BL': 'Saint Barthélemy',
2442 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
2443 'KN': 'Saint Kitts and Nevis',
2444 'LC': 'Saint Lucia',
2445 'MF': 'Saint Martin (French part)',
2446 'PM': 'Saint Pierre and Miquelon',
2447 'VC': 'Saint Vincent and the Grenadines',
2450 'ST': 'Sao Tome and Principe',
2451 'SA': 'Saudi Arabia',
2455 'SL': 'Sierra Leone',
2457 'SX': 'Sint Maarten (Dutch part)',
2460 'SB': 'Solomon Islands',
2462 'ZA': 'South Africa',
2463 'GS': 'South Georgia and the South Sandwich Islands',
2464 'SS': 'South Sudan',
2469 'SJ': 'Svalbard and Jan Mayen',
2472 'CH': 'Switzerland',
2473 'SY': 'Syrian Arab Republic',
2474 'TW': 'Taiwan, Province of China',
2476 'TZ': 'Tanzania, United Republic of',
2478 'TL': 'Timor-Leste',
2482 'TT': 'Trinidad and Tobago',
2485 'TM': 'Turkmenistan',
2486 'TC': 'Turks and Caicos Islands',
2490 'AE': 'United Arab Emirates',
2491 'GB': 'United Kingdom',
2492 'US': 'United States',
2493 'UM': 'United States Minor Outlying Islands',
2497 'VE': 'Venezuela, Bolivarian Republic of',
2499 'VG': 'Virgin Islands, British',
2500 'VI': 'Virgin Islands, U.S.',
2501 'WF': 'Wallis and Futuna',
2502 'EH': 'Western Sahara',
2509 def short2full(cls, code):
2510 """Convert an ISO 3166-2 country code to the corresponding full name"""
# Keys in _country_map are upper-case; normalize before lookup. Unknown -> None.
2511 return cls._country_map.get(code.upper())
2514 class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
2515 def __init__(self, proxies=None):
2516 # Set default handlers
2517 for type in ('http', 'https'):
2518 setattr(self, '%s_open' % type,
2519 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
2520 meth(r, proxy, type))
2521 return compat_urllib_request.ProxyHandler.__init__(self, proxies)
2523 def proxy_open(self, req, proxy, type):
2524 req_proxy = req.headers.get('Ytdl-request-proxy')
2525 if req_proxy is not None:
2527 del req.headers['Ytdl-request-proxy']
2529 if proxy == '__noproxy__':
2530 return None # No Proxy
2531 return compat_urllib_request.ProxyHandler.proxy_open(
2532 self, req, proxy, type)