2 # -*- coding: utf-8 -*-
4 from __future__ import unicode_literals
33 import xml.etree.ElementTree
39 compat_etree_fromstring,
44 compat_socket_create_connection,
48 compat_urllib_parse_urlparse,
49 compat_urllib_request,
# Type of a compiled regular expression; `re` does not expose it directly
# on older Pythons, so derive it for isinstance checks elsewhere.
# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))

    # Default HTTP request headers (interior of the std_headers dict).
    # The browser-like User-Agent reduces the chance of crippled pages.
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
# English month names, used for locale-independent date parsing
# (see month_by_name / month_by_abbreviation below).
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    pref = locale.getpreferredencoding()
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        path_basename = lambda f: os.path.basename(fn).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
        # Plain helpers suffice when no decoding is needed.
        path_basename = os.path.basename
        path_dirname = os.path.dirname

        # NamedTemporaryFile arguments: keep the temp file next to the
        # target so the final os.rename stays on one filesystem.
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    if sys.platform == 'win32':
        # Need to remove existing file on Windows, else os.rename raises
        # WindowsError or FileExistsError.
    os.rename(tf.name, fn)
# Two implementations: Python >= 2.7 can express the attribute test as an
# xpath predicate; the 2.6 fallback scans candidate nodes by hand.
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] """
        assert re.match(r'^[a-zA-Z_-]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
        expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
        return node.find(expr)

    def find_xpath_attr(node, xpath, key, val=None):
        # Here comes the crazy part: In 2.6, if the xpath is a unicode,
        # .//node does not match if a node is a direct child of . !
        if isinstance(xpath, compat_str):
            xpath = xpath.encode('ascii')

        for f in node.findall(xpath):
            if key not in f.attrib:
            if val is None or f.attrib.get(key) == val:
# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    """Expand 'ns:tag' steps of *path* to '{uri}tag' using *ns_map*."""
    components = [c.split(':') for c in path.split('/')]
        # Step without a namespace prefix: keep it unchanged.
        replaced.append(c[0])
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Find *xpath* under *node*; on a miss return *default* when given,
    otherwise raise ExtractorError if *fatal* is set."""
    def _find_xpath(xpath):
        if sys.version_info < (2, 7):  # Crazy 2.6
            xpath = xpath.encode('ascii')
        return node.find(xpath)

    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)

    if default is not NO_DEFAULT:
    name = xpath if name is None else name
    raise ExtractorError('Could not find XML element %s' % name)
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Like xpath_element() but return the matched element's text."""
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
    if default is not NO_DEFAULT:
    name = xpath if name is None else name
    raise ExtractorError('Could not find XML element\'s text %s' % name)
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    """Return attribute *key* of the element matched by *xpath*, with the
    same default/fatal semantics as xpath_element()."""
    n = find_xpath_attr(node, xpath, key)
    if default is not NO_DEFAULT:
    name = '%s[@%s]' % (xpath, key) if name is None else name
    raise ExtractorError('Could not find XML attribute %s' % name)
def get_element_by_id(id, html):
    """Return the content of the HTML tag whose ``id`` attribute equals *id*."""
    # An element id is just its 'id' attribute; delegate to the generic search.
    attribute = "id"
    return get_element_by_attribute(attribute, id, html)
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document"""

    m = re.search(r'''(?xs)
        (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
        (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
    ''' % (re.escape(attribute), re.escape(value)), html)

    res = m.group('content')

    # The 'content' group may have captured surrounding quotes; strip them.
    if res.startswith('"') or res.startswith("'"):

    return unescapeHTML(res)
def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.

    # Normalise line breaks: literal newlines become spaces, while <br>
    # and paragraph boundaries become newlines.
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip any remaining tags.
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    if sys.platform == 'win32':
        # Put stdout in binary mode so byte output is not mangled.
        msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
    stream = open(encodeFilename(filename), open_mode)
    return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
        # An exception here should be caught in the caller
        stream = open(encodeFilename(alt_filename), open_mode)
        return (stream, alt_filename)
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""

    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        # Per-character policy: drop/replace unprintable or shell-hostile chars.
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return '' if restricted else '\''
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
        if restricted and ord(char) > 127:

    # Keep timestamps such as 12:34:56 readable by turning ':' into '_'.
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(map(replace_insane, s))
    # Collapse runs of underscores introduced by the replacements above.
    while '__' in result:
        result = result.replace('__', '_')
    result = result.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
    if result.startswith('-'):
        result = '_' + result[len('-'):]
    result = result.lstrip('.')
def sanitize_path(s):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform != 'win32':
    drive_or_unc, _ = os.path.splitdrive(s)
    if sys.version_info < (2, 7) and not drive_or_unc:
        drive_or_unc, _ = os.path.splitunc(s)
    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
        # Replace characters Windows forbids in path components, but keep
        # the '.'/'..' pseudo-components intact.
        path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    return os.path.join(*sanitized_path)
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
def _htmlentity_transform(entity):
    """Transforms an HTML entity to a character."""
    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Numeric character references: decimal (&#65;) or hex (&#x41;).
    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    numstr = mobj.group(1)
    if numstr.startswith('x'):
        numstr = '0%s' % numstr
    # See https://github.com/rg3/youtube-dl/issues/7518
    return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity
    # NOTE(review): fragment of unescapeHTML — expands &entity; references
    # through _htmlentity_transform; input must already be a unicode string.
    assert type(s) == compat_str
        r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
def get_subprocess_encoding():
    """Return the encoding to use when exchanging data with subprocesses."""
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
        encoding = sys.getfilesystemencoding()
def encodeFilename(s, for_subprocess=False):
    """Encode a unicode filename for the OS / subprocess APIs.

    @param s The name of the file
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):

    # Pass '' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
    return s.encode(get_subprocess_encoding(), 'ignore')
def decodeFilename(b, for_subprocess=False):
    """Inverse of encodeFilename(): decode an OS-level filename to unicode."""
    if sys.version_info >= (3, 0):
    if not isinstance(b, bytes):
    return b.decode(get_subprocess_encoding(), 'ignore')
def encodeArgument(s):
    """Encode *s* for passing to a subprocess (see encodeFilename)."""
    if isinstance(s, compat_str):
        return encodeFilename(s, True)
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    return encodeFilename(s.decode('ascii'), True)
def decodeArgument(b):
    """Decode a value that came from a subprocess call (see decodeFilename)."""
    for_subprocess = True
    return decodeFilename(b, for_subprocess)
def decodeOption(optval):
    """Decode a command-line option value to a unicode string."""
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
def formatSeconds(secs):
    """Format a duration in seconds as H:MM:SS or M:SS."""
    return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    return '%d:%02d' % (secs // 60, secs % 60)
def make_HTTPS_handler(params, **kwargs):
    """Build a YoutubeDLHTTPSHandler honouring the 'nocheckcertificate'
    option, with fallbacks for older Python ssl modules."""
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)

    # (create_default_context present but HTTPSHandler has no context=)
    if sys.version_info < (3, 2):
        return YoutubeDLHTTPSHandler(params, **kwargs)
    # Pre-3.4 path: build an SSLContext by hand.
    context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
    context.verify_mode = (ssl.CERT_NONE
                           if opts_no_check_certificate
                           else ssl.CERT_REQUIRED)
    context.set_default_verify_paths()
    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
def bug_reports_message():
    """Return the standard 'please report this issue' suffix for messages."""
    if ytdl_is_updateable():
        update_cmd = 'type youtube-dl -U to update'
        update_cmd = 'see https://yt-dl.org/update on how to update'
    msg = '; please report this issue on https://yt-dl.org/bug .'
    msg += ' Make sure you are using the latest version; %s.' % update_cmd
    msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """

        # Network-level failures are always treated as expected errors.
        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
        if video_id is not None:
            msg = video_id + ': ' + msg
            msg += ' (caused by %r)' % cause
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)
        self.exc_info = sys.exc_info()  # preserve original exception
        self.video_id = video_id

    def format_traceback(self):
        if self.traceback is None:
        return ''.join(traceback.format_tb(self.traceback))
class UnsupportedError(ExtractorError):
    """Raised when no extractor recognizes the given URL."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Both values are byte counts.
        self.downloaded = downloaded
        self.expected = expected
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Create an HTTP(S) connection, applying the handler's configured
    'source_address' and old-Python workarounds."""
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/rg3/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs[b'strict'] = True
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')
    if source_address is not None:
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+

            # Python 2.6 fallback: patch connect() to bind the socket to
            # the requested source address by hand.
            def _hc_connect(self, *args, **kwargs):
                sock = compat_socket_create_connection(
                    (self.host, self.port), self.timeout, sa)
                self.sock = ssl.wrap_socket(
                    sock, self.key_file, self.cert_file,
                    ssl_version=ssl.PROTOCOL_TLSv1)
            hc.connect = functools.partial(_hc_connect, hc)
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        return self.do_open(functools.partial(
            _create_http_connection, self, compat_http_client.HTTPConnection, False),

        # Fragment of the deflate() helper: try raw deflate first, then
        # zlib-wrapped deflate.
        return zlib.decompress(data, -zlib.MAX_WBITS)
        return zlib.decompress(data)

    def addinfourl_wrapper(stream, headers, url, code):
        # Wrap a response object so .getcode() also works on old urllibs.
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
                url_escaped, data=req.data, headers=req.headers,
                origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
            new_req.timeout = req.timeout

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

    def http_response(self, req, resp):
        # Transparently decompress gzip-encoded bodies.
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk add the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                    uncompressed = io.BytesIO(gz.read())
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # Transparently decompress deflate-encoded bodies.
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/rg3/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
            if sys.version_info >= (3, 0):
                location = location.encode('iso-8859-1').decode('utf-8')
            location_escaped = escape_url(location)
            if location != location_escaped:
                del resp.headers['Location']
                resp.headers['Location'] = location_escaped

    https_request = http_request
    https_response = http_response
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPSHandler supporting a custom connection class and SSL context."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        # Forward context / check_hostname only where the base class has them.
        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname
        return self.do_open(functools.partial(
            _create_http_connection, self, self._https_conn_class, True),
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """HTTPCookieProcessor wrapper used by youtube-dl."""

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #         if set_cookie != set_cookie_escaped:
        #             del response.headers[set_cookie_header]
        #             response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """

    # Drop fractional seconds; the strptime format below has no slot for them.
    date_str = re.sub(r'\.[0-9]+', '', date_str)
        r'(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        timezone = datetime.timedelta()
        date_str = date_str[:-len(m.group(0))]
        if not m.group('sign'):
            timezone = datetime.timedelta()
            sign = 1 if m.group('sign') == '+' else -1
            timezone = datetime.timedelta(
                hours=sign * int(m.group('hours')),
                minutes=sign * int(m.group('minutes')))
    date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    dt = datetime.datetime.strptime(date_str, date_format) - timezone
    return calendar.timegm(dt.timetuple())
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""

    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
        date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    format_expressions = [
        '%b %dst %Y %I:%M%p',
        '%b %dnd %Y %I:%M%p',
        '%b %dth %Y %I:%M%p',
        '%Y-%m-%d %H:%M:%S.%f',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S.%f',
    # Additional formats depend on the day_first flag (DD.MM vs MM/DD).
    format_expressions.extend([
    format_expressions.extend([
    # Try each format in turn until one parses.
    for expression in format_expressions:
        upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
    # Last resort: the RFC 2822 parser.
    if upload_date is None:
        timetuple = email.utils.parsedate_tz(date_str)
        upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    if upload_date is not None:
        return compat_str(upload_date)
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from *url*, falling back to *default_ext*."""
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
    # Known media extensions that may carry a trailing slash.
    elif guess.rstrip('/') in (
            'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
            'flv', 'f4v', 'f4a', 'f4b',
            'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
            'mkv', 'mka', 'mk3d',
            'f4f', 'f4m', 'm3u8', 'smil'):
        return guess.rstrip('/')
def subtitles_filename(filename, sub_lang, sub_format):
    """Build the subtitle file name for *filename*: strip its extension and
    append '.<language>.<format>'."""
    base = filename.rsplit('.', 1)[0]
    return '.'.join((base, sub_lang, sub_format))
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        unit = match.group('unit')
        # A bad aproximation?
        delta = datetime.timedelta(**{unit: time})
    # Plain YYYYMMDD date.
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is not None:
        return '-'.join(match.groups())
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        if start is not None:
            self.start = date_from_str(start)
            self.start = datetime.datetime.min.date()
            self.end = date_from_str(end)
            self.end = datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
def platform_name():
    """ Returns the platform name as a compat_str """
    res = platform.platform()
    if isinstance(res, bytes):
        res = res.decode(preferredencoding())

    assert isinstance(res, compat_str)
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes.wintypes

        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
    if fileno not in WIN_OUTPUT_IDS:

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b"GetStdHandle", ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b"GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # A real console is a local character device with a console mode.
        if handle == INVALID_HANDLE_VALUE or handle is None:
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):

    def next_nonbmp_pos(s):
        # Index of the first character outside the Basic Multilingual Plane.
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:

        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            assert written.value > 0
            s = s[written.value:]
def write_string(s, out=None, encoding=None):
    """Write unicode string *s* to stream *out*, coping with Windows
    consoles and byte-mode streams."""
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):

    if ('b' in getattr(out, 'mode', '') or
            sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        byt = s.encode(encoding or preferredencoding(), 'ignore')
    elif hasattr(out, 'buffer'):
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        byt = s.encode(enc, 'ignore')
        out.buffer.write(byt)
def bytes_to_intlist(bs):
    """Convert a bytes-like value into a list of integer byte values."""
    if isinstance(bs[0], int):  # Python 3
        return [ord(c) for c in bs]
def intlist_to_bytes(xs):
    """Pack a list of integer byte values into a bytes object."""
    return struct_pack('%dB' % len(xs), *xs)
# Cross-platform file locking
if sys.platform == 'win32':
    import ctypes.wintypes

    class OVERLAPPED(ctypes.Structure):
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high dwords of the byte count: lock the whole file.
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

    # POSIX: advisory locks via fcntl.flock.
    def _lock_file(f, exclusive):
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        fcntl.flock(f, fcntl.LOCK_UN)
class locked_file(object):
    """File wrapper that holds an advisory lock for the 'with' duration."""

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)

    def __enter__(self):
        # Writers take an exclusive lock; readers a shared one.
        exclusive = self.mode != 'r'
        _lock_file(self.f, exclusive)

    def __exit__(self, etype, value, traceback):
        _unlock_file(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), defaulting to 'utf-8' when the
    interpreter reports none."""
    reported = sys.getfilesystemencoding()
    if reported is None:
        return 'utf-8'
    return reported
def shell_quote(args):
    """Quote a sequence of arguments for display as a shell command line."""
    encoding = get_filesystem_encoding()
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(pipes.quote(a))
    return ' '.join(quoted_args)
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """

    # JSON-encode *data* and append it as a URL fragment.
    sdata = compat_urllib_parse.urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return url + '#' + sdata
def unsmuggle_url(smug_url, default=None):
    """Extract (url, data) previously packed by smuggle_url; data falls back to default."""
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    data = json.loads(jsond)
    # Without this return the function silently yielded None.
    return url, data
def format_bytes(bytes):
    """Render a byte count as a human-readable string ('1.00KiB'); None -> 'N/A'."""
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    if bytes == 0.0:
        # math.log(0) raises, so handle zero explicitly.
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
    suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
def parse_filesize(s):
    """Parse a human file size ('5 MB', '1KiB') into a byte count, or None."""
    if s is None:
        return None

    # The lower-case forms are of course incorrect and inofficial,
    # but we support those too
    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
        'KiB': 1024,
        'KB': 1000,
        'kB': 1024,
        'Kb': 1000,
        'MiB': 1024 ** 2,
        'MB': 1000 ** 2,
        'mB': 1024 ** 2,
        'Mb': 1000 ** 2,
        'GiB': 1024 ** 3,
        'GB': 1000 ** 3,
        'gB': 1024 ** 3,
        'Gb': 1000 ** 3,
        'TiB': 1024 ** 4,
        'TB': 1000 ** 4,
        'tB': 1024 ** 4,
        'Tb': 1000 ** 4,
        'PiB': 1024 ** 5,
        'PB': 1000 ** 5,
        'pB': 1024 ** 5,
        'Pb': 1000 ** 5,
        'EiB': 1024 ** 6,
        'EB': 1000 ** 6,
        'eB': 1024 ** 6,
        'Eb': 1000 ** 6,
        'ZiB': 1024 ** 7,
        'ZB': 1000 ** 7,
        'zB': 1024 ** 7,
        'Zb': 1000 ** 7,
        'YiB': 1024 ** 8,
        'YB': 1000 ** 8,
        'yB': 1024 ** 8,
        'Yb': 1000 ** 8,
    }

    units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
    if not m:
        return None

    # Accept ',' as a decimal separator too.
    num_str = m.group('num').replace(',', '.')
    mult = _UNIT_TABLE[m.group('unit')]
    return int(float(num_str) * mult)
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """
    try:
        return ENGLISH_MONTH_NAMES.index(name) + 1
    except ValueError:
        # Unknown month name -> None rather than a ValueError for callers.
        return None
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviation (e.g. 'Jan' -> 1) """
    try:
        return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
    except ValueError:
        return None
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML, leaving existing entities intact."""
    # Negative lookahead keeps already-escaped entities and numeric
    # character references untouched.
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
        '&amp;',
        xml_str)
def setproctitle(title):
    """Best-effort: set the process title via prctl(PR_SET_NAME) on Linux."""
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        # Not Linux / no glibc - silently skip.
        return
    title_bytes = title.encode('utf-8')
    # +1 for the terminating NUL; an exactly-sized buffer makes the
    # .value assignment below raise ValueError.
    buf = ctypes.create_string_buffer(len(title_bytes) + 1)
    buf.value = title_bytes
    try:
        # 15 == PR_SET_NAME
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
def remove_start(s, start):
    """Return s without the prefix `start` (unchanged if it is not a prefix)."""
    if s.startswith(start):
        return s[len(start):]
    # Previously fell through and returned None for non-matching input.
    return s
def remove_end(s, end):
    """Return s without the suffix `end` (unchanged if it is not a suffix)."""
    # `end` must be non-empty: s[:-0] would wrongly yield ''.
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
def url_basename(url):
    """Return the last path component of a URL (query/fragment excluded)."""
    parsed_path = compat_urlparse.urlparse(url).path
    segments = parsed_path.strip('/').split('/')
    return segments[-1]
class HEADRequest(compat_urllib_request.Request):
    """A Request that issues HEAD instead of GET."""
    def get_method(self):
        # Missing return made this yield None, i.e. a GET request.
        return 'HEAD'
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Coerce v to int (optionally via attribute get_attr and scaling); None/'' -> default."""
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    return int(v) * invscale // scale
def str_or_none(v, default=None):
    """Stringify v with compat_str; None passes through as default."""
    if v is None:
        return default
    return compat_str(v)
def str_to_int(int_str):
    """ A more relaxed version of int_or_none: strips ',', '.' and '+' first """
    if int_str is None:
        return None
    int_str = re.sub(r'[,\.\+]', '', int_str)
    return int(int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Coerce v to float with optional scaling; None -> default."""
    if v is None:
        return default
    return float(v) * invscale / scale
def parse_duration(s):
    """Parse a duration string ('3:45', '2h 3m', '10.5 mins', 'PT1H2M3S'-ish) into seconds."""
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    m = re.match(
        r'''(?ix)(?:P?T)?
        (?:
            (?P<only_mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*|
            (?P<only_hours>[0-9.]+)\s*(?:hours?)|

            \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?\.?|minutes?)\s*|
            (?:
                (?:
                    (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
                    (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
                )?
                (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
            )?
            (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
        )$''', s)
    if not m:
        return None
    res = 0
    # 'only_*' forms are fractional and returned directly.
    if m.group('only_mins'):
        return float_or_none(m.group('only_mins'), invscale=60)
    if m.group('only_hours'):
        return float_or_none(m.group('only_hours'), invscale=60 * 60)
    if m.group('secs'):
        res += int(m.group('secs'))
    if m.group('mins_reversed'):
        res += int(m.group('mins_reversed')) * 60
    if m.group('mins'):
        res += int(m.group('mins')) * 60
    if m.group('hours'):
        res += int(m.group('hours')) * 60 * 60
    if m.group('hours_reversed'):
        res += int(m.group('hours_reversed')) * 60 * 60
    if m.group('days'):
        res += int(m.group('days')) * 24 * 60 * 60
    if m.group('ms'):
        res += float(m.group('ms'))
    return res
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert `ext` before the real extension ('a.mp4' -> 'a.ext.mp4').

    If expected_real_ext is given and does not match, `ext` is appended instead.
    """
    name, real_ext = os.path.splitext(filename)
    return (
        '{0}.{1}{2}'.format(name, ext, real_ext)
        if not expected_real_ext or real_ext[1:] == expected_real_ext
        else '{0}.{1}'.format(filename, ext))
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the extension ('a.mp4' -> 'a.ext').

    If expected_real_ext is given and does not match, `ext` is appended instead.
    """
    name, real_ext = os.path.splitext(filename)
    return '{0}.{1}'.format(
        name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
        ext)
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    try:
        subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        # Binary not found / not executable.
        return False
    return exe
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    try:
        out, _ = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
    except OSError:
        return False
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from program output, or `unrecognized`."""
    assert isinstance(output, compat_str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
    if m:
        return m.group(1)
    else:
        return unrecognized
class PagedList(object):
    """Base class for lazily-paged sequences; subclasses implement getslice()."""
    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())
class OnDemandPagedList(PagedList):
    """PagedList that fetches pages on demand from pagefunc(pagenum)."""
    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offset of the first wanted item within this page.
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Offset just past the last wanted item within this page.
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
class InAdvancePagedList(PagedList):
    """PagedList whose total page count is known in advance."""
    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        res = []
        start_page = start // self._pagesize
        end_page = (
            self._pagecount if end is None else (end // self._pagesize + 1))
        # Items to drop from the first fetched page.
        skip_elems = start - start_page * self._pagesize
        # Total number of items still wanted (None = unbounded).
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
def uppercase_escape(s):
    """Decode \\UXXXXXXXX escape sequences embedded in s."""
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: unicode_escape(m.group(0))[0],
        s)
def lowercase_escape(s):
    """Decode \\uXXXX escape sequences embedded in s."""
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\u[0-9a-fA-F]{4}',
        lambda m: unicode_escape(m.group(0))[0],
        s)
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # On Python 2, quote() needs a byte string.
    if sys.version_info < (3, 0) and isinstance(s, compat_str):
        s = s.encode('utf-8')
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe_chars)
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    url_parsed = compat_urllib_parse_urlparse(url)
    # _replace + geturl rebuilds the URL with each component escaped;
    # without .geturl() a ParseResult (not a string) was returned.
    return url_parsed._replace(
        path=escape_rfc3986(url_parsed.path),
        params=escape_rfc3986(url_parsed.params),
        query=escape_rfc3986(url_parsed.query),
        fragment=escape_rfc3986(url_parsed.fragment)
    ).geturl()
# Some Python 2.6 builds (and certain 2.7 point releases) reject unicode
# format specs in struct; probe once at import time and wrap if needed.
try:
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.pack(spec, *args)

    def struct_unpack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.unpack(spec, *args)
else:
    struct_pack = struct.pack
    struct_unpack = struct.unpack
def read_batch_urls(batch_fd):
    """Read a batch file and return its non-comment, non-empty URLs."""
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = '\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        url = url.strip()
        # '#', ';' and ']' introduce comment lines.
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
def encode_dict(d, encoding='utf-8'):
    """Encode every string key and value of d to bytes with the given encoding."""
    def encode(v):
        return v.encode(encoding) if isinstance(v, compat_basestring) else v
    return dict((encode(k), encode(v)) for k, v in d.items())
def parse_age_limit(s):
    """Parse an age limit like '18+' or a US rating string into an int, or None."""
    if s is None:
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    return int(m.group('age')) if m else US_RATINGS.get(s, None)
def strip_jsonp(code):
    """Strip a JSONP callback wrapper ('cb({...});') down to the JSON payload."""
    return re.sub(
        r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
def js_to_json(code):
    """Convert a JavaScript object literal into parseable JSON."""
    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        if v.startswith('"'):
            v = re.sub(r"\\'", "'", v[1:-1])
        elif v.startswith("'"):
            v = v[1:-1]
            # Re-escape for double-quoted JSON strings.
            v = re.sub(r"\\\\|\\'|\"", lambda m: {
                '\\\\': '\\\\',
                "\\'": "'",
                '"': '\\"',
            }[m.group(0)], v)
        return '"%s"' % v

    res = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
        [a-zA-Z_][.a-zA-Z_0-9]*
        ''', fix_kv, code)
    # Drop trailing commas before closing brackets/braces.
    res = re.sub(r',(\s*[\]}])', lambda m: m.group(1), res)
    return res
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        try:
            return quality_ids.index(qid)
        except ValueError:
            # Unknown quality ids rank below everything else.
            return -1
    return q
# Default output filename template: "<title>-<id>.<ext>".
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) > length:
        return s[:length - len(ELLIPSES)] + ELLIPSES
    return s
def version_tuple(v):
    """Split a dot/dash separated version string into a tuple of ints."""
    parts = re.split(r'[-.]', v)
    return tuple(int(part) for part in parts)
def is_outdated_version(version, limit, assume_new=True):
    """Compare two version strings; unparseable input yields `not assume_new`."""
    if not version:
        return not assume_new
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return not assume_new
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter
    running_from_zip = isinstance(globals().get('__loader__'), zipimporter)
    frozen_build = hasattr(sys, 'frozen')
    return running_from_zip or frozen_build
def args_to_str(args):
    # Get a short string representation for a subprocess command
    quoted = [shlex_quote(arg) for arg in args]
    return ' '.join(quoted)
def mimetype2ext(mt):
    """Map a MIME type to a file extension; unknown subtypes pass through."""
    _, _, res = mt.rpartition('/')

    # NOTE(review): part of the mapping table appears to have been lost;
    # only 'x-mp4-fragmented' survives in the source — re-check against
    # upstream before relying on the other entries.
    return {
        '3gpp': '3gp',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-wmv': 'wmv',
    }.get(res, res)
def urlhandle_detect_ext(url_handle):
    """Guess the file extension for a response: filename from
    Content-Disposition first, then the Content-Type MIME type."""
    try:
        url_handle.headers
        getheader = lambda h: url_handle.headers[h]
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader

    cd = getheader('Content-Disposition')
    if cd:
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
        if m:
            e = determine_ext(m.group('filename'), default_ext=None)
            if e:
                return e

    return mimetype2ext(getheader('Content-Type'))
def encode_data_uri(data, mime_type):
    """Build an RFC 2397 data: URI from raw bytes and a MIME type."""
    b64 = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, b64)
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    if age_limit is None:  # No limit set
        return False
    if content_limit is None:
        return False  # Content available for everyone
    return age_limit < content_limit
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    # Longer BOMs must be checked before shorter prefixes of themselves
    # (utf-32-le before utf-16-le).
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            s = first_bytes[len(bom):].decode(enc, 'replace')
            break
    else:
        s = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', s)
def determine_protocol(info_dict):
    """Determine the download protocol for an info dict, from the explicit
    'protocol' field, the URL scheme, or the URL's file extension."""
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    if url.startswith('rtmp'):
        return 'rtmp'
    elif url.startswith('mms'):
        return 'mms'
    elif url.startswith('rtsp'):
        return 'rtsp'

    ext = determine_ext(url)
    if ext == 'm3u8':
        return 'm3u8'
    elif ext == 'f4m':
        return 'f4m'

    return compat_urllib_parse_urlparse(url).scheme
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    table = [header_row] + data
    widths = []
    for column in zip(*table):
        widths.append(max(len(compat_str(cell)) for cell in column))
    # Left-pad every column but the last to its widest cell plus one space.
    format_str = ' '.join('%-' + compat_str(w + 1) + 's' for w in widths[:-1]) + '%s'
    rendered_rows = [format_str % tuple(row) for row in table]
    return '\n'.join(rendered_rows)
1877 def _match_one(filter_part, dct):
1878 COMPARISON_OPERATORS = {
1886 operator_rex = re.compile(r'''(?x)\s*
1888 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
1890 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
1891 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
1894 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
1895 m = operator_rex.search(filter_part)
1897 op = COMPARISON_OPERATORS[m.group('op')]
1898 if m.group('strval') is not None:
1899 if m.group('op') not in ('=', '!='):
1901 'Operator %s does not support string values!' % m.group('op'))
1902 comparison_value = m.group('strval')
1905 comparison_value = int(m.group('intval'))
1907 comparison_value = parse_filesize(m.group('intval'))
1908 if comparison_value is None:
1909 comparison_value = parse_filesize(m.group('intval') + 'B')
1910 if comparison_value is None:
1912 'Invalid integer value %r in filter part %r' % (
1913 m.group('intval'), filter_part))
1914 actual_value = dct.get(m.group('key'))
1915 if actual_value is None:
1916 return m.group('none_inclusive')
1917 return op(actual_value, comparison_value)
1920 '': lambda v: v is not None,
1921 '!': lambda v: v is None,
1923 operator_rex = re.compile(r'''(?x)\s*
1924 (?P<op>%s)\s*(?P<key>[a-z_]+)
1926 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
1927 m = operator_rex.search(filter_part)
1929 op = UNARY_OPERATORS[m.group('op')]
1930 actual_value = dct.get(m.group('key'))
1931 return op(actual_value)
1933 raise ValueError('Invalid filter part %r' % filter_part)
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
    # '&' joins sub-filters; all of them must pass.
    return all(
        _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
def match_filter_func(filter_str):
    """Build a match-filter callable: returns None on pass, or a skip message."""
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        else:
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP/TTML time expression ('12.3s' or 'HH:MM:SS.f') into seconds."""
    if not time_expr:
        return

    mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if mobj:
        return float(mobj.group('time_offset'))

    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:\.\d+)?)$', time_expr)
    if mobj:
        return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3))
def srt_subtitles_timecode(seconds):
    """Format a second count as an SRT timecode: HH:MM:SS,mmm."""
    hrs = seconds / 3600
    mins = (seconds % 3600) / 60
    secs = seconds % 60
    msecs = (seconds % 1) * 1000
    return '%02d:%02d:%02d,%03d' % (hrs, mins, secs, msecs)
def dfxp2srt(dfxp_data):
    """Convert DFXP/TTML subtitle markup into SRT text."""
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
    })

    def parse_node(node):
        # Flatten a <p>/<span> subtree into plain text with newlines for <br>.
        str_or_empty = functools.partial(str_or_none, default='')

        out = str_or_empty(node.text)

        for child in node:
            if child.tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
                out += '\n' + str_or_empty(child.tail)
            elif child.tag in (_x('ttml:span'), _x('ttaf1:span'), 'span'):
                out += str_or_empty(parse_node(child))
            else:
                out += str_or_empty(xml.etree.ElementTree.tostring(child))

        return out

    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
    out = []
    # Try the TTML namespace, the older ttaf1 namespace, then no namespace.
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib['begin'])
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        if not end_time:
            # No 'end' attribute: derive it from 'dur'.
            end_time = begin_time + parse_dfxp_time_expr(para.attrib['dur'])
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
def cli_option(params, command_option, param):
    """Return [command_option, value] when params[param] is set, else []."""
    value = params.get(param)
    if value is None:
        return []
    return [command_option, value]
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Render a boolean param as CLI args; with separator, as one 'opt<sep>value' token."""
    param = params.get(param)
    assert isinstance(param, bool)
    if separator:
        return [command_option + separator + (true_value if param else false_value)]
    return [command_option, true_value if param else false_value]
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Emit [command_option] alone when params[param] equals expected_value."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
def cli_configuration_args(params, param, default=[]):
    """Return the extra-args list stored under `param`, or `default`.

    Note: `default` is returned as-is — callers must not mutate it.
    """
    ex_args = params.get(param)
    if ex_args is None:
        return default
    assert isinstance(ex_args, list)
    return ex_args
class ISO639Utils(object):
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    # NOTE(review): the original two-way language table appears to have been
    # lost; this is a partial restoration of common entries — rebuild the
    # full map from the reference above.
    _lang_map = {
        'de': 'deu',
        'en': 'eng',
        'es': 'spa',
        'fr': 'fra',
        'it': 'ita',
        'ja': 'jpn',
        'nl': 'nld',
        'pt': 'por',
        'ru': 'rus',
        'zh': 'zho',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        return cls._lang_map.get(code[:2])

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        for short_name, long_name in cls._lang_map.items():
            if long_name == code:
                return short_name
class ISO3166Utils(object):
    # From http://data.okfn.org/data/core/country-list
    # NOTE(review): many entries appear to have been lost; restore the
    # complete table from the country-list dataset above. 'AX' was also
    # mojibake ('Ã…land') and has been repaired.
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AS': 'American Samoa',
        'AG': 'Antigua and Barbuda',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BV': 'Bouvet Island',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BF': 'Burkina Faso',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CI': 'Côte d\'Ivoire',
        'CZ': 'Czech Republic',
        'DO': 'Dominican Republic',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GW': 'Guinea-Bissau',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'IR': 'Iran, Islamic Republic of',
        'IM': 'Isle of Man',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'LA': 'Lao People\'s Democratic Republic',
        'LI': 'Liechtenstein',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MH': 'Marshall Islands',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'PS': 'Palestine, State of',
        'PG': 'Papua New Guinea',
        'PH': 'Philippines',
        'PR': 'Puerto Rico',
        'RU': 'Russian Federation',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SL': 'Sierra Leone',
        'SX': 'Sint Maarten (Dutch part)',
        'SB': 'Solomon Islands',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'SJ': 'Svalbard and Jan Mayen',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TZ': 'Tanzania, United Republic of',
        'TL': 'Timor-Leste',
        'TT': 'Trinidad and Tobago',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
    }

    @classmethod
    def short2full(cls, code):
        """Convert an ISO 3166-2 country code to the corresponding full name"""
        return cls._country_map.get(code.upper())
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler honouring a per-request 'Ytdl-request-proxy' header."""

    def __init__(self, proxies=None):
        # Set default handlers so proxy_open is consulted even for schemes
        # with no configured proxy.
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            # Per-request proxy overrides the configured one; strip the
            # internal header before the request goes out.
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)