2 # -*- coding: utf-8 -*-
4 from __future__ import unicode_literals
34 import xml.etree.ElementTree
40 compat_etree_fromstring,
45 compat_socket_create_connection,
49 compat_urllib_parse_urlparse,
50 compat_urllib_request,
56 # This is not clearly defined otherwise
57 compiled_regex_type = type(re.compile(''))
60 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/44.0 (Chrome)',
61 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
62 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
63 'Accept-Encoding': 'gzip, deflate',
64 'Accept-Language': 'en-us,en;q=0.5',
# Full English month names, in calendar order (index 0 == January).
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']
75 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
76 'flv', 'f4v', 'f4a', 'f4b',
77 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
87 'f4f', 'f4m', 'm3u8', 'smil')
90 def preferredencoding():
91 """Get preferred encoding.
93 Returns the best encoding scheme for the system, based on
94 locale.getpreferredencoding() and some further tweaks.
97 pref = locale.getpreferredencoding()
105 def write_json_file(obj, fn):
106 """ Encode obj as JSON and write it to fn, atomically if possible """
108 fn = encodeFilename(fn)
109 if sys.version_info < (3, 0) and sys.platform != 'win32':
110 encoding = get_filesystem_encoding()
111 # os.path.basename returns a bytes object, but NamedTemporaryFile
112 # will fail if the filename contains non ascii characters unless we
113 # use a unicode object
114 path_basename = lambda f: os.path.basename(fn).decode(encoding)
115 # the same for os.path.dirname
116 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
118 path_basename = os.path.basename
119 path_dirname = os.path.dirname
123 'prefix': path_basename(fn) + '.',
124 'dir': path_dirname(fn),
128 # In Python 2.x, json.dump expects a bytestream.
129 # In Python 3.x, it writes to a character stream
130 if sys.version_info < (3, 0):
138 tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
143 if sys.platform == 'win32':
144 # Need to remove existing file on Windows, else os.rename raises
145 # WindowsError or FileExistsError.
150 os.rename(tf.name, fn)
159 if sys.version_info >= (2, 7):
160 def find_xpath_attr(node, xpath, key, val=None):
161 """ Find the xpath xpath[@key=val] """
162 assert re.match(r'^[a-zA-Z_-]+$', key)
164 assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
165 expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
166 return node.find(expr)
168 def find_xpath_attr(node, xpath, key, val=None):
169 # Here comes the crazy part: In 2.6, if the xpath is a unicode,
170 # .//node does not match if a node is a direct child of . !
171 if isinstance(xpath, compat_str):
172 xpath = xpath.encode('ascii')
174 for f in node.findall(xpath):
175 if key not in f.attrib:
177 if val is None or f.attrib.get(key) == val:
181 # On python2.6 the xml.etree.ElementTree.Element methods don't support
182 # the namespace parameter
185 def xpath_with_ns(path, ns_map):
186 components = [c.split(':') for c in path.split('/')]
190 replaced.append(c[0])
193 replaced.append('{%s}%s' % (ns_map[ns], tag))
194 return '/'.join(replaced)
197 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
198 def _find_xpath(xpath):
199 if sys.version_info < (2, 7): # Crazy 2.6
200 xpath = xpath.encode('ascii')
201 return node.find(xpath)
203 if isinstance(xpath, (str, compat_str)):
204 n = _find_xpath(xpath)
212 if default is not NO_DEFAULT:
215 name = xpath if name is None else name
216 raise ExtractorError('Could not find XML element %s' % name)
222 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
223 n = xpath_element(node, xpath, name, fatal=fatal, default=default)
224 if n is None or n == default:
227 if default is not NO_DEFAULT:
230 name = xpath if name is None else name
231 raise ExtractorError('Could not find XML element\'s text %s' % name)
237 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
238 n = find_xpath_attr(node, xpath, key)
240 if default is not NO_DEFAULT:
243 name = '%s[@%s]' % (xpath, key) if name is None else name
244 raise ExtractorError('Could not find XML attribute %s' % name)
def get_element_by_id(id, html):
    """Extract the inner content of the HTML element whose id attribute equals *id*."""
    # Delegates to the generic attribute matcher with the attribute fixed to 'id'.
    return get_element_by_attribute('id', id, html)
255 def get_element_by_attribute(attribute, value, html):
256 """Return the content of the tag with the specified attribute in the passed HTML document"""
258 m = re.search(r'''(?xs)
260 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
262 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
266 ''' % (re.escape(attribute), re.escape(value)), html)
270 res = m.group('content')
272 if res.startswith('"') or res.startswith("'"):
275 return unescapeHTML(res)
278 def clean_html(html):
279 """Clean an HTML snippet into a readable string"""
281 if html is None: # Convenience for sanitizing descriptions etc.
285 html = html.replace('\n', ' ')
286 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
287 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
289 html = re.sub('<.*?>', '', html)
290 # Replace html entities
291 html = unescapeHTML(html)
295 def sanitize_open(filename, open_mode):
296 """Try to open the given filename, and slightly tweak it if this fails.
298 Attempts to open the given filename. If this fails, it tries to change
299 the filename slightly, step by step, until it's either able to open it
300 or it fails and raises a final exception, like the standard open()
303 It returns the tuple (stream, definitive_file_name).
307 if sys.platform == 'win32':
309 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
310 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
311 stream = open(encodeFilename(filename), open_mode)
312 return (stream, filename)
313 except (IOError, OSError) as err:
314 if err.errno in (errno.EACCES,):
317 # In case of error, try to remove win32 forbidden chars
318 alt_filename = sanitize_path(filename)
319 if alt_filename == filename:
322 # An exception here should be caught in the caller
323 stream = open(encodeFilename(alt_filename), open_mode)
324 return (stream, alt_filename)
327 def timeconvert(timestr):
328 """Convert RFC 2822 defined time string into system timestamp"""
330 timetuple = email.utils.parsedate_tz(timestr)
331 if timetuple is not None:
332 timestamp = email.utils.mktime_tz(timetuple)
336 def sanitize_filename(s, restricted=False, is_id=False):
337 """Sanitizes a string so it could be used as part of a filename.
338 If restricted is set, use a stricter subset of allowed characters.
339 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
341 def replace_insane(char):
342 if char == '?' or ord(char) < 32 or ord(char) == 127:
345 return '' if restricted else '\''
347 return '_-' if restricted else ' -'
348 elif char in '\\/|*<>':
350 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
352 if restricted and ord(char) > 127:
357 s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
358 result = ''.join(map(replace_insane, s))
360 while '__' in result:
361 result = result.replace('__', '_')
362 result = result.strip('_')
363 # Common case of "Foreign band name - English song title"
364 if restricted and result.startswith('-_'):
366 if result.startswith('-'):
367 result = '_' + result[len('-'):]
368 result = result.lstrip('.')
374 def sanitize_path(s):
375 """Sanitizes and normalizes path on Windows"""
376 if sys.platform != 'win32':
378 drive_or_unc, _ = os.path.splitdrive(s)
379 if sys.version_info < (2, 7) and not drive_or_unc:
380 drive_or_unc, _ = os.path.splitunc(s)
381 norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
385 path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part)
386 for path_part in norm_path]
388 sanitized_path.insert(0, drive_or_unc + os.path.sep)
389 return os.path.join(*sanitized_path)
# Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
# unwanted failures due to missing protocol
def sanitized_Request(url, *args, **kwargs):
    """Build a compat_urllib_request.Request, defaulting scheme-less '//' URLs to http."""
    if url.startswith('//'):
        url = 'http:%s' % url
    return compat_urllib_request.Request(url, *args, **kwargs)
399 def orderedSet(iterable):
400 """ Remove all duplicates from the input iterable """
408 def _htmlentity_transform(entity):
409 """Transforms an HTML entity to a character."""
410 # Known non-numeric HTML entity
411 if entity in compat_html_entities.name2codepoint:
412 return compat_chr(compat_html_entities.name2codepoint[entity])
414 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
416 numstr = mobj.group(1)
417 if numstr.startswith('x'):
419 numstr = '0%s' % numstr
422 # See https://github.com/rg3/youtube-dl/issues/7518
424 return compat_chr(int(numstr, base))
428 # Unknown entity in name, return its literal representation
429 return '&%s;' % entity
435 assert type(s) == compat_str
438 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
441 def get_subprocess_encoding():
442 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
443 # For subprocess calls, encode with locale encoding
444 # Refer to http://stackoverflow.com/a/9951851/35070
445 encoding = preferredencoding()
447 encoding = sys.getfilesystemencoding()
453 def encodeFilename(s, for_subprocess=False):
455 @param s The name of the file
458 assert type(s) == compat_str
460 # Python 3 has a Unicode API
461 if sys.version_info >= (3, 0):
464 # Pass '' directly to use Unicode APIs on Windows 2000 and up
465 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
466 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
467 if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
470 # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
471 if sys.platform.startswith('java'):
474 return s.encode(get_subprocess_encoding(), 'ignore')
477 def decodeFilename(b, for_subprocess=False):
479 if sys.version_info >= (3, 0):
482 if not isinstance(b, bytes):
485 return b.decode(get_subprocess_encoding(), 'ignore')
def encodeArgument(s):
    """Encode a command-line argument the same way a filename is encoded for a subprocess."""
    # Legacy code that uses byte strings may still reach this point; decode
    # those first. TODO: once all post processors pass text, make this an
    # assertion instead:
    # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    arg = s if isinstance(s, compat_str) else s.decode('ascii')
    return encodeFilename(arg, True)
def decodeArgument(b):
    """Decode a subprocess argument back into text (inverse of encodeArgument)."""
    return decodeFilename(b, True)
501 def decodeOption(optval):
504 if isinstance(optval, bytes):
505 optval = optval.decode(preferredencoding())
507 assert isinstance(optval, compat_str)
511 def formatSeconds(secs):
513 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
515 return '%d:%02d' % (secs // 60, secs % 60)
520 def make_HTTPS_handler(params, **kwargs):
521 opts_no_check_certificate = params.get('nocheckcertificate', False)
522 if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9
523 context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
524 if opts_no_check_certificate:
525 context.check_hostname = False
526 context.verify_mode = ssl.CERT_NONE
528 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
531 # (create_default_context present but HTTPSHandler has no context=)
534 if sys.version_info < (3, 2):
535 return YoutubeDLHTTPSHandler(params, **kwargs)
537 context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
538 context.verify_mode = (ssl.CERT_NONE
539 if opts_no_check_certificate
540 else ssl.CERT_REQUIRED)
541 context.set_default_verify_paths()
542 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
545 def bug_reports_message():
546 if ytdl_is_updateable():
547 update_cmd = 'type youtube-dl -U to update'
549 update_cmd = 'see https://yt-dl.org/update on how to update'
550 msg = '; please report this issue on https://yt-dl.org/bug .'
551 msg += ' Make sure you are using the latest version; %s.' % update_cmd
552 msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
556 class ExtractorError(Exception):
557 """Error during info extraction."""
559 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
560 """ tb, if given, is the original traceback (so that it can be printed out).
561 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
564 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
566 if video_id is not None:
567 msg = video_id + ': ' + msg
569 msg += ' (caused by %r)' % cause
571 msg += bug_reports_message()
572 super(ExtractorError, self).__init__(msg)
575 self.exc_info = sys.exc_info() # preserve original exception
577 self.video_id = video_id
579 def format_traceback(self):
580 if self.traceback is None:
582 return ''.join(traceback.format_tb(self.traceback))
class UnsupportedError(ExtractorError):
    """Raised when no extractor is able to handle the given URL."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        # expected=True: this is a normal user-facing error, not a bug report case.
        super(UnsupportedError, self).__init__(message, expected=True)
592 class RegexNotFoundError(ExtractorError):
593 """Error when a regex didn't match"""
597 class DownloadError(Exception):
598 """Download Error exception.
600 This exception may be thrown by FileDownloader objects if they are not
601 configured to continue on errors. They will contain the appropriate
605 def __init__(self, msg, exc_info=None):
606 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
607 super(DownloadError, self).__init__(msg)
608 self.exc_info = exc_info
611 class SameFileError(Exception):
612 """Same File exception.
614 This exception will be thrown by FileDownloader objects if they detect
615 multiple files would have to be downloaded to the same file on disk.
620 class PostProcessingError(Exception):
621 """Post Processing exception.
623 This exception may be raised by PostProcessor's .run() method to
624 indicate an error in the postprocessing task.
627 def __init__(self, msg):
631 class MaxDownloadsReached(Exception):
632 """ --max-downloads limit has been reached. """
636 class UnavailableVideoError(Exception):
637 """Unavailable Format exception.
639 This exception will be thrown when a video is requested
640 in a format that is not available for that video.
645 class ContentTooShortError(Exception):
646 """Content Too Short exception.
648 This exception may be raised by FileDownloader objects when a file they
649 download is too small for what the server announced first, indicating
650 the connection was probably interrupted.
653 def __init__(self, downloaded, expected):
655 self.downloaded = downloaded
656 self.expected = expected
659 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
660 # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
661 # expected HTTP responses to meet HTTP/1.0 or later (see also
662 # https://github.com/rg3/youtube-dl/issues/6727)
663 if sys.version_info < (3, 0):
664 kwargs[b'strict'] = True
665 hc = http_class(*args, **kwargs)
666 source_address = ydl_handler._params.get('source_address')
667 if source_address is not None:
668 sa = (source_address, 0)
669 if hasattr(hc, 'source_address'): # Python 2.7+
670 hc.source_address = sa
672 def _hc_connect(self, *args, **kwargs):
673 sock = compat_socket_create_connection(
674 (self.host, self.port), self.timeout, sa)
676 self.sock = ssl.wrap_socket(
677 sock, self.key_file, self.cert_file,
678 ssl_version=ssl.PROTOCOL_TLSv1)
681 hc.connect = functools.partial(_hc_connect, hc)
def handle_youtubedl_headers(headers):
    """Apply internal Youtubedl-* pseudo headers to a header mapping.

    If the 'Youtubedl-no-compression' marker is present, return a new dict
    without any Accept-Encoding header (matched case-insensitively) and
    without the marker itself; otherwise return *headers* unchanged.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers
    # dict(generator) instead of a dict comprehension keeps 2.6 compatibility,
    # matching the style used elsewhere in this file.
    return dict(
        (name, value) for name, value in headers.items()
        if name.lower() != 'accept-encoding' and name != 'Youtubedl-no-compression')
696 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
697 """Handler for HTTP requests and responses.
699 This class, when installed with an OpenerDirector, automatically adds
700 the standard headers to every HTTP request and handles gzipped and
701 deflated responses from web servers. If compression is to be avoided in
702 a particular request, the original request in the program code only has
703 to include the HTTP header "Youtubedl-no-compression", which will be
704 removed before making the real request.
706 Part of this code was copied from:
708 http://techknack.net/python-urllib2-handlers/
710 Andrew Rowls, the author of that code, agreed to release it to the
714 def __init__(self, params, *args, **kwargs):
715 compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
716 self._params = params
718 def http_open(self, req):
719 return self.do_open(functools.partial(
720 _create_http_connection, self, compat_http_client.HTTPConnection, False),
726 return zlib.decompress(data, -zlib.MAX_WBITS)
728 return zlib.decompress(data)
731 def addinfourl_wrapper(stream, headers, url, code):
732 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
733 return compat_urllib_request.addinfourl(stream, headers, url, code)
734 ret = compat_urllib_request.addinfourl(stream, headers, url)
738 def http_request(self, req):
739 # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
740 # always respected by websites, some tend to give out URLs with non percent-encoded
741 # non-ASCII characters (see telemb.py, ard.py [#3412])
742 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
743 # To work around aforementioned issue we will replace request's original URL with
744 # percent-encoded one
745 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
746 # the code of this workaround has been moved here from YoutubeDL.urlopen()
747 url = req.get_full_url()
748 url_escaped = escape_url(url)
750 # Substitute URL if any change after escaping
751 if url != url_escaped:
752 req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
754 url_escaped, data=req.data, headers=req.headers,
755 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
756 new_req.timeout = req.timeout
759 for h, v in std_headers.items():
760 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
761 # The dict keys are capitalized because of this bug by urllib
762 if h.capitalize() not in req.headers:
765 req.headers = handle_youtubedl_headers(req.headers)
767 if sys.version_info < (2, 7) and '#' in req.get_full_url():
768 # Python 2.6 is brain-dead when it comes to fragments
769 req._Request__original = req._Request__original.partition('#')[0]
770 req._Request__r_type = req._Request__r_type.partition('#')[0]
774 def http_response(self, req, resp):
777 if resp.headers.get('Content-encoding', '') == 'gzip':
778 content = resp.read()
779 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
781 uncompressed = io.BytesIO(gz.read())
782 except IOError as original_ioerror:
783 # There may be junk add the end of the file
784 # See http://stackoverflow.com/q/4928560/35070 for details
785 for i in range(1, 1024):
787 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
788 uncompressed = io.BytesIO(gz.read())
793 raise original_ioerror
794 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
795 resp.msg = old_resp.msg
796 del resp.headers['Content-encoding']
798 if resp.headers.get('Content-encoding', '') == 'deflate':
799 gz = io.BytesIO(self.deflate(resp.read()))
800 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
801 resp.msg = old_resp.msg
802 del resp.headers['Content-encoding']
803 # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
804 # https://github.com/rg3/youtube-dl/issues/6457).
805 if 300 <= resp.code < 400:
806 location = resp.headers.get('Location')
808 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
809 if sys.version_info >= (3, 0):
810 location = location.encode('iso-8859-1').decode('utf-8')
811 location_escaped = escape_url(location)
812 if location != location_escaped:
813 del resp.headers['Location']
814 resp.headers['Location'] = location_escaped
817 https_request = http_request
818 https_response = http_response
821 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
822 def __init__(self, params, https_conn_class=None, *args, **kwargs):
823 compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
824 self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
825 self._params = params
827 def https_open(self, req):
829 if hasattr(self, '_context'): # python > 2.6
830 kwargs['context'] = self._context
831 if hasattr(self, '_check_hostname'): # python 3.x
832 kwargs['check_hostname'] = self._check_hostname
833 return self.do_open(functools.partial(
834 _create_http_connection, self, self._https_conn_class, True),
838 class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
839 def __init__(self, cookiejar=None):
840 compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)
842 def http_response(self, request, response):
843 # Python 2 will choke on next HTTP request in row if there are non-ASCII
844 # characters in Set-Cookie HTTP header of last response (see
845 # https://github.com/rg3/youtube-dl/issues/6769).
846 # In order to at least prevent crashing we will percent encode Set-Cookie
847 # header before HTTPCookieProcessor starts processing it.
848 # if sys.version_info < (3, 0) and response.headers:
849 # for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
850 # set_cookie = response.headers.get(set_cookie_header)
852 # set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
853 # if set_cookie != set_cookie_escaped:
854 # del response.headers[set_cookie_header]
855 # response.headers[set_cookie_header] = set_cookie_escaped
856 return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)
858 https_request = compat_urllib_request.HTTPCookieProcessor.http_request
859 https_response = http_response
862 def parse_iso8601(date_str, delimiter='T', timezone=None):
863 """ Return a UNIX timestamp from the given date """
868 date_str = re.sub(r'\.[0-9]+', '', date_str)
872 r'(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
875 timezone = datetime.timedelta()
877 date_str = date_str[:-len(m.group(0))]
878 if not m.group('sign'):
879 timezone = datetime.timedelta()
881 sign = 1 if m.group('sign') == '+' else -1
882 timezone = datetime.timedelta(
883 hours=sign * int(m.group('hours')),
884 minutes=sign * int(m.group('minutes')))
886 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
887 dt = datetime.datetime.strptime(date_str, date_format) - timezone
888 return calendar.timegm(dt.timetuple())
893 def unified_strdate(date_str, day_first=True):
894 """Return a string with the date in the format YYYYMMDD"""
900 date_str = date_str.replace(',', ' ')
901 # %z (UTC offset) is only supported in python>=3.2
902 if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
903 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
904 # Remove AM/PM + timezone
905 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
907 format_expressions = [
912 '%b %dst %Y %I:%M%p',
913 '%b %dnd %Y %I:%M%p',
914 '%b %dth %Y %I:%M%p',
920 '%Y-%m-%d %H:%M:%S.%f',
923 '%Y-%m-%dT%H:%M:%SZ',
924 '%Y-%m-%dT%H:%M:%S.%fZ',
925 '%Y-%m-%dT%H:%M:%S.%f0Z',
927 '%Y-%m-%dT%H:%M:%S.%f',
931 format_expressions.extend([
939 format_expressions.extend([
946 for expression in format_expressions:
948 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
951 if upload_date is None:
952 timetuple = email.utils.parsedate_tz(date_str)
954 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
955 if upload_date is not None:
956 return compat_str(upload_date)
959 def determine_ext(url, default_ext='unknown_video'):
962 guess = url.partition('?')[0].rpartition('.')[2]
963 if re.match(r'^[A-Za-z0-9]+$', guess):
965 # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
966 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
967 return guess.rstrip('/')
def subtitles_filename(filename, sub_lang, sub_format):
    """Derive a subtitle filename: strip the last extension, append language and format."""
    base = filename.rsplit('.', 1)[0]
    return '.'.join((base, sub_lang, sub_format))
976 def date_from_str(date_str):
978 Return a datetime object from a string in the format YYYYMMDD or
979 (now|today)[+-][0-9](day|week|month|year)(s)?"""
980 today = datetime.date.today()
981 if date_str in ('now', 'today'):
983 if date_str == 'yesterday':
984 return today - datetime.timedelta(days=1)
985 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
986 if match is not None:
987 sign = match.group('sign')
988 time = int(match.group('time'))
991 unit = match.group('unit')
992 # A bad approximation?
1000 delta = datetime.timedelta(**{unit: time})
1001 return today + delta
1002 return datetime.datetime.strptime(date_str, '%Y%m%d').date()
1005 def hyphenate_date(date_str):
1007 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1008 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1009 if match is not None:
1010 return '-'.join(match.groups())
1015 class DateRange(object):
1016 """Represents a time interval between two dates"""
1018 def __init__(self, start=None, end=None):
1019 """start and end must be strings in the format accepted by date"""
1020 if start is not None:
1021 self.start = date_from_str(start)
1023 self.start = datetime.datetime.min.date()
1025 self.end = date_from_str(end)
1027 self.end = datetime.datetime.max.date()
1028 if self.start > self.end:
1029 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
1033 """Returns a range that only contains the given day"""
1034 return cls(day, day)
1036 def __contains__(self, date):
1037 """Check if the date is in the range"""
1038 if not isinstance(date, datetime.date):
1039 date = date_from_str(date)
1040 return self.start <= date <= self.end
1043 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
1046 def platform_name():
1047 """ Returns the platform name as a compat_str """
1048 res = platform.platform()
1049 if isinstance(res, bytes):
1050 res = res.decode(preferredencoding())
1052 assert isinstance(res, compat_str)
1056 def _windows_write_string(s, out):
1057 """ Returns True if the string was written using special methods,
1058 False if it has yet to be written out."""
1059 # Adapted from http://stackoverflow.com/a/3259271/35070
1062 import ctypes.wintypes
1070 fileno = out.fileno()
1071 except AttributeError:
1072 # If the output stream doesn't have a fileno, it's virtual
1074 except io.UnsupportedOperation:
1075 # Some strange Windows pseudo files?
1077 if fileno not in WIN_OUTPUT_IDS:
1080 GetStdHandle = ctypes.WINFUNCTYPE(
1081 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
1082 (b'GetStdHandle', ctypes.windll.kernel32))
1083 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
1085 WriteConsoleW = ctypes.WINFUNCTYPE(
1086 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
1087 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
1088 ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
1089 written = ctypes.wintypes.DWORD(0)
1091 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
1092 FILE_TYPE_CHAR = 0x0002
1093 FILE_TYPE_REMOTE = 0x8000
1094 GetConsoleMode = ctypes.WINFUNCTYPE(
1095 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
1096 ctypes.POINTER(ctypes.wintypes.DWORD))(
1097 (b'GetConsoleMode', ctypes.windll.kernel32))
1098 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
1100 def not_a_console(handle):
1101 if handle == INVALID_HANDLE_VALUE or handle is None:
1103 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
1104 GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
1106 if not_a_console(h):
1109 def next_nonbmp_pos(s):
1111 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
1112 except StopIteration:
1116 count = min(next_nonbmp_pos(s), 1024)
1118 ret = WriteConsoleW(
1119 h, s, count if count else 2, ctypes.byref(written), None)
1121 raise OSError('Failed to write string')
1122 if not count: # We just wrote a non-BMP character
1123 assert written.value == 2
1126 assert written.value > 0
1127 s = s[written.value:]
1131 def write_string(s, out=None, encoding=None):
1134 assert type(s) == compat_str
1136 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
1137 if _windows_write_string(s, out):
1140 if ('b' in getattr(out, 'mode', '') or
1141 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
1142 byt = s.encode(encoding or preferredencoding(), 'ignore')
1144 elif hasattr(out, 'buffer'):
1145 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1146 byt = s.encode(enc, 'ignore')
1147 out.buffer.write(byt)
1153 def bytes_to_intlist(bs):
1156 if isinstance(bs[0], int): # Python 3
1159 return [ord(c) for c in bs]
1162 def intlist_to_bytes(xs):
1165 return struct_pack('%dB' % len(xs), *xs)
1168 # Cross-platform file locking
1169 if sys.platform == 'win32':
1170 import ctypes.wintypes
1173 class OVERLAPPED(ctypes.Structure):
1175 ('Internal', ctypes.wintypes.LPVOID),
1176 ('InternalHigh', ctypes.wintypes.LPVOID),
1177 ('Offset', ctypes.wintypes.DWORD),
1178 ('OffsetHigh', ctypes.wintypes.DWORD),
1179 ('hEvent', ctypes.wintypes.HANDLE),
1182 kernel32 = ctypes.windll.kernel32
1183 LockFileEx = kernel32.LockFileEx
1184 LockFileEx.argtypes = [
1185 ctypes.wintypes.HANDLE, # hFile
1186 ctypes.wintypes.DWORD, # dwFlags
1187 ctypes.wintypes.DWORD, # dwReserved
1188 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1189 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1190 ctypes.POINTER(OVERLAPPED) # Overlapped
1192 LockFileEx.restype = ctypes.wintypes.BOOL
1193 UnlockFileEx = kernel32.UnlockFileEx
1194 UnlockFileEx.argtypes = [
1195 ctypes.wintypes.HANDLE, # hFile
1196 ctypes.wintypes.DWORD, # dwReserved
1197 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1198 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1199 ctypes.POINTER(OVERLAPPED) # Overlapped
1201 UnlockFileEx.restype = ctypes.wintypes.BOOL
1202 whole_low = 0xffffffff
1203 whole_high = 0x7fffffff
1205 def _lock_file(f, exclusive):
1206 overlapped = OVERLAPPED()
1207 overlapped.Offset = 0
1208 overlapped.OffsetHigh = 0
1209 overlapped.hEvent = 0
1210 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1211 handle = msvcrt.get_osfhandle(f.fileno())
1212 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
1213 whole_low, whole_high, f._lock_file_overlapped_p):
1214 raise OSError('Locking file failed: %r' % ctypes.FormatError())
1216 def _unlock_file(f):
1217 assert f._lock_file_overlapped_p
1218 handle = msvcrt.get_osfhandle(f.fileno())
1219 if not UnlockFileEx(handle, 0,
1220 whole_low, whole_high, f._lock_file_overlapped_p):
1221 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1224 # Some platforms, such as Jython, is missing fcntl
1228 def _lock_file(f, exclusive):
1229 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1231 def _unlock_file(f):
1232 fcntl.flock(f, fcntl.LOCK_UN)
1234 UNSUPPORTED_MSG = 'file locking is not supported on this platform'
1236 def _lock_file(f, exclusive):
1237 raise IOError(UNSUPPORTED_MSG)
1239 def _unlock_file(f):
1240 raise IOError(UNSUPPORTED_MSG)
class locked_file(object):
    """File wrapper holding an advisory lock for the duration of a
    `with` block (exclusive for 'a'/'w', shared for 'r').

    NOTE(review): several lines of this class are elided from this
    excerpt (e.g. the `self.mode` assignment read by __enter__, the
    `return self` in __enter__, and unlock/close error handling in
    __exit__) — confirm against the full source.
    """

    def __init__(self, filename, mode, encoding=None):
        # Only plain read/append/write modes are supported.
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)

    def __enter__(self):
        # Writers take an exclusive lock; readers can share.
        exclusive = self.mode != 'r'
        _lock_file(self.f, exclusive)

    def __exit__(self, etype, value, traceback):
        _unlock_file(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
def get_filesystem_encoding():
    """Return the OS filesystem encoding, falling back to 'utf-8' when
    the interpreter reports none."""
    enc = sys.getfilesystemencoding()
    if enc is None:
        return 'utf-8'
    return enc
def shell_quote(args):
    """Join *args* into a single shell-escaped command line string.

    NOTE(review): the `quoted_args = []` initialisation and the
    `for a in args:` loop header are elided from this excerpt.
    """
    encoding = get_filesystem_encoding()
    if isinstance(a, bytes):
        # We may get a filename encoded with 'encodeFilename'
        a = a.decode(encoding)
    quoted_args.append(pipes.quote(a))
    return ' '.join(quoted_args)
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """

    payload = {'__youtubedl_smuggle': json.dumps(data)}
    sdata = compat_urllib_parse.urlencode(payload)
    return '%s#%s' % (url, sdata)
def unsmuggle_url(smug_url, default=None):
    """Reverse of smuggle_url(): return (url, data), or
    (smug_url, default) when no smuggled payload is present."""
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    # The payload always lives after the last '#'.
    url, _, sdata = smug_url.rpartition('#')
    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    data = json.loads(jsond)
    # Fix: the function must return the stripped URL with the decoded data.
    return url, data
def format_bytes(bytes):
    """Format a byte count as a human-readable string, e.g. '1.00KiB'.

    Accepts an int/float or a numeric str; returns 'N/A' for None.
    """
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    if bytes == 0.0:
        # log(0) is undefined; treat zero as plain bytes.
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
    suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
def parse_filesize(s):
    """Parse a human file size like '5 MB' or '1.2GiB' into a byte count
    (int), or None when it does not match.

    NOTE(review): the None-guard, the _UNIT_TABLE mapping of unit names
    to multipliers, and the `m = re.match(...)`/no-match handling are
    elided from this excerpt.
    """
    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
    r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
    # European decimal commas are normalised to dots before float().
    num_str = m.group('num').replace(',', '.')
    mult = _UNIT_TABLE[m.group('unit')]
    return int(float(num_str) * mult)
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """
    try:
        return ENGLISH_MONTH_NAMES.index(name) + 1
    except ValueError:
        # Fix: unknown month names must yield None, not raise.
        return None
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations (e.g. 'Jan' -> 1) """
    try:
        return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
    except ValueError:
        # Fix: unknown abbreviations must yield None, not raise.
        return None
def fix_xml_ampersands(xml_str):
    """Replace every free-standing '&' by '&amp;' in XML, leaving
    existing entity and character references untouched."""
    # Negative lookahead skips '&' that already starts a known entity or
    # a numeric character reference.
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
        '&amp;',
        xml_str)
def setproctitle(title):
    """Best-effort: set the process name via prctl(PR_SET_NAME) on Linux.

    NOTE(review): the early `return` of the Jython branch, the try/except
    OSError around loading libc, and the trailing blank scaffolding are
    elided from this excerpt.
    """
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
    libc = ctypes.cdll.LoadLibrary('libc.so.6')
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
    # 15 == PR_SET_NAME
    libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
def remove_start(s, start):
    """Return s with the prefix *start* removed (s unchanged if absent)."""
    if s.startswith(start):
        return s[len(start):]
    # Fix: fall through to the original string instead of returning None.
    return s
def remove_end(s, end):
    """Return s with the suffix *end* removed (s unchanged if absent)."""
    # `end and` guards the empty-suffix case, where s[:-0] would wrongly
    # yield ''; also restores the missing fallthrough return.
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
def remove_quotes(s):
    """Strip one pair of matching single or double quotes from s; None
    and strings shorter than 2 characters pass through unchanged."""
    if s is None or len(s) < 2:
        return s
    for quote in ('"', "'", ):
        if s[0] == quote and s[-1] == quote:
            return s[1:-1]
    # Fix: unquoted input must be returned as-is, not dropped.
    return s
def url_basename(url):
    """Return the final path segment of *url*, ignoring query/fragment
    and trailing slashes."""
    parsed_path = compat_urlparse.urlparse(url).path
    segments = parsed_path.strip('/').split('/')
    return segments[-1]
class HEADRequest(compat_urllib_request.Request):
    """A urllib Request that issues HEAD instead of GET."""
    def get_method(self):
        # Fix: the method name must actually be returned.
        return 'HEAD'
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Coerce v to int scaled by invscale/scale; return *default* for
    None, '' or unparseable input. When get_attr is set, that attribute
    is read off v first."""
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    # Treat empty strings like missing values.
    if v == '':
        v = None
    if v is None:
        return default
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError):
        # Robustness: garbage input degrades to the default.
        return default
def str_or_none(v, default=None):
    """Return v as compat_str, or *default* when v is None."""
    if v is None:
        return default
    return compat_str(v)
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if int_str is None:
        return None
    # Drop thousands separators and stray '+' signs before parsing.
    int_str = re.sub(r'[,\.\+]', '', int_str)
    return int(int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Coerce v to float scaled by invscale/scale; return *default* for
    None or unparseable input."""
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        # Robustness: garbage input degrades to the default.
        return default
def parse_duration(s):
    """Parse a duration string ('1:23:45', '3 min', '90s', ...) into a
    number of seconds, or None when unrecognised.

    NOTE(review): the non-string early return, the `re.match(...)`
    wrapper around the pattern fragments below, the no-match early
    return, `res = 0`, several guard lines and the final `return res`
    are elided from this excerpt.
    """
    if not isinstance(s, compat_basestring):
    (?P<only_mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*|
    (?P<only_hours>[0-9.]+)\s*(?:hours?)|
    \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?\.?|minutes?)\s*|
    (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
    (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
    (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
    if m.group('only_mins'):
        return float_or_none(m.group('only_mins'), invscale=60)
    if m.group('only_hours'):
        return float_or_none(m.group('only_hours'), invscale=60 * 60)
    res += int(m.group('secs'))
    if m.group('mins_reversed'):
        res += int(m.group('mins_reversed')) * 60
    res += int(m.group('mins')) * 60
    if m.group('hours'):
        res += int(m.group('hours')) * 60 * 60
    if m.group('hours_reversed'):
        res += int(m.group('hours_reversed')) * 60 * 60
    res += int(m.group('days')) * 24 * 60 * 60
    res += float(m.group('ms'))
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert *ext* before the real extension ('v.mp4' -> 'v.temp.mp4');
    if the real extension differs from expected_real_ext, append instead."""
    name, real_ext = os.path.splitext(filename)
    # Fix: restore the `return (` dropped from this excerpt.
    return (
        '{0}.{1}{2}'.format(name, ext, real_ext)
        if not expected_real_ext or real_ext[1:] == expected_real_ext
        else '{0}.{1}'.format(filename, ext))
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the filename's extension with *ext*; when the current
    extension differs from expected_real_ext, append instead."""
    name, real_ext = os.path.splitext(filename)
    # Fix: restore the trailing `ext)` argument dropped from this excerpt.
    return '{0}.{1}'.format(
        name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
        ext)
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    # NOTE: args is never mutated, so the mutable default is harmless here.
    try:
        subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        # Binary missing or not executable.
        return False
    return exe
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    # NOTE(review): the try/except OSError (-> return False) around this
    # Popen call is elided from this excerpt.
    out, _ = subprocess.Popen(
        [encodeArgument(exe)] + args,
        stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version token from program output using version_re
    (default: `version <token>`); return *unrecognized* when absent."""
    assert isinstance(output, compat_str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
    # Fix: restore the result handling dropped from this excerpt.
    if m:
        return m.group(1)
    else:
        return unrecognized
class PagedList(object):
    """Abstract lazily-paged sequence; subclasses implement getslice().

    NOTE(review): the `def __len__(self):` header line is elided from
    this excerpt.
    """
        # This is only useful for tests
        return len(self.getslice())
class OnDemandPagedList(PagedList):
    """PagedList that fetches each page on demand via pagefunc(pagenum).

    NOTE(review): several lines of getslice() are elided from this
    excerpt — `res = []`, the `startv = (`/`endv = (` assignment openers
    and their `else ...)` closers, the loop-control `continue`/`break`
    statements, and the final `return res`.
    """
    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:

            page_results = list(self._pagefunc(pagenum))

                start % self._pagesize
                if firstid <= start < nextfirstid

                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
class InAdvancePagedList(PagedList):
    """PagedList with a known page count; getslice() walks the page
    range eagerly.

    NOTE(review): several lines of getslice() are elided from this
    excerpt — `res = []`, the `end_page = (` assignment opener, the
    skip_elems reset, an `else:`/`break` pair and the final `return res`.
    """
    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        start_page = start // self._pagesize
            self._pagecount if end is None else (end // self._pagesize + 1))
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
                page = page[skip_elems:]
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                    page = page[:only_more]
def uppercase_escape(s):
    """Decode \\UXXXXXXXX escapes in s into the characters they denote."""
    unicode_escape = codecs.getdecoder('unicode_escape')
    # Fix: restore the re.sub wrapper dropped from this excerpt.
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: unicode_escape(m.group(0))[0],
        s)
def lowercase_escape(s):
    """Decode \\uXXXX escapes in s into the characters they denote."""
    unicode_escape = codecs.getdecoder('unicode_escape')
    # Fix: restore the re.sub wrapper dropped from this excerpt.
    return re.sub(
        r'\\u[0-9a-fA-F]{4}',
        lambda m: unicode_escape(m.group(0))[0],
        s)
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # Python 2 quote() chokes on unicode input, so pre-encode it there.
    if sys.version_info < (3, 0) and isinstance(s, compat_str):
        s = s.encode('utf-8')
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe_chars)
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    url_parsed = compat_urllib_parse_urlparse(url)
    # Fix: restore the `).geturl()` call dropped from this excerpt so a
    # string, not a ParseResult, is returned.
    return url_parsed._replace(
        path=escape_rfc3986(url_parsed.path),
        params=escape_rfc3986(url_parsed.params),
        query=escape_rfc3986(url_parsed.query),
        fragment=escape_rfc3986(url_parsed.fragment)
    ).geturl()
# Feature-probe: some Python 2.6 builds require a bytes format string for
# struct.pack/unpack.
# NOTE(review): the try/except scaffolding around this probe and the
# else-branch structure are elided from this excerpt.
    struct.pack('!I', 0)
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.pack(spec, *args)

    def struct_unpack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.unpack(spec, *args)

    # Modern interpreters: use struct directly.
    struct_pack = struct.pack
    struct_unpack = struct.unpack
def read_batch_urls(batch_fd):
    """Read URLs from a batch file object: decode bytes as UTF-8, strip
    a UTF-8 BOM and surrounding whitespace, skip comment lines starting
    with '#', ';' or ']'. Closes batch_fd when done."""
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = '\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            # Comment line: filtered out by the falsy check below.
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
def encode_dict(d, encoding='utf-8'):
    """Return a copy of d with all string keys and values encoded to
    bytes using *encoding*; non-string items pass through unchanged."""
    # Fix: restore the `def encode(v):` header dropped from this excerpt.
    def encode(v):
        return v.encode(encoding) if isinstance(v, compat_basestring) else v
    return dict((encode(k), encode(v)) for k, v in d.items())
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Look up one key, or the first usable key out of a list/tuple of
    keys, in d; skip None (and, unless skip_false_values is False,
    falsy) values; return *default* when nothing matches."""
    if isinstance(key_or_keys, (list, tuple)):
        for key in key_or_keys:
            if key not in d or d[key] is None or skip_false_values and not d[key]:
                continue
            return d[key]
        return default
    return d.get(key_or_keys, default)
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return *string* as compat_str, decoding it with *encoding* when
    it is not one already."""
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
def parse_age_limit(s):
    """Parse an age limit like '18' or '18+' into an int, falling back
    to the US_RATINGS table (e.g. 'R'); None for None input/no match."""
    if s is None:
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    return int(m.group('age')) if m else US_RATINGS.get(s)
def strip_jsonp(code):
    """Strip a JSONP wrapper `callback(...)` and return the inner JSON."""
    # Fix: restore the `return re.sub(` dropped from this excerpt.
    return re.sub(
        r'(?s)^[a-zA-Z0-9_.]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
def js_to_json(code):
    """Convert JavaScript-ish object literal source into valid JSON.

    NOTE(review): the `def fix_kv(m):` helper header, several return
    statements, the escape-mapping dict body and the closing of both
    re.sub calls are elided from this excerpt; only fragments remain.
    """
    if v in ('true', 'false', 'null'):
    if v.startswith('"'):
        v = re.sub(r"\\'", "'", v[1:-1])
    elif v.startswith("'"):
        v = re.sub(r"\\\\|\\'|\"", lambda m: {
    res = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
        [a-zA-Z_][.a-zA-Z_0-9]*
    # Drop trailing commas before a closing bracket/brace.
    res = re.sub(r',(\s*[\]}])', lambda m: m.group(1), res)
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        try:
            return quality_ids.index(qid)
        except ValueError:
            # Unknown quality sorts below all known ones.
            return -1
    return q
# Default output filename template: '<title>-<id>.<ext>'.
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) > length:
        # Truncate so the result, ellipses included, fits in *length*.
        return s[:length - len(ELLIPSES)] + ELLIPSES
    return s
def version_tuple(v):
    """Turn a version string like '2016.01.01-3' into a tuple of ints."""
    parts = re.split(r'[-.]', v)
    return tuple(map(int, parts))
def is_outdated_version(version, limit, assume_new=True):
    """Return True when *version* is older than *limit*; for a missing
    or unparseable version, assume new (False) unless assume_new is
    False."""
    if not version:
        return not assume_new
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return not assume_new
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # Updatable when running from a zip bundle or a frozen executable.
    if isinstance(globals().get('__loader__'), zipimporter):
        return True
    return hasattr(sys, 'frozen')
def args_to_str(args):
    # Get a short string representation for a subprocess command
    quoted = [shlex_quote(a) for a in args]
    return ' '.join(quoted)
def error_to_compat_str(err):
    """Stringify an exception, decoding Python 2 byte messages with the
    locale's preferred encoding."""
    err_str = str(err)
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    if sys.version_info[0] < 3:
        err_str = err_str.decode(preferredencoding())
    return err_str
def mimetype2ext(mt):
    """Map a MIME type to a file extension.

    NOTE(review): the None-guard, the full subtype->extension mapping
    dict and the final lookup/return are elided from this excerpt; only
    fragments remain below.
    """
    _, _, res = mt.rpartition('/')
        'x-mp4-fragmented': 'mp4',
def urlhandle_detect_ext(url_handle):
    """Guess a file extension for a response: prefer the
    Content-Disposition filename, else fall back to the Content-Type.

    NOTE(review): the `try:` above the headers access and the `if cd:` /
    `if m:` / `if e:` guards around the lines below are elided from this
    excerpt.
    """
        getheader = lambda h: url_handle.headers[h]
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader

    cd = getheader('Content-Disposition')
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
            e = determine_ext(m.group('filename'), default_ext=None)

    return mimetype2ext(getheader('Content-Type'))
def encode_data_uri(data, mime_type):
    """Build an RFC 2397 data: URI embedding *data* as base64."""
    b64 = base64.b64encode(data).decode('ascii')
    return 'data:{0};base64,{1}'.format(mime_type, b64)
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """

    if age_limit is None:  # No limit set
        # Fix: restore the `return False` dropped from this excerpt.
        return False
    if content_limit is None:
        return False  # Content available for everyone
    return age_limit < content_limit
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    # NOTE(review): the `BOMS = [` opener, the list terminator, and the
    # `break`/`else:` of the BOM loop are elided from this excerpt.
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            s = first_bytes[len(bom):].decode(enc, 'replace')
        s = first_bytes.decode('utf-8', 'replace')

    # HTML iff the decoded text starts (after whitespace) with '<'.
    return re.match(r'^\s*<', s)
def determine_protocol(info_dict):
    """Infer the download protocol for an info dict, preferring an
    explicit 'protocol' key, then the URL scheme/extension.

    NOTE(review): several `return` lines (explicit protocol, 'rtmp',
    'mms', 'rtsp', and the extension-based 'f4m'/'m3u8' returns) are
    elided from this excerpt.
    """
    protocol = info_dict.get('protocol')
    if protocol is not None:

    url = info_dict['url']
    if url.startswith('rtmp'):
    elif url.startswith('mms'):
    elif url.startswith('rtsp'):

    ext = determine_ext(url)

    return compat_urllib_parse_urlparse(url).scheme
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    rows = [header_row] + data
    # Column width = widest cell in that column.
    widths = [max(len(compat_str(cell)) for cell in column) for column in zip(*rows)]
    fmt = ' '.join('%-' + compat_str(w + 1) + 's' for w in widths[:-1]) + '%s'
    rendered = [fmt % tuple(row) for row in rows]
    return '\n'.join(rendered)
def _match_one(filter_part, dct):
    """Evaluate one '<key><op><value>' or unary '<op><key>' filter part
    against dct.

    NOTE(review): parts of both regexes (the key group and pattern
    close), the COMPARISON_OPERATORS dict body, `if m:` guards, the
    try/except around int() and the raise for bad string operators are
    elided from this excerpt.
    """
    COMPARISON_OPERATORS = {
    operator_rex = re.compile(r'''(?x)\s*
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
        (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
        op = COMPARISON_OPERATORS[m.group('op')]
        if m.group('strval') is not None:
            if m.group('op') not in ('=', '!='):
                    'Operator %s does not support string values!' % m.group('op'))
            comparison_value = m.group('strval')
                comparison_value = int(m.group('intval'))
                # Fall back to parsing the value as a file size ('500KiB').
                comparison_value = parse_filesize(m.group('intval'))
                if comparison_value is None:
                    comparison_value = parse_filesize(m.group('intval') + 'B')
                if comparison_value is None:
                        'Invalid integer value %r in filter part %r' % (
                            m.group('intval'), filter_part))
        actual_value = dct.get(m.group('key'))
        if actual_value is None:
            return m.group('none_inclusive')
        return op(actual_value, comparison_value)

        '': lambda v: v is not None,
        '!': lambda v: v is None,
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
        op = UNARY_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
    # Fix: restore the `return all(` dropped from this excerpt; every
    # '&'-separated part must pass.
    return all(
        _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
def match_filter_func(filter_str):
    """Build a match-filter callable: returns None when the info dict
    passes *filter_str*, otherwise a human-readable skip message."""
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        else:
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP/TTML time expression ('12.3s' or 'HH:MM:SS.mmm')
    into seconds (float), or None for empty/unrecognised input."""
    if not time_expr:
        return

    mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if mobj:
        return float(mobj.group('time_offset'))

    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if mobj:
        # A ':' before the fraction (frame notation) is read as '.'.
        return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
def srt_subtitles_timecode(seconds):
    """Format a number of seconds as an SRT timecode 'HH:MM:SS,mmm'."""
    hours = seconds / 3600
    minutes = (seconds % 3600) / 60
    secs = seconds % 60
    millis = (seconds % 1) * 1000
    # %d truncates each float component toward zero.
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
def dfxp2srt(dfxp_data):
    """Convert DFXP/TTML subtitle markup into SRT text.

    NOTE(review): several lines are elided from this excerpt — the
    ns_map closer, TTMLPElementParser's __init__/end()/close() internals,
    the `out = []` accumulator, the default begin/end handling, the
    index/text arguments of the out.append format and the final
    ''.join(out) return.
    """
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',

    class TTMLPElementParser(object):
        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):

        def data(self, data):

            return self.out.strip()

    def parse_node(node):
        # Re-feed the serialized node through an XMLParser driving
        # TTMLPElementParser to flatten it to text.
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
    # Accept both TTML namespaces as well as un-namespaced <p> tags.
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p')
        raise ValueError('Invalid dfxp/TTML subtitle')

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
def cli_option(params, command_option, param):
    """Emit [command_option, value] when params[param] is set, else []."""
    value = params.get(param)
    if value is None:
        return []
    return [command_option, value]
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Emit a boolean CLI option: ['--opt', 'true'] or, with a
    separator, ['--opt=true']. params[param] must be a bool."""
    param = params.get(param)
    assert isinstance(param, bool)
    # Fix: restore the `if separator:` guard dropped from this excerpt.
    if separator:
        return [command_option + separator + (true_value if param else false_value)]
    return [command_option, true_value if param else false_value]
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Emit [command_option] when params[param] equals expected_value,
    otherwise []."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
def cli_configuration_args(params, param, default=[]):
    """Return the extra-args list stored at params[param], or *default*
    when unset. The value, when present, must be a list."""
    ex_args = params.get(param)
    # Fix: restore the None guard dropped from this excerpt.
    if ex_args is None:
        return default
    assert isinstance(ex_args, list)
    return ex_args
class ISO639Utils(object):
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    # NOTE(review): the _lang_map dict (ISO 639-1 -> ISO 639-2/T codes),
    # both @classmethod decorators and long2short's `return short_name`
    # are elided from this excerpt.

    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        return cls._lang_map.get(code[:2])

    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        for short_name, long_name in cls._lang_map.items():
            if long_name == code:
class ISO3166Utils(object):
    # From http://data.okfn.org/data/core/country-list
    # NOTE(review): the `_country_map = {` opener, its closing brace,
    # many entries of the mapping and the @classmethod decorator are
    # elided from this excerpt.
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AS': 'American Samoa',
        'AG': 'Antigua and Barbuda',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BV': 'Bouvet Island',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BF': 'Burkina Faso',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CI': 'Côte d\'Ivoire',
        'CZ': 'Czech Republic',
        'DO': 'Dominican Republic',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GW': 'Guinea-Bissau',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'IR': 'Iran, Islamic Republic of',
        'IM': 'Isle of Man',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'LA': 'Lao People\'s Democratic Republic',
        'LI': 'Liechtenstein',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MH': 'Marshall Islands',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'PS': 'Palestine, State of',
        'PG': 'Papua New Guinea',
        'PH': 'Philippines',
        'PR': 'Puerto Rico',
        'RU': 'Russian Federation',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SL': 'Sierra Leone',
        'SX': 'Sint Maarten (Dutch part)',
        'SB': 'Solomon Islands',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'SJ': 'Svalbard and Jan Mayen',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TZ': 'Tanzania, United Republic of',
        'TL': 'Timor-Leste',
        'TT': 'Trinidad and Tobago',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',

    def short2full(cls, code):
        """Convert an ISO 3166-2 country code to the corresponding full name"""
        return cls._country_map.get(code.upper())
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler that honours a per-request 'Ytdl-request-proxy'
    header; the sentinel '__noproxy__' disables proxying entirely."""

    def __init__(self, proxies=None):
        # Set default handlers
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            # NOTE(review): the `proxy = req_proxy` assignment is elided
            # from this excerpt here.
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''

    # The payload is interpreted little-endian (hence the reversal).
    payload = int(binascii.hexlify(data[::-1]), 16)
    encrypted = pow(payload, exponent, modulus)
    return '%x' % encrypted