2 # -*- coding: utf-8 -*-
4 from __future__ import unicode_literals
33 import xml.etree.ElementTree
43 compat_socket_create_connection,
47 compat_urllib_parse_urlparse,
48 compat_urllib_request,
54 # This is not clearly defined otherwise
55 compiled_regex_type = type(re.compile(''))
58 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)',
59 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
60 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
61 'Accept-Encoding': 'gzip, deflate',
62 'Accept-Language': 'en-us,en;q=0.5',
68 ENGLISH_MONTH_NAMES = [
69 'January', 'February', 'March', 'April', 'May', 'June',
70 'July', 'August', 'September', 'October', 'November', 'December']
73 def preferredencoding():
74 """Get preferred encoding.
76 Returns the best encoding scheme for the system, based on
77 locale.getpreferredencoding() and some further tweaks.
80 pref = locale.getpreferredencoding()
88 def write_json_file(obj, fn):
89 """ Encode obj as JSON and write it to fn, atomically if possible """
91 fn = encodeFilename(fn)
92 if sys.version_info < (3, 0) and sys.platform != 'win32':
93 encoding = get_filesystem_encoding()
94 # os.path.basename returns a bytes object, but NamedTemporaryFile
95 # will fail if the filename contains non ascii characters unless we
96 # use a unicode object
97 path_basename = lambda f: os.path.basename(fn).decode(encoding)
98 # the same for os.path.dirname
99 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
101 path_basename = os.path.basename
102 path_dirname = os.path.dirname
106 'prefix': path_basename(fn) + '.',
107 'dir': path_dirname(fn),
111 # In Python 2.x, json.dump expects a bytestream.
112 # In Python 3.x, it writes to a character stream
113 if sys.version_info < (3, 0):
121 tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
126 if sys.platform == 'win32':
127 # Need to remove existing file on Windows, else os.rename raises
128 # WindowsError or FileExistsError.
133 os.rename(tf.name, fn)
142 if sys.version_info >= (2, 7):
143 def find_xpath_attr(node, xpath, key, val=None):
144 """ Find the xpath xpath[@key=val] """
145 assert re.match(r'^[a-zA-Z_-]+$', key)
147 assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
148 expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
149 return node.find(expr)
151 def find_xpath_attr(node, xpath, key, val=None):
152 # Here comes the crazy part: In 2.6, if the xpath is a unicode,
153 # .//node does not match if a node is a direct child of . !
154 if isinstance(xpath, compat_str):
155 xpath = xpath.encode('ascii')
157 for f in node.findall(xpath):
158 if key not in f.attrib:
160 if val is None or f.attrib.get(key) == val:
164 # On python2.6 the xml.etree.ElementTree.Element methods don't support
165 # the namespace parameter
168 def xpath_with_ns(path, ns_map):
169 components = [c.split(':') for c in path.split('/')]
173 replaced.append(c[0])
176 replaced.append('{%s}%s' % (ns_map[ns], tag))
177 return '/'.join(replaced)
180 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
181 if sys.version_info < (2, 7): # Crazy 2.6
182 xpath = xpath.encode('ascii')
186 if default is not NO_DEFAULT:
189 name = xpath if name is None else name
190 raise ExtractorError('Could not find XML element %s' % name)
196 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
197 n = xpath_element(node, xpath, name, fatal=fatal, default=default)
198 if n is None or n == default:
201 if default is not NO_DEFAULT:
204 name = xpath if name is None else name
205 raise ExtractorError('Could not find XML element\'s text %s' % name)
211 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
212 n = find_xpath_attr(node, xpath, key)
214 if default is not NO_DEFAULT:
217 name = '%s[@%s]' % (xpath, key) if name is None else name
218 raise ExtractorError('Could not find XML attribute %s' % name)
def get_element_by_id(id, html):
    """Return the inner content of the HTML tag whose id attribute equals *id*."""
    # NOTE: the parameter *id* shadows the builtin of the same name; kept
    # unchanged for API compatibility with existing callers.
    return get_element_by_attribute("id", id, html)
229 def get_element_by_attribute(attribute, value, html):
230 """Return the content of the tag with the specified attribute in the passed HTML document"""
232 m = re.search(r'''(?xs)
234 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
236 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
240 ''' % (re.escape(attribute), re.escape(value)), html)
244 res = m.group('content')
246 if res.startswith('"') or res.startswith("'"):
249 return unescapeHTML(res)
252 def clean_html(html):
253 """Clean an HTML snippet into a readable string"""
255 if html is None: # Convenience for sanitizing descriptions etc.
259 html = html.replace('\n', ' ')
260 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
261 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
263 html = re.sub('<.*?>', '', html)
264 # Replace html entities
265 html = unescapeHTML(html)
269 def sanitize_open(filename, open_mode):
270 """Try to open the given filename, and slightly tweak it if this fails.
272 Attempts to open the given filename. If this fails, it tries to change
273 the filename slightly, step by step, until it's either able to open it
274 or it fails and raises a final exception, like the standard open()
277 It returns the tuple (stream, definitive_file_name).
281 if sys.platform == 'win32':
283 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
284 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
285 stream = open(encodeFilename(filename), open_mode)
286 return (stream, filename)
287 except (IOError, OSError) as err:
288 if err.errno in (errno.EACCES,):
291 # In case of error, try to remove win32 forbidden chars
292 alt_filename = sanitize_path(filename)
293 if alt_filename == filename:
296 # An exception here should be caught in the caller
297 stream = open(encodeFilename(alt_filename), open_mode)
298 return (stream, alt_filename)
301 def timeconvert(timestr):
302 """Convert RFC 2822 defined time string into system timestamp"""
304 timetuple = email.utils.parsedate_tz(timestr)
305 if timetuple is not None:
306 timestamp = email.utils.mktime_tz(timetuple)
310 def sanitize_filename(s, restricted=False, is_id=False):
311 """Sanitizes a string so it could be used as part of a filename.
312 If restricted is set, use a stricter subset of allowed characters.
313 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
315 def replace_insane(char):
316 if char == '?' or ord(char) < 32 or ord(char) == 127:
319 return '' if restricted else '\''
321 return '_-' if restricted else ' -'
322 elif char in '\\/|*<>':
324 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
326 if restricted and ord(char) > 127:
331 s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
332 result = ''.join(map(replace_insane, s))
334 while '__' in result:
335 result = result.replace('__', '_')
336 result = result.strip('_')
337 # Common case of "Foreign band name - English song title"
338 if restricted and result.startswith('-_'):
340 if result.startswith('-'):
341 result = '_' + result[len('-'):]
342 result = result.lstrip('.')
348 def sanitize_path(s):
349 """Sanitizes and normalizes path on Windows"""
350 if sys.platform != 'win32':
352 drive_or_unc, _ = os.path.splitdrive(s)
353 if sys.version_info < (2, 7) and not drive_or_unc:
354 drive_or_unc, _ = os.path.splitunc(s)
355 norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
359 path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|\.$)', '#', path_part)
360 for path_part in norm_path]
362 sanitized_path.insert(0, drive_or_unc + os.path.sep)
363 return os.path.join(*sanitized_path)
366 def orderedSet(iterable):
367 """ Remove all duplicates from the input iterable """
375 def _htmlentity_transform(entity):
376 """Transforms an HTML entity to a character."""
377 # Known non-numeric HTML entity
378 if entity in compat_html_entities.name2codepoint:
379 return compat_chr(compat_html_entities.name2codepoint[entity])
381 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
383 numstr = mobj.group(1)
384 if numstr.startswith('x'):
386 numstr = '0%s' % numstr
389 return compat_chr(int(numstr, base))
391 # Unknown entity in name, return its literal representation
392 return ('&%s;' % entity)
398 assert type(s) == compat_str
401 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
404 def get_subprocess_encoding():
405 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
406 # For subprocess calls, encode with locale encoding
407 # Refer to http://stackoverflow.com/a/9951851/35070
408 encoding = preferredencoding()
410 encoding = sys.getfilesystemencoding()
416 def encodeFilename(s, for_subprocess=False):
418 @param s The name of the file
421 assert type(s) == compat_str
423 # Python 3 has a Unicode API
424 if sys.version_info >= (3, 0):
427 # Pass '' directly to use Unicode APIs on Windows 2000 and up
428 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
429 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
430 if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
433 return s.encode(get_subprocess_encoding(), 'ignore')
436 def decodeFilename(b, for_subprocess=False):
438 if sys.version_info >= (3, 0):
441 if not isinstance(b, bytes):
444 return b.decode(get_subprocess_encoding(), 'ignore')
def encodeArgument(s):
    """Encode a command-line argument for passing to a subprocess.

    Delegates to encodeFilename with for_subprocess=True so the argument is
    encoded with the subprocess (locale) encoding rather than the filesystem one.
    """
    if not isinstance(s, compat_str):
        # Legacy callers may still hand us byte strings; normalize to text first.
        # TODO: replace with an assertion once all post processors pass compat_str.
        s = s.decode('ascii')
    return encodeFilename(s, for_subprocess=True)
def decodeArgument(b):
    """Decode a subprocess argument back to text (counterpart of encodeArgument)."""
    return decodeFilename(b, for_subprocess=True)
460 def decodeOption(optval):
463 if isinstance(optval, bytes):
464 optval = optval.decode(preferredencoding())
466 assert isinstance(optval, compat_str)
470 def formatSeconds(secs):
472 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
474 return '%d:%02d' % (secs // 60, secs % 60)
479 def make_HTTPS_handler(params, **kwargs):
480 opts_no_check_certificate = params.get('nocheckcertificate', False)
481 if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9
482 context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
483 if opts_no_check_certificate:
484 context.check_hostname = False
485 context.verify_mode = ssl.CERT_NONE
487 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
490 # (create_default_context present but HTTPSHandler has no context=)
493 if sys.version_info < (3, 2):
494 return YoutubeDLHTTPSHandler(params, **kwargs)
496 context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
497 context.verify_mode = (ssl.CERT_NONE
498 if opts_no_check_certificate
499 else ssl.CERT_REQUIRED)
500 context.set_default_verify_paths()
501 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
504 def bug_reports_message():
505 if ytdl_is_updateable():
506 update_cmd = 'type youtube-dl -U to update'
508 update_cmd = 'see https://yt-dl.org/update on how to update'
509 msg = '; please report this issue on https://yt-dl.org/bug .'
510 msg += ' Make sure you are using the latest version; %s.' % update_cmd
511 msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
515 class ExtractorError(Exception):
516 """Error during info extraction."""
518 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
519 """ tb, if given, is the original traceback (so that it can be printed out).
520 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
523 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
525 if video_id is not None:
526 msg = video_id + ': ' + msg
528 msg += ' (caused by %r)' % cause
530 msg += bug_reports_message()
531 super(ExtractorError, self).__init__(msg)
534 self.exc_info = sys.exc_info() # preserve original exception
536 self.video_id = video_id
538 def format_traceback(self):
539 if self.traceback is None:
541 return ''.join(traceback.format_tb(self.traceback))
class UnsupportedError(ExtractorError):
    """Raised when no extractor can handle the given URL."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        # expected=True: this is a normal user-facing error, not a bug.
        super(UnsupportedError, self).__init__(message, expected=True)
551 class RegexNotFoundError(ExtractorError):
552 """Error when a regex didn't match"""
556 class DownloadError(Exception):
557 """Download Error exception.
559 This exception may be thrown by FileDownloader objects if they are not
560 configured to continue on errors. They will contain the appropriate
564 def __init__(self, msg, exc_info=None):
565 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
566 super(DownloadError, self).__init__(msg)
567 self.exc_info = exc_info
570 class SameFileError(Exception):
571 """Same File exception.
573 This exception will be thrown by FileDownloader objects if they detect
574 multiple files would have to be downloaded to the same file on disk.
579 class PostProcessingError(Exception):
580 """Post Processing exception.
582 This exception may be raised by PostProcessor's .run() method to
583 indicate an error in the postprocessing task.
586 def __init__(self, msg):
590 class MaxDownloadsReached(Exception):
591 """ --max-downloads limit has been reached. """
595 class UnavailableVideoError(Exception):
596 """Unavailable Format exception.
598 This exception will be thrown when a video is requested
599 in a format that is not available for that video.
604 class ContentTooShortError(Exception):
605 """Content Too Short exception.
607 This exception may be raised by FileDownloader objects when a file they
608 download is too small for what the server announced first, indicating
609 the connection was probably interrupted.
612 def __init__(self, downloaded, expected):
614 self.downloaded = downloaded
615 self.expected = expected
618 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
619 # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
620 # expected HTTP responses to meet HTTP/1.0 or later (see also
621 # https://github.com/rg3/youtube-dl/issues/6727)
622 if sys.version_info < (3, 0):
623 kwargs[b'strict'] = True
624 hc = http_class(*args, **kwargs)
625 source_address = ydl_handler._params.get('source_address')
626 if source_address is not None:
627 sa = (source_address, 0)
628 if hasattr(hc, 'source_address'): # Python 2.7+
629 hc.source_address = sa
631 def _hc_connect(self, *args, **kwargs):
632 sock = compat_socket_create_connection(
633 (self.host, self.port), self.timeout, sa)
635 self.sock = ssl.wrap_socket(
636 sock, self.key_file, self.cert_file,
637 ssl_version=ssl.PROTOCOL_TLSv1)
640 hc.connect = functools.partial(_hc_connect, hc)
645 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
646 """Handler for HTTP requests and responses.
648 This class, when installed with an OpenerDirector, automatically adds
649 the standard headers to every HTTP request and handles gzipped and
650 deflated responses from web servers. If compression is to be avoided in
651 a particular request, the original request in the program code only has
652 to include the HTTP header "Youtubedl-No-Compression", which will be
653 removed before making the real request.
655 Part of this code was copied from:
657 http://techknack.net/python-urllib2-handlers/
659 Andrew Rowls, the author of that code, agreed to release it to the
663 def __init__(self, params, *args, **kwargs):
664 compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
665 self._params = params
667 def http_open(self, req):
668 return self.do_open(functools.partial(
669 _create_http_connection, self, compat_http_client.HTTPConnection, False),
675 return zlib.decompress(data, -zlib.MAX_WBITS)
677 return zlib.decompress(data)
680 def addinfourl_wrapper(stream, headers, url, code):
681 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
682 return compat_urllib_request.addinfourl(stream, headers, url, code)
683 ret = compat_urllib_request.addinfourl(stream, headers, url)
687 def http_request(self, req):
688 # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
689 # always respected by websites, some tend to give out URLs with non percent-encoded
690 # non-ASCII characters (see telemb.py, ard.py [#3412])
691 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
692 # To work around aforementioned issue we will replace request's original URL with
693 # percent-encoded one
694 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
695 # the code of this workaround has been moved here from YoutubeDL.urlopen()
696 url = req.get_full_url()
697 url_escaped = escape_url(url)
699 # Substitute URL if any change after escaping
700 if url != url_escaped:
701 req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
703 url_escaped, data=req.data, headers=req.headers,
704 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
705 new_req.timeout = req.timeout
708 for h, v in std_headers.items():
709 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
710 # The dict keys are capitalized because of this bug by urllib
711 if h.capitalize() not in req.headers:
713 if 'Youtubedl-no-compression' in req.headers:
714 if 'Accept-encoding' in req.headers:
715 del req.headers['Accept-encoding']
716 del req.headers['Youtubedl-no-compression']
718 if sys.version_info < (2, 7) and '#' in req.get_full_url():
719 # Python 2.6 is brain-dead when it comes to fragments
720 req._Request__original = req._Request__original.partition('#')[0]
721 req._Request__r_type = req._Request__r_type.partition('#')[0]
725 def http_response(self, req, resp):
728 if resp.headers.get('Content-encoding', '') == 'gzip':
729 content = resp.read()
730 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
732 uncompressed = io.BytesIO(gz.read())
733 except IOError as original_ioerror:
734 # There may be junk add the end of the file
735 # See http://stackoverflow.com/q/4928560/35070 for details
736 for i in range(1, 1024):
738 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
739 uncompressed = io.BytesIO(gz.read())
744 raise original_ioerror
745 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
746 resp.msg = old_resp.msg
748 if resp.headers.get('Content-encoding', '') == 'deflate':
749 gz = io.BytesIO(self.deflate(resp.read()))
750 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
751 resp.msg = old_resp.msg
752 # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
753 # https://github.com/rg3/youtube-dl/issues/6457).
754 if 300 <= resp.code < 400:
755 location = resp.headers.get('Location')
757 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
758 if sys.version_info >= (3, 0):
759 location = location.encode('iso-8859-1').decode('utf-8')
760 location_escaped = escape_url(location)
761 if location != location_escaped:
762 del resp.headers['Location']
763 resp.headers['Location'] = location_escaped
766 https_request = http_request
767 https_response = http_response
770 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
771 def __init__(self, params, https_conn_class=None, *args, **kwargs):
772 compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
773 self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
774 self._params = params
776 def https_open(self, req):
778 if hasattr(self, '_context'): # python > 2.6
779 kwargs['context'] = self._context
780 if hasattr(self, '_check_hostname'): # python 3.x
781 kwargs['check_hostname'] = self._check_hostname
782 return self.do_open(functools.partial(
783 _create_http_connection, self, self._https_conn_class, True),
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor that also applies cookie handling to HTTPS traffic."""

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on the next HTTP request in a row if there are
        # non-ASCII characters in the Set-Cookie HTTP header of the last
        # response (see https://github.com/rg3/youtube-dl/issues/6769).
        # The percent-encoding workaround below is intentionally left disabled:
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #         if set_cookie != set_cookie_escaped:
        #             del response.headers[set_cookie_header]
        #             response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    # Reuse the HTTP handlers for HTTPS requests/responses as well.
    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
811 def parse_iso8601(date_str, delimiter='T', timezone=None):
812 """ Return a UNIX timestamp from the given date """
817 date_str = re.sub(r'\.[0-9]+', '', date_str)
821 r'(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
824 timezone = datetime.timedelta()
826 date_str = date_str[:-len(m.group(0))]
827 if not m.group('sign'):
828 timezone = datetime.timedelta()
830 sign = 1 if m.group('sign') == '+' else -1
831 timezone = datetime.timedelta(
832 hours=sign * int(m.group('hours')),
833 minutes=sign * int(m.group('minutes')))
835 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
836 dt = datetime.datetime.strptime(date_str, date_format) - timezone
837 return calendar.timegm(dt.timetuple())
842 def unified_strdate(date_str, day_first=True):
843 """Return a string with the date in the format YYYYMMDD"""
849 date_str = date_str.replace(',', ' ')
850 # %z (UTC offset) is only supported in python>=3.2
851 if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
852 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
853 # Remove AM/PM + timezone
854 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
856 format_expressions = [
861 '%b %dst %Y %I:%M%p',
862 '%b %dnd %Y %I:%M%p',
863 '%b %dth %Y %I:%M%p',
869 '%Y-%m-%d %H:%M:%S.%f',
872 '%Y-%m-%dT%H:%M:%SZ',
873 '%Y-%m-%dT%H:%M:%S.%fZ',
874 '%Y-%m-%dT%H:%M:%S.%f0Z',
876 '%Y-%m-%dT%H:%M:%S.%f',
880 format_expressions.extend([
888 format_expressions.extend([
895 for expression in format_expressions:
897 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
900 if upload_date is None:
901 timetuple = email.utils.parsedate_tz(date_str)
903 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
907 def determine_ext(url, default_ext='unknown_video'):
910 guess = url.partition('?')[0].rpartition('.')[2]
911 if re.match(r'^[A-Za-z0-9]+$', guess):
def subtitles_filename(filename, sub_lang, sub_format):
    """Build a subtitle file name <base>.<lang>.<format> from a media *filename*."""
    base = filename.rsplit('.', 1)[0]
    return '.'.join((base, sub_lang, sub_format))
921 def date_from_str(date_str):
923 Return a datetime object from a string in the format YYYYMMDD or
924 (now|today)[+-][0-9](day|week|month|year)(s)?"""
925 today = datetime.date.today()
926 if date_str in ('now', 'today'):
928 if date_str == 'yesterday':
929 return today - datetime.timedelta(days=1)
930 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
931 if match is not None:
932 sign = match.group('sign')
933 time = int(match.group('time'))
936 unit = match.group('unit')
937 # A bad aproximation?
945 delta = datetime.timedelta(**{unit: time})
947 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
950 def hyphenate_date(date_str):
952 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
953 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
954 if match is not None:
955 return '-'.join(match.groups())
960 class DateRange(object):
961 """Represents a time interval between two dates"""
963 def __init__(self, start=None, end=None):
964 """start and end must be strings in the format accepted by date"""
965 if start is not None:
966 self.start = date_from_str(start)
968 self.start = datetime.datetime.min.date()
970 self.end = date_from_str(end)
972 self.end = datetime.datetime.max.date()
973 if self.start > self.end:
974 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
978 """Returns a range that only contains the given day"""
981 def __contains__(self, date):
982 """Check if the date is in the range"""
983 if not isinstance(date, datetime.date):
984 date = date_from_str(date)
985 return self.start <= date <= self.end
988 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
992 """ Returns the platform name as a compat_str """
993 res = platform.platform()
994 if isinstance(res, bytes):
995 res = res.decode(preferredencoding())
997 assert isinstance(res, compat_str)
1001 def _windows_write_string(s, out):
1002 """ Returns True if the string was written using special methods,
1003 False if it has yet to be written out."""
1004 # Adapted from http://stackoverflow.com/a/3259271/35070
1007 import ctypes.wintypes
1015 fileno = out.fileno()
1016 except AttributeError:
1017 # If the output stream doesn't have a fileno, it's virtual
1019 except io.UnsupportedOperation:
1020 # Some strange Windows pseudo files?
1022 if fileno not in WIN_OUTPUT_IDS:
1025 GetStdHandle = ctypes.WINFUNCTYPE(
1026 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
1027 (b"GetStdHandle", ctypes.windll.kernel32))
1028 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
1030 WriteConsoleW = ctypes.WINFUNCTYPE(
1031 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
1032 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
1033 ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
1034 written = ctypes.wintypes.DWORD(0)
1036 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
1037 FILE_TYPE_CHAR = 0x0002
1038 FILE_TYPE_REMOTE = 0x8000
1039 GetConsoleMode = ctypes.WINFUNCTYPE(
1040 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
1041 ctypes.POINTER(ctypes.wintypes.DWORD))(
1042 (b"GetConsoleMode", ctypes.windll.kernel32))
1043 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
1045 def not_a_console(handle):
1046 if handle == INVALID_HANDLE_VALUE or handle is None:
1048 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
1049 GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
1051 if not_a_console(h):
1054 def next_nonbmp_pos(s):
1056 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
1057 except StopIteration:
1061 count = min(next_nonbmp_pos(s), 1024)
1063 ret = WriteConsoleW(
1064 h, s, count if count else 2, ctypes.byref(written), None)
1066 raise OSError('Failed to write string')
1067 if not count: # We just wrote a non-BMP character
1068 assert written.value == 2
1071 assert written.value > 0
1072 s = s[written.value:]
1076 def write_string(s, out=None, encoding=None):
1079 assert type(s) == compat_str
1081 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
1082 if _windows_write_string(s, out):
1085 if ('b' in getattr(out, 'mode', '') or
1086 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
1087 byt = s.encode(encoding or preferredencoding(), 'ignore')
1089 elif hasattr(out, 'buffer'):
1090 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1091 byt = s.encode(enc, 'ignore')
1092 out.buffer.write(byt)
1098 def bytes_to_intlist(bs):
1101 if isinstance(bs[0], int): # Python 3
1104 return [ord(c) for c in bs]
1107 def intlist_to_bytes(xs):
1110 return struct_pack('%dB' % len(xs), *xs)
1113 # Cross-platform file locking
1114 if sys.platform == 'win32':
1115 import ctypes.wintypes
1118 class OVERLAPPED(ctypes.Structure):
1120 ('Internal', ctypes.wintypes.LPVOID),
1121 ('InternalHigh', ctypes.wintypes.LPVOID),
1122 ('Offset', ctypes.wintypes.DWORD),
1123 ('OffsetHigh', ctypes.wintypes.DWORD),
1124 ('hEvent', ctypes.wintypes.HANDLE),
1127 kernel32 = ctypes.windll.kernel32
1128 LockFileEx = kernel32.LockFileEx
1129 LockFileEx.argtypes = [
1130 ctypes.wintypes.HANDLE, # hFile
1131 ctypes.wintypes.DWORD, # dwFlags
1132 ctypes.wintypes.DWORD, # dwReserved
1133 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1134 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1135 ctypes.POINTER(OVERLAPPED) # Overlapped
1137 LockFileEx.restype = ctypes.wintypes.BOOL
1138 UnlockFileEx = kernel32.UnlockFileEx
1139 UnlockFileEx.argtypes = [
1140 ctypes.wintypes.HANDLE, # hFile
1141 ctypes.wintypes.DWORD, # dwReserved
1142 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1143 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1144 ctypes.POINTER(OVERLAPPED) # Overlapped
1146 UnlockFileEx.restype = ctypes.wintypes.BOOL
1147 whole_low = 0xffffffff
1148 whole_high = 0x7fffffff
def _lock_file(f, exclusive):
    # Acquire a whole-file lock on *f* via the Win32 LockFileEx API.
    # 0x2 is LOCKFILE_EXCLUSIVE_LOCK; 0x0 requests a shared lock.
    overlapped = OVERLAPPED()
    overlapped.Offset = 0
    overlapped.OffsetHigh = 0
    overlapped.hEvent = 0
    # Stash the OVERLAPPED pointer on the file object so it stays alive
    # until the matching _unlock_file call uses it.
    f._lock_file_overlapped_p = ctypes.pointer(overlapped)
    handle = msvcrt.get_osfhandle(f.fileno())
    if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                      whole_low, whole_high, f._lock_file_overlapped_p):
        raise OSError('Locking file failed: %r' % ctypes.FormatError())
def _unlock_file(f):
    # Release the lock taken by _lock_file, reusing the OVERLAPPED pointer
    # that _lock_file stashed on the file object.
    assert f._lock_file_overlapped_p
    handle = msvcrt.get_osfhandle(f.fileno())
    if not UnlockFileEx(handle, 0,
                        whole_low, whole_high, f._lock_file_overlapped_p):
        raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1171 def _lock_file(f, exclusive):
1172 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1174 def _unlock_file(f):
1175 fcntl.flock(f, fcntl.LOCK_UN)
1178 class locked_file(object):
1179 def __init__(self, filename, mode, encoding=None):
1180 assert mode in ['r', 'a', 'w']
1181 self.f = io.open(filename, mode, encoding=encoding)
1184 def __enter__(self):
1185 exclusive = self.mode != 'r'
1187 _lock_file(self.f, exclusive)
1193 def __exit__(self, etype, value, traceback):
1195 _unlock_file(self.f)
    def write(self, *args):
        # Delegate straight to the wrapped file object.
        return self.f.write(*args)

    def read(self, *args):
        # Delegate straight to the wrapped file object.
        return self.f.read(*args)
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), falling back to 'utf-8' when None."""
    enc = sys.getfilesystemencoding()
    if enc is None:
        return 'utf-8'
    return enc
1214 def shell_quote(args):
1216 encoding = get_filesystem_encoding()
1218 if isinstance(a, bytes):
1219 # We may get a filename encoded with 'encodeFilename'
1220 a = a.decode(encoding)
1221 quoted_args.append(pipes.quote(a))
1222 return ' '.join(quoted_args)
def smuggle_url(url, data):
    """Pass additional data in a URL for internal use.

    The data is JSON-encoded and appended as a URL fragment so that it can be
    recovered later with unsmuggle_url.
    """
    smuggled = compat_urllib_parse.urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return '%s#%s' % (url, smuggled)
1233 def unsmuggle_url(smug_url, default=None):
1234 if '#__youtubedl_smuggle' not in smug_url:
1235 return smug_url, default
1236 url, _, sdata = smug_url.rpartition('#')
1237 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
1238 data = json.loads(jsond)
1242 def format_bytes(bytes):
1245 if type(bytes) is str:
1246 bytes = float(bytes)
1250 exponent = int(math.log(bytes, 1024.0))
1251 suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
1252 converted = float(bytes) / float(1024 ** exponent)
1253 return '%.2f%s' % (converted, suffix)
def parse_filesize(s):
    """Parse a human-readable file size ('5 MiB', '2,3GB') into bytes.

    NOTE(review): the `_UNIT_TABLE` mapping, the None guard and the
    `m = re.search(...)` binding with its `if not m: return None` check
    are elided in this view — confirm against the full file.
    """
    # The lower-case forms are of course incorrect and inofficial,
    # but we support those too
    units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
    # European-style decimal comma is normalised to a point.
    num_str = m.group('num').replace(',', '.')
    mult = _UNIT_TABLE[m.group('unit')]
    return int(float(num_str) * mult)
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """
    try:
        return ENGLISH_MONTH_NAMES.index(name) + 1
    except ValueError:
        # Unknown month names yield None rather than raising.
        return None
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
    three-letter abbreviation """
    try:
        return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
    except ValueError:
        # Unknown abbreviations yield None rather than raising.
        return None
def fix_xml_ampersands(xml_str):
    """Replace every bare '&' by '&amp;' in XML, leaving existing entities
    (named, decimal and hex character references) untouched."""
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
        '&amp;',
        xml_str)
def setproctitle(title):
    """Best-effort: rename the current process via libc prctl(PR_SET_NAME).

    NOTE(review): the `try:` that guards the LoadLibrary/prctl calls is
    elided in this view; only its `except AttributeError:` is visible.
    """
    assert isinstance(title, compat_str)
    libc = ctypes.cdll.LoadLibrary("libc.so.6")
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
    # 15 == PR_SET_NAME (see <linux/prctl.h>)
    libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
def remove_start(s, start):
    """Return s with the prefix `start` removed; s unchanged otherwise."""
    if s.startswith(start):
        return s[len(start):]
    # No such prefix: return the input unchanged instead of None.
    return s
def remove_end(s, end):
    """Return s with the suffix `end` removed; s unchanged otherwise."""
    if s.endswith(end):
        # Slice via len(s) - len(end) so an empty suffix is a no-op
        # (s[:-0] would wrongly yield '').
        return s[:len(s) - len(end)]
    return s
def url_basename(url):
    """Return the last path component of url (query/fragment ignored)."""
    parsed_path = compat_urlparse.urlparse(url).path
    components = parsed_path.strip('/').split('/')
    return components[-1]
class HEADRequest(compat_urllib_request.Request):
    """Request subclass that issues an HTTP HEAD instead of GET."""

    def get_method(self):
        # urllib consults this method to pick the HTTP verb; without the
        # return the request would report None and break.
        return 'HEAD'
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to int with optional scaling; None maps to `default`.

    When get_attr is given, the named attribute is read off v first.
    """
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    if v is None:
        return default
    return int(v) * invscale // scale
def str_or_none(v, default=None):
    """Coerce v to compat_str, passing None through as `default`."""
    if v is None:
        return default
    return compat_str(v)
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if int_str is None:
        return None
    # Strip thousands separators and an optional leading '+'.
    int_str = re.sub(r'[,\.\+]', '', int_str)
    return int(int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to float with optional scaling; None maps to `default`."""
    if v is None:
        return default
    return float(v) * invscale / scale
def parse_duration(s):
    """Parse a duration string ('3 min', '1:02:03', '4.5s') into seconds.

    NOTE(review): this view elides the `re.match` call that binds `m`,
    an `only_secs` style alternative, the `res = 0` initialisation, the
    `if m.group(...)` guards around several additions, and the final
    `return res` — confirm against the full file.
    """
    if not isinstance(s, compat_basestring):
        (?P<only_mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*|
        (?P<only_hours>[0-9.]+)\s*(?:hours?)|
        \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?\.?|minutes?)\s*|
        (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
        (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
        (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
        (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
    # "N minutes" / "N hours" shortcuts return directly.
    if m.group('only_mins'):
        return float_or_none(m.group('only_mins'), invscale=60)
    if m.group('only_hours'):
        return float_or_none(m.group('only_hours'), invscale=60 * 60)
    res += int(m.group('secs'))
    if m.group('mins_reversed'):
        res += int(m.group('mins_reversed')) * 60
    res += int(m.group('mins')) * 60
    if m.group('hours'):
        res += int(m.group('hours')) * 60 * 60
    if m.group('hours_reversed'):
        res += int(m.group('hours_reversed')) * 60 * 60
    res += int(m.group('days')) * 24 * 60 * 60
    res += float(m.group('ms'))
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert `ext` before the real extension of `filename`.

    If expected_real_ext is given and the actual extension differs,
    `ext` is appended after the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    return (
        '{0}.{1}{2}'.format(name, ext, real_ext)
        if not expected_real_ext or real_ext[1:] == expected_real_ext
        else '{0}.{1}'.format(filename, ext))
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace filename's extension with `ext`.

    If expected_real_ext is given and the actual extension differs,
    `ext` is appended to the whole filename instead of replacing.
    """
    name, real_ext = os.path.splitext(filename)
    return '{0}.{1}'.format(
        name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
        ext)
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    try:
        subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        # Binary not found / not executable: report absence, don't crash.
        return False
    return exe
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    try:
        out, _ = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
    except OSError:
        # Executable missing or not runnable.
        return False
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from an executable's --version output.

    Falls back to `unrecognized` when no version can be matched.
    """
    assert isinstance(output, compat_str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
    if m:
        return m.group(1)
    else:
        return unrecognized
class PagedList(object):
    """Abstract base for lazily paged result lists; subclasses provide
    getslice(start, end).

    NOTE(review): the `def __len__(self):` header is elided in this
    view; the visible `return` belongs to it.
    """
    # This is only useful for tests
    return len(self.getslice())
class OnDemandPagedList(PagedList):
    """PagedList that fetches pages on demand via pagefunc(pagenum).

    NOTE(review): several lines of getslice are elided in this view
    (`res = []`, the `continue`/`break` statements, the `startv =` /
    `endv =` assignment headers and the final `return res`) — confirm
    against the full file.
    """

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        # Walk pages starting at the page containing `start`.
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
            page_results = list(self._pagefunc(pagenum))
            # Offset into the first interesting page.
                start % self._pagesize
                if firstid <= start < nextfirstid
            # One-past-the-end offset within the last interesting page.
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
class InAdvancePagedList(PagedList):
    """PagedList whose total page count is known up front.

    NOTE(review): elided in this view: `res = []`, the `end_page = min(`
    opener, the `if skip_elems:` guard with its reset, the loop's
    `break`, `res.extend(page)` and the final `return res` — confirm
    against the full file.
    """

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        start_page = start // self._pagesize
            self._pagecount if end is None else (end // self._pagesize + 1))
        # Elements to drop from the first page to honour `start`.
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
                page = page[skip_elems:]
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                    page = page[:only_more]
def uppercase_escape(s):
    """Decode literal \\UXXXXXXXX escapes found in s into characters."""
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: unicode_escape(m.group(0))[0],
        s)
def lowercase_escape(s):
    """Decode literal \\uXXXX escapes found in s into characters."""
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\u[0-9a-fA-F]{4}',
        lambda m: unicode_escape(m.group(0))[0],
        s)
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # Python 2's quote() chokes on unicode input; pre-encode it there.
    if sys.version_info < (3, 0) and isinstance(s, compat_str):
        s = s.encode('utf-8')
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe_chars)
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    url_parsed = compat_urllib_parse_urlparse(url)
    # Escape each component individually, then reassemble the URL string.
    return url_parsed._replace(
        path=escape_rfc3986(url_parsed.path),
        params=escape_rfc3986(url_parsed.params),
        query=escape_rfc3986(url_parsed.query),
        fragment=escape_rfc3986(url_parsed.fragment)
    ).geturl()
    # Probe whether this interpreter's struct accepts a unicode format
    # string.  NOTE(review): the module-level `try:` / `except TypeError:`
    # / `else:` framing of this probe is elided in this view.
    struct.pack('!I', 0)
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        # Coerce the format spec to bytes for old-struct compatibility.
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.pack(spec, *args)

    def struct_unpack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.unpack(spec, *args)

    # Modern struct: use the builtins directly.
    struct_pack = struct.pack
    struct_unpack = struct.unpack
def read_batch_urls(batch_fd):
    """Read URLs from a batch file object, skipping comments and blanks.

    NOTE(review): the inner `def fixup(url):` header, a `url = url.strip()`
    line and fixup's return statements are elided in this view — confirm
    against the full file.
    """
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # Strip a UTF-8 BOM that survived decoding as raw characters.
        BOM_UTF8 = '\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        # Lines starting with '#', ';' or ']' are treated as comments.
        if url.startswith(('#', ';', ']')):

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
def encode_dict(d, encoding='utf-8'):
    """Return a copy of d with every key and value encoded to bytes."""
    encoded = {}
    for key, value in d.items():
        encoded[key.encode(encoding)] = value.encode(encoding)
    return encoded
    # Pick an ElementTree iterator: Element.iter where available, falling
    # back to findall('.//*') on very old interpreters.
    # NOTE(review): the module-level `try:` opening this guard is elided
    # in this view.
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    etree_iter = lambda n: n.findall('.//*')
    # Body of parse_xml(s).  NOTE(review): the `def parse_xml(s):` header
    # and the final `return tree` are elided in this view.
    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
    # Only Python >= 2.7 accepts a custom parser keyword here.
    kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
    tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for n in etree_iter(tree):
            if n.text is not None:
                if not isinstance(n.text, compat_str):
                    n.text = n.text.decode('utf-8')
def parse_age_limit(s):
    """Parse an age limit like '18' or '18+' into an int.

    Returns None for None input; non-numeric strings fall back to the
    US_RATINGS lookup table (e.g. 'PG-13').
    """
    if s is None:
        # re.match would raise TypeError on None.
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    return int(m.group('age')) if m else US_RATINGS.get(s, None)
def strip_jsonp(code):
    """Strip a JSONP callback wrapper, returning the bare JSON payload."""
    return re.sub(
        r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
def js_to_json(code):
    """Convert JavaScript-ish object literals into valid JSON text.

    NOTE(review): this view elides the inner `def fix_kv(m):` header,
    its `v = m.group(0)` binding, several return statements, the
    escape-mapping dict body and the final `return res` — confirm
    against the full file.
    """
        if v in ('true', 'false', 'null'):
        if v.startswith('"'):
            v = re.sub(r"\\'", "'", v[1:-1])
        elif v.startswith("'"):
            v = re.sub(r"\\\\|\\'|\"", lambda m: {
    res = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
        [a-zA-Z_][.a-zA-Z_0-9]*
    # Drop trailing commas before closing brackets/braces.
    res = re.sub(r',(\s*[\]}])', lambda m: m.group(1), res)
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        # Position in the list is the quality rank; unknown ids rank
        # lowest (-1) instead of raising ValueError.
        try:
            return quality_ids.index(qid)
        except ValueError:
            return -1
    return q
# Default output filename template: "<title>-<id>.<ext>".
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) > length:
        # Truncate so the result, including the ellipses, fits `length`.
        return s[:length - len(ELLIPSES)] + ELLIPSES
    return s
def version_tuple(v):
    """Split a dotted/dashed version string into a tuple of ints."""
    parts = re.split(r'[-.]', v)
    return tuple(map(int, parts))
def is_outdated_version(version, limit, assume_new=True):
    """Compare two dotted version strings.

    Empty/unparsable input falls back to `not assume_new` instead of
    raising, so callers get a usable boolean either way.
    """
    if not version:
        return not assume_new
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return not assume_new
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter
    running_from_zip = isinstance(globals().get('__loader__'), zipimporter)
    is_frozen = hasattr(sys, 'frozen')
    return running_from_zip or is_frozen
def args_to_str(args):
    # Get a short string representation for a subprocess command
    quoted = [shlex_quote(a) for a in args]
    return ' '.join(quoted)
def mimetype2ext(mt):
    """Map a MIME type to a file extension, defaulting to the subtype."""
    _, _, res = mt.rpartition('/')
    # Subtypes whose conventional extension differs from their name.
    return {
        'x-mp4-fragmented': 'mp4',
    }.get(res, res)
def urlhandle_detect_ext(url_handle):
    """Guess a file extension for a urllib response object.

    Prefers a Content-Disposition filename, falling back to the
    Content-Type MIME type.  NOTE(review): the `try:` framing and the
    `if cd:` / `if m:` / `if e:` guards with an intermediate `return e`
    are elided in this view — confirm against the full file.
    """
        # Python 3 exposes headers as a mapping.
        getheader = lambda h: url_handle.headers[h]
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader

    cd = getheader('Content-Disposition')
    m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
    e = determine_ext(m.group('filename'), default_ext=None)

    return mimetype2ext(getheader('Content-Type'))
def encode_data_uri(data, mime_type):
    """Wrap raw bytes into an RFC 2397 data: URI with the given MIME type."""
    b64_payload = base64.b64encode(data).decode('ascii')
    return 'data:{0};base64,{1}'.format(mime_type, b64_payload)
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    if age_limit is None:  # No limit set
        return False
    if content_limit is None:
        return False  # Content available for everyone
    return age_limit < content_limit
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    # Recognised byte-order marks, longest first so UTF-32 wins over UTF-16.
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            s = first_bytes[len(bom):].decode(enc, 'replace')
            break
    else:
        # No BOM: assume UTF-8 and decode leniently.
        s = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', s)
def determine_protocol(info_dict):
    """Work out the download protocol for an info dict.

    NOTE(review): the `return` statements of each branch (the explicit
    protocol, 'rtmp', 'mms', 'rtsp' and the m3u8/f4m handling after the
    `determine_ext` call) are elided in this view — confirm against the
    full file.
    """
    protocol = info_dict.get('protocol')
    if protocol is not None:

    url = info_dict['url']
    if url.startswith('rtmp'):
    elif url.startswith('mms'):
    elif url.startswith('rtsp'):

    ext = determine_ext(url)

    # Fallback: the URL scheme itself (http, https, ftp, ...).
    return compat_urllib_parse_urlparse(url).scheme
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    all_rows = [header_row] + data
    column_widths = []
    for column in zip(*all_rows):
        column_widths.append(max(len(compat_str(cell)) for cell in column))
    # Left-pad every column but the last; the last keeps its natural width.
    pieces = ['%-' + compat_str(width + 1) + 's' for width in column_widths[:-1]]
    row_format = ' '.join(pieces) + '%s'
    return '\n'.join(row_format % tuple(row) for row in all_rows)
def _match_one(filter_part, dct):
    """Evaluate one comparison or unary filter expression against dct.

    NOTE(review): this view elides the COMPARISON_OPERATORS entries, the
    `(?P<key>...)` part of the regex, the `if m:` guards, the
    `UNARY_OPERATORS = {` opener and the try/except around int parsing —
    confirm against the full file.
    """
    COMPARISON_OPERATORS = {
    operator_rex = re.compile(r'''(?x)\s*
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
        (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    op = COMPARISON_OPERATORS[m.group('op')]
    if m.group('strval') is not None:
        if m.group('op') not in ('=', '!='):
            # String operands only make sense for (in)equality tests.
                'Operator %s does not support string values!' % m.group('op'))
        comparison_value = m.group('strval')
        comparison_value = int(m.group('intval'))
        # Fall back to human-readable sizes ('500KiB', then '500K' + 'B').
        comparison_value = parse_filesize(m.group('intval'))
        if comparison_value is None:
            comparison_value = parse_filesize(m.group('intval') + 'B')
        if comparison_value is None:
                'Invalid integer value %r in filter part %r' % (
                    m.group('intval'), filter_part))
    actual_value = dct.get(m.group('key'))
    if actual_value is None:
        # '?' after the operator makes a missing key pass the test.
        return m.group('none_inclusive')
    return op(actual_value, comparison_value)

        '': lambda v: v is not None,
        '!': lambda v: v is None,
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    op = UNARY_OPERATORS[m.group('op')]
    actual_value = dct.get(m.group('key'))
    return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
    # '&'-separated clauses must all hold.
    return all(
        _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
def match_filter_func(filter_str):
    """Build a match-filter callback for the given filter expression.

    The returned function yields None when the video passes, or a
    human-readable skip message when it does not.
    """
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        else:
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP/TTML time expression ('3.5s' or 'HH:MM:SS.mmm') into
    seconds; returns None for empty/unrecognised input."""
    if not time_expr:
        return

    mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if mobj:
        return float(mobj.group('time_offset'))

    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:\.\d+)?)$', time_expr)
    if mobj:
        return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3))
def srt_subtitles_timecode(seconds):
    """Format a seconds offset as an SRT timecode: HH:MM:SS,mmm."""
    hours = seconds / 3600
    minutes = (seconds % 3600) / 60
    secs = seconds % 60
    millis = (seconds % 1) * 1000
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
def dfxp2srt(dfxp_data):
    """Convert DFXP/TTML subtitle XML into SRT-formatted text.

    NOTE(review): several structural lines are elided in this view (the
    ns_map closer, parse_node's child loop header and `return out`, the
    `out = []` accumulator, missing-attribute handling around
    begin/end/dur, and the final joined return) — confirm against the
    full file.
    """
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',

    def parse_node(node):
        # Flatten a paragraph node to plain text, honouring <br>/<span>.
        str_or_empty = functools.partial(str_or_none, default='')

        out = str_or_empty(node.text)

            if child.tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
                out += '\n' + str_or_empty(child.tail)
            elif child.tag in (_x('ttml:span'), _x('ttaf1:span'), 'span'):
                out += str_or_empty(parse_node(child))
                out += str_or_empty(xml.etree.ElementTree.tostring(child))

    dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8'))

    # Try both TTML namespaces before falling back to un-namespaced <p>.
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p')
        raise ValueError('Invalid dfxp/TTML subtitle')

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib['begin'])
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
            end_time = begin_time + parse_dfxp_time_expr(para.attrib['dur'])
        out.append('%d\n%s --> %s\n%s\n\n' % (
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
def cli_option(params, command_option, param):
    """Return [command_option, value] when params[param] is set, else []."""
    value = params.get(param)
    if value is None:
        return []
    return [command_option, value]
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Render a boolean param as CLI args.

    With a separator, produces one '--opt<sep>value' token; otherwise two
    separate tokens. params[param] must be a bool.
    """
    param = params.get(param)
    assert isinstance(param, bool)
    if separator:
        return [command_option + separator + (true_value if param else false_value)]
    return [command_option, true_value if param else false_value]
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return [command_option] when params[param] equals expected_value."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
def cli_configuration_args(params, param, default=[]):
    """Return the extra-args list stored at params[param], or `default`
    when the key is unset. The stored value must be a list."""
    ex_args = params.get(param)
    if ex_args is None:
        return default
    assert isinstance(ex_args, list)
    return ex_args
class ISO639Utils(object):
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    # NOTE(review): the `_lang_map` table, the @classmethod decorators and
    # long2short's trailing `return short_name` are elided in this view —
    # confirm against the full file.
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        return cls._lang_map.get(code[:2])

    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        # Reverse lookup over the forward map.
        for short_name, long_name in cls._lang_map.items():
            if long_name == code:
class ISO3166Utils(object):
    # From http://data.okfn.org/data/core/country-list
    # NOTE(review): the `_country_map = {` opener, most entries, the
    # closing brace and the @classmethod decorator are elided in this
    # view — confirm against the full file.
        'AF': 'Afghanistan',
        'AX': 'Ã…land Islands',  # NOTE(review): looks mojibake-encoded ('Åland') — confirm file encoding
        'AS': 'American Samoa',
        'AG': 'Antigua and Barbuda',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BV': 'Bouvet Island',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BF': 'Burkina Faso',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CI': 'Côte d\'Ivoire',
        'CZ': 'Czech Republic',
        'DO': 'Dominican Republic',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GW': 'Guinea-Bissau',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'IR': 'Iran, Islamic Republic of',
        'IM': 'Isle of Man',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'LA': 'Lao People\'s Democratic Republic',
        'LI': 'Liechtenstein',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MH': 'Marshall Islands',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'PS': 'Palestine, State of',
        'PG': 'Papua New Guinea',
        'PH': 'Philippines',
        'PR': 'Puerto Rico',
        'RU': 'Russian Federation',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SL': 'Sierra Leone',
        'SX': 'Sint Maarten (Dutch part)',
        'SB': 'Solomon Islands',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'SJ': 'Svalbard and Jan Mayen',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TZ': 'Tanzania, United Republic of',
        'TL': 'Timor-Leste',
        'TT': 'Trinidad and Tobago',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',

    def short2full(cls, code):
        """Convert an ISO 3166-2 country code to the corresponding full name"""
        return cls._country_map.get(code.upper())
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler that honours a per-request 'Ytdl-request-proxy' header."""

    def __init__(self, proxies=None):
        # Set default handlers so proxy_open is consulted for each scheme
        # even when no proxy is configured ('__noproxy__' sentinel).
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            # The per-request header overrides any configured proxy; the
            # previous code deleted the header without applying it.
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)