2 # -*- coding: utf-8 -*-
4 from __future__ import unicode_literals
32 import xml.etree.ElementTree
42 compat_socket_create_connection,
46 compat_urllib_parse_urlparse,
47 compat_urllib_request,
# Type of a compiled regular expression; the re module exposes no public
# name for it, so derive it from an actual compiled pattern.
compiled_regex_type = type(re.compile(''))
57 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)',
58 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
59 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
60 'Accept-Encoding': 'gzip, deflate',
61 'Accept-Language': 'en-us,en;q=0.5',
# Locale-independent English month names; index + 1 gives the month number.
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    # Visible code never returned and did not guard against a broken locale;
    # verify the reported encoding actually works before trusting it.
    try:
        pref = locale.getpreferredencoding()
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref
87 def write_json_file(obj, fn):
88 """ Encode obj as JSON and write it to fn, atomically if possible """
90 fn = encodeFilename(fn)
91 if sys.version_info < (3, 0) and sys.platform != 'win32':
92 encoding = get_filesystem_encoding()
93 # os.path.basename returns a bytes object, but NamedTemporaryFile
94 # will fail if the filename contains non ascii characters unless we
95 # use a unicode object
96 path_basename = lambda f: os.path.basename(fn).decode(encoding)
97 # the same for os.path.dirname
98 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
100 path_basename = os.path.basename
101 path_dirname = os.path.dirname
105 'prefix': path_basename(fn) + '.',
106 'dir': path_dirname(fn),
110 # In Python 2.x, json.dump expects a bytestream.
111 # In Python 3.x, it writes to a character stream
112 if sys.version_info < (3, 0):
120 tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
125 if sys.platform == 'win32':
126 # Need to remove existing file on Windows, else os.rename raises
127 # WindowsError or FileExistsError.
132 os.rename(tf.name, fn)
141 if sys.version_info >= (2, 7):
142 def find_xpath_attr(node, xpath, key, val=None):
143 """ Find the xpath xpath[@key=val] """
144 assert re.match(r'^[a-zA-Z_-]+$', key)
146 assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
147 expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
148 return node.find(expr)
150 def find_xpath_attr(node, xpath, key, val=None):
151 # Here comes the crazy part: In 2.6, if the xpath is a unicode,
152 # .//node does not match if a node is a direct child of . !
153 if isinstance(xpath, compat_str):
154 xpath = xpath.encode('ascii')
156 for f in node.findall(xpath):
157 if key not in f.attrib:
159 if val is None or f.attrib.get(key) == val:
163 # On python2.6 the xml.etree.ElementTree.Element methods don't support
164 # the namespace parameter
167 def xpath_with_ns(path, ns_map):
168 components = [c.split(':') for c in path.split('/')]
172 replaced.append(c[0])
175 replaced.append('{%s}%s' % (ns_map[ns], tag))
176 return '/'.join(replaced)
179 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
180 if sys.version_info < (2, 7): # Crazy 2.6
181 xpath = xpath.encode('ascii')
185 if default is not NO_DEFAULT:
188 name = xpath if name is None else name
189 raise ExtractorError('Could not find XML element %s' % name)
195 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
196 n = xpath_element(node, xpath, name, fatal=fatal, default=default)
197 if n is None or n == default:
200 if default is not NO_DEFAULT:
203 name = xpath if name is None else name
204 raise ExtractorError('Could not find XML element\'s text %s' % name)
210 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
211 n = find_xpath_attr(node, xpath, key)
213 if default is not NO_DEFAULT:
216 name = '%s[@%s]' % (xpath, key) if name is None else name
217 raise ExtractorError('Could not find XML attribute %s' % name)
def get_element_by_id(id, html):
    """Return the content of the tag whose id attribute equals *id* in *html*."""
    # An id lookup is just an attribute lookup on the "id" attribute.
    attribute = "id"
    return get_element_by_attribute(attribute, id, html)
228 def get_element_by_attribute(attribute, value, html):
229 """Return the content of the tag with the specified attribute in the passed HTML document"""
231 m = re.search(r'''(?xs)
233 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
235 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
239 ''' % (re.escape(attribute), re.escape(value)), html)
243 res = m.group('content')
245 if res.startswith('"') or res.startswith("'"):
248 return unescapeHTML(res)
def extract_attributes(attributes_str, attributes_regex=r'(?s)\s*([^\s=]+)\s*=\s*["\']([^"\']+)["\']'):
    """Parse a string of key="value" / key='value' pairs into a dict.

    Later occurrences of the same attribute name overwrite earlier ones.
    Returns an empty dict when nothing matches.
    """
    # Visible code accumulated into attributes_dict without ever initializing
    # it; dict() over the (name, value) pairs does the same job safely.
    return dict(re.findall(attributes_regex, attributes_str))
260 def clean_html(html):
261 """Clean an HTML snippet into a readable string"""
263 if html is None: # Convenience for sanitizing descriptions etc.
267 html = html.replace('\n', ' ')
268 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
269 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
271 html = re.sub('<.*?>', '', html)
272 # Replace html entities
273 html = unescapeHTML(html)
277 def sanitize_open(filename, open_mode):
278 """Try to open the given filename, and slightly tweak it if this fails.
280 Attempts to open the given filename. If this fails, it tries to change
281 the filename slightly, step by step, until it's either able to open it
282 or it fails and raises a final exception, like the standard open()
285 It returns the tuple (stream, definitive_file_name).
289 if sys.platform == 'win32':
291 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
292 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
293 stream = open(encodeFilename(filename), open_mode)
294 return (stream, filename)
295 except (IOError, OSError) as err:
296 if err.errno in (errno.EACCES,):
299 # In case of error, try to remove win32 forbidden chars
300 alt_filename = sanitize_path(filename)
301 if alt_filename == filename:
304 # An exception here should be caught in the caller
305 stream = open(encodeFilename(alt_filename), open_mode)
306 return (stream, alt_filename)
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp.

    Returns None when the string cannot be parsed.
    """
    # Visible code never initialized timestamp nor returned it.
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp
318 def sanitize_filename(s, restricted=False, is_id=False):
319 """Sanitizes a string so it could be used as part of a filename.
320 If restricted is set, use a stricter subset of allowed characters.
321 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
323 def replace_insane(char):
324 if char == '?' or ord(char) < 32 or ord(char) == 127:
327 return '' if restricted else '\''
329 return '_-' if restricted else ' -'
330 elif char in '\\/|*<>':
332 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
334 if restricted and ord(char) > 127:
339 s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
340 result = ''.join(map(replace_insane, s))
342 while '__' in result:
343 result = result.replace('__', '_')
344 result = result.strip('_')
345 # Common case of "Foreign band name - English song title"
346 if restricted and result.startswith('-_'):
348 if result.startswith('-'):
349 result = '_' + result[len('-'):]
350 result = result.lstrip('.')
356 def sanitize_path(s):
357 """Sanitizes and normalizes path on Windows"""
358 if sys.platform != 'win32':
360 drive_or_unc, _ = os.path.splitdrive(s)
361 if sys.version_info < (2, 7) and not drive_or_unc:
362 drive_or_unc, _ = os.path.splitunc(s)
363 norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
367 path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|\.$)', '#', path_part)
368 for path_part in norm_path]
370 sanitized_path.insert(0, drive_or_unc + os.path.sep)
371 return os.path.join(*sanitized_path)
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    # Preserve first-seen order, unlike set(), which is unordered.
    res = []
    for el in iterable:
        if el not in res:
            res.append(el)
    return res
383 def _htmlentity_transform(entity):
384 """Transforms an HTML entity to a character."""
385 # Known non-numeric HTML entity
386 if entity in compat_html_entities.name2codepoint:
387 return compat_chr(compat_html_entities.name2codepoint[entity])
389 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
391 numstr = mobj.group(1)
392 if numstr.startswith('x'):
394 numstr = '0%s' % numstr
397 return compat_chr(int(numstr, base))
399 # Unknown entity in name, return its literal representation
400 return ('&%s;' % entity)
406 assert type(s) == compat_str
409 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
412 def get_subprocess_encoding():
413 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
414 # For subprocess calls, encode with locale encoding
415 # Refer to http://stackoverflow.com/a/9951851/35070
416 encoding = preferredencoding()
418 encoding = sys.getfilesystemencoding()
424 def encodeFilename(s, for_subprocess=False):
426 @param s The name of the file
429 assert type(s) == compat_str
431 # Python 3 has a Unicode API
432 if sys.version_info >= (3, 0):
435 # Pass '' directly to use Unicode APIs on Windows 2000 and up
436 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
437 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
438 if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
441 return s.encode(get_subprocess_encoding(), 'ignore')
444 def decodeFilename(b, for_subprocess=False):
446 if sys.version_info >= (3, 0):
449 if not isinstance(b, bytes):
452 return b.decode(get_subprocess_encoding(), 'ignore')
def encodeArgument(s):
    """Encode an argument for passing to a subprocess."""
    if isinstance(s, compat_str):
        return encodeFilename(s, True)
    # Legacy code still hands byte strings in here; decode them first.
    # TODO: turn this into an assertion once all post processors are fixed:
    # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    return encodeFilename(s.decode('ascii'), True)
def decodeArgument(b):
    # Mirror of encodeArgument: decode a subprocess argument using the
    # subprocess encoding (for_subprocess=True).
    return decodeFilename(b, True)
468 def decodeOption(optval):
471 if isinstance(optval, bytes):
472 optval = optval.decode(preferredencoding())
474 assert isinstance(optval, compat_str)
def formatSeconds(secs):
    """Format a duration in seconds as 'H:MM:SS', 'M:SS' or plain seconds.

    Visible code returned the H:MM:SS form unconditionally, making the
    second return unreachable; restore the branch conditions.
    """
    if secs > 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs > 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
487 def make_HTTPS_handler(params, **kwargs):
488 opts_no_check_certificate = params.get('nocheckcertificate', False)
489 if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9
490 context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
491 if opts_no_check_certificate:
492 context.check_hostname = False
493 context.verify_mode = ssl.CERT_NONE
495 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
498 # (create_default_context present but HTTPSHandler has no context=)
501 if sys.version_info < (3, 2):
502 return YoutubeDLHTTPSHandler(params, **kwargs)
504 context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
505 context.verify_mode = (ssl.CERT_NONE
506 if opts_no_check_certificate
507 else ssl.CERT_REQUIRED)
508 context.set_default_verify_paths()
509 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
def bug_reports_message():
    """Return the standard bug-report footer appended to error messages."""
    # Visible code assigned update_cmd twice unconditionally (losing the
    # updateable branch) and never returned the message.
    if ytdl_is_updateable():
        update_cmd = 'type youtube-dl -U to update'
    else:
        update_cmd = 'see https://yt-dl.org/update on how to update'
    msg = '; please report this issue on https://yt-dl.org/bug .'
    msg += ' Make sure you are using the latest version; %s.' % update_cmd
    msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
    return msg
523 class ExtractorError(Exception):
524 """Error during info extraction."""
526 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
527 """ tb, if given, is the original traceback (so that it can be printed out).
528 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
531 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
533 if video_id is not None:
534 msg = video_id + ': ' + msg
536 msg += ' (caused by %r)' % cause
538 msg += bug_reports_message()
539 super(ExtractorError, self).__init__(msg)
542 self.exc_info = sys.exc_info() # preserve original exception
544 self.video_id = video_id
546 def format_traceback(self):
547 if self.traceback is None:
549 return ''.join(traceback.format_tb(self.traceback))
class UnsupportedError(ExtractorError):
    """Raised when no extractor can handle the given URL.

    Marked expected=True: this is a normal user-facing condition,
    not a bug, so no bug-report footer is appended.
    """
    def __init__(self, url):
        super(UnsupportedError, self).__init__(
            'Unsupported URL: %s' % url, expected=True)
559 class RegexNotFoundError(ExtractorError):
560 """Error when a regex didn't match"""
564 class DownloadError(Exception):
565 """Download Error exception.
567 This exception may be thrown by FileDownloader objects if they are not
568 configured to continue on errors. They will contain the appropriate
572 def __init__(self, msg, exc_info=None):
573 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
574 super(DownloadError, self).__init__(msg)
575 self.exc_info = exc_info
578 class SameFileError(Exception):
579 """Same File exception.
581 This exception will be thrown by FileDownloader objects if they detect
582 multiple files would have to be downloaded to the same file on disk.
587 class PostProcessingError(Exception):
588 """Post Processing exception.
590 This exception may be raised by PostProcessor's .run() method to
591 indicate an error in the postprocessing task.
594 def __init__(self, msg):
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    # Visible class had no body at all, which is a syntax error in Python.
    pass
603 class UnavailableVideoError(Exception):
604 """Unavailable Format exception.
606 This exception will be thrown when a video is requested
607 in a format that is not available for that video.
612 class ContentTooShortError(Exception):
613 """Content Too Short exception.
615 This exception may be raised by FileDownloader objects when a file they
616 download is too small for what the server announced first, indicating
617 the connection was probably interrupted.
620 def __init__(self, downloaded, expected):
622 self.downloaded = downloaded
623 self.expected = expected
626 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
627 # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
628 # expected HTTP responses to meet HTTP/1.0 or later (see also
629 # https://github.com/rg3/youtube-dl/issues/6727)
630 if sys.version_info < (3, 0):
631 kwargs['strict'] = True
632 hc = http_class(*args, **kwargs)
633 source_address = ydl_handler._params.get('source_address')
634 if source_address is not None:
635 sa = (source_address, 0)
636 if hasattr(hc, 'source_address'): # Python 2.7+
637 hc.source_address = sa
639 def _hc_connect(self, *args, **kwargs):
640 sock = compat_socket_create_connection(
641 (self.host, self.port), self.timeout, sa)
643 self.sock = ssl.wrap_socket(
644 sock, self.key_file, self.cert_file,
645 ssl_version=ssl.PROTOCOL_TLSv1)
648 hc.connect = functools.partial(_hc_connect, hc)
653 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
654 """Handler for HTTP requests and responses.
656 This class, when installed with an OpenerDirector, automatically adds
657 the standard headers to every HTTP request and handles gzipped and
658 deflated responses from web servers. If compression is to be avoided in
659 a particular request, the original request in the program code only has
660 to include the HTTP header "Youtubedl-No-Compression", which will be
661 removed before making the real request.
663 Part of this code was copied from:
665 http://techknack.net/python-urllib2-handlers/
667 Andrew Rowls, the author of that code, agreed to release it to the
671 def __init__(self, params, *args, **kwargs):
672 compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
673 self._params = params
675 def http_open(self, req):
676 return self.do_open(functools.partial(
677 _create_http_connection, self, compat_http_client.HTTPConnection, False),
683 return zlib.decompress(data, -zlib.MAX_WBITS)
685 return zlib.decompress(data)
688 def addinfourl_wrapper(stream, headers, url, code):
689 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
690 return compat_urllib_request.addinfourl(stream, headers, url, code)
691 ret = compat_urllib_request.addinfourl(stream, headers, url)
695 def http_request(self, req):
696 # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
697 # always respected by websites, some tend to give out URLs with non percent-encoded
698 # non-ASCII characters (see telemb.py, ard.py [#3412])
699 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
700 # To work around aforementioned issue we will replace request's original URL with
701 # percent-encoded one
702 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
703 # the code of this workaround has been moved here from YoutubeDL.urlopen()
704 url = req.get_full_url()
705 url_escaped = escape_url(url)
707 # Substitute URL if any change after escaping
708 if url != url_escaped:
709 req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
711 url_escaped, data=req.data, headers=req.headers,
712 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
713 new_req.timeout = req.timeout
716 for h, v in std_headers.items():
717 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
718 # The dict keys are capitalized because of this bug by urllib
719 if h.capitalize() not in req.headers:
721 if 'Youtubedl-no-compression' in req.headers:
722 if 'Accept-encoding' in req.headers:
723 del req.headers['Accept-encoding']
724 del req.headers['Youtubedl-no-compression']
726 if sys.version_info < (2, 7) and '#' in req.get_full_url():
727 # Python 2.6 is brain-dead when it comes to fragments
728 req._Request__original = req._Request__original.partition('#')[0]
729 req._Request__r_type = req._Request__r_type.partition('#')[0]
733 def http_response(self, req, resp):
736 if resp.headers.get('Content-encoding', '') == 'gzip':
737 content = resp.read()
738 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
740 uncompressed = io.BytesIO(gz.read())
741 except IOError as original_ioerror:
742 # There may be junk add the end of the file
743 # See http://stackoverflow.com/q/4928560/35070 for details
744 for i in range(1, 1024):
746 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
747 uncompressed = io.BytesIO(gz.read())
752 raise original_ioerror
753 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
754 resp.msg = old_resp.msg
756 if resp.headers.get('Content-encoding', '') == 'deflate':
757 gz = io.BytesIO(self.deflate(resp.read()))
758 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
759 resp.msg = old_resp.msg
760 # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
761 # https://github.com/rg3/youtube-dl/issues/6457).
762 if 300 <= resp.code < 400:
763 location = resp.headers.get('Location')
765 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
766 if sys.version_info >= (3, 0):
767 location = location.encode('iso-8859-1').decode('utf-8')
768 location_escaped = escape_url(location)
769 if location != location_escaped:
770 del resp.headers['Location']
771 resp.headers['Location'] = location_escaped
774 https_request = http_request
775 https_response = http_response
778 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        """Store the ydl params and the HTTPS connection class to use.

        https_conn_class defaults to compat_http_client.HTTPSConnection
        when not supplied.
        """
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params
784 def https_open(self, req):
786 if hasattr(self, '_context'): # python > 2.6
787 kwargs['context'] = self._context
788 if hasattr(self, '_check_hostname'): # python 3.x
789 kwargs['check_hostname'] = self._check_hostname
790 return self.do_open(functools.partial(
791 _create_http_connection, self, self._https_conn_class, True),
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor with a hook point for sanitizing Set-Cookie headers."""

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # NOTE(review): the workaround below is currently disabled (commented out).
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #         if set_cookie != set_cookie_escaped:
        #             del response.headers[set_cookie_header]
        #             response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
819 def parse_iso8601(date_str, delimiter='T', timezone=None):
820 """ Return a UNIX timestamp from the given date """
827 r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
830 timezone = datetime.timedelta()
832 date_str = date_str[:-len(m.group(0))]
833 if not m.group('sign'):
834 timezone = datetime.timedelta()
836 sign = 1 if m.group('sign') == '+' else -1
837 timezone = datetime.timedelta(
838 hours=sign * int(m.group('hours')),
839 minutes=sign * int(m.group('minutes')))
840 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
841 dt = datetime.datetime.strptime(date_str, date_format) - timezone
842 return calendar.timegm(dt.timetuple())
845 def unified_strdate(date_str, day_first=True):
846 """Return a string with the date in the format YYYYMMDD"""
852 date_str = date_str.replace(',', ' ')
853 # %z (UTC offset) is only supported in python>=3.2
854 if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
855 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
856 # Remove AM/PM + timezone
857 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
859 format_expressions = [
864 '%b %dst %Y %I:%M%p',
865 '%b %dnd %Y %I:%M%p',
866 '%b %dth %Y %I:%M%p',
872 '%Y-%m-%d %H:%M:%S.%f',
875 '%Y-%m-%dT%H:%M:%SZ',
876 '%Y-%m-%dT%H:%M:%S.%fZ',
877 '%Y-%m-%dT%H:%M:%S.%f0Z',
879 '%Y-%m-%dT%H:%M:%S.%f',
883 format_expressions.extend([
891 format_expressions.extend([
898 for expression in format_expressions:
900 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
903 if upload_date is None:
904 timetuple = email.utils.parsedate_tz(date_str)
906 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
def determine_ext(url, default_ext='unknown_video'):
    """Guess a file extension from a URL's path.

    Returns default_ext when url is None or the trailing component after
    the last '.' is not purely alphanumeric.
    """
    # Visible code crashed on url=None and fell off the end without a return.
    if url is None:
        return default_ext
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    else:
        return default_ext
def subtitles_filename(filename, sub_lang, sub_format):
    """Build the subtitle filename: media basename + '.<lang>.<format>'."""
    base = filename.rsplit('.', 1)[0]
    return '%s.%s.%s' % (base, sub_lang, sub_format)
924 def date_from_str(date_str):
926 Return a datetime object from a string in the format YYYYMMDD or
927 (now|today)[+-][0-9](day|week|month|year)(s)?"""
928 today = datetime.date.today()
929 if date_str in ('now', 'today'):
931 if date_str == 'yesterday':
932 return today - datetime.timedelta(days=1)
933 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
934 if match is not None:
935 sign = match.group('sign')
936 time = int(match.group('time'))
939 unit = match.group('unit')
940 # A bad aproximation?
948 delta = datetime.timedelta(**{unit: time})
950 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
def hyphenate_date(date_str):
    """Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format.

    Strings that do not match YYYYMMDD are returned unchanged (the visible
    code returned None for them).
    """
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is not None:
        return '-'.join(match.groups())
    return date_str
963 class DateRange(object):
964 """Represents a time interval between two dates"""
966 def __init__(self, start=None, end=None):
967 """start and end must be strings in the format accepted by date"""
968 if start is not None:
969 self.start = date_from_str(start)
971 self.start = datetime.datetime.min.date()
973 self.end = date_from_str(end)
975 self.end = datetime.datetime.max.date()
976 if self.start > self.end:
977 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
981 """Returns a range that only contains the given day"""
984 def __contains__(self, date):
985 """Check if the date is in the range"""
986 if not isinstance(date, datetime.date):
987 date = date_from_str(date)
988 return self.start <= date <= self.end
991 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
995 """ Returns the platform name as a compat_str """
996 res = platform.platform()
997 if isinstance(res, bytes):
998 res = res.decode(preferredencoding())
1000 assert isinstance(res, compat_str)
1004 def _windows_write_string(s, out):
1005 """ Returns True if the string was written using special methods,
1006 False if it has yet to be written out."""
1007 # Adapted from http://stackoverflow.com/a/3259271/35070
1010 import ctypes.wintypes
1018 fileno = out.fileno()
1019 except AttributeError:
1020 # If the output stream doesn't have a fileno, it's virtual
1022 except io.UnsupportedOperation:
1023 # Some strange Windows pseudo files?
1025 if fileno not in WIN_OUTPUT_IDS:
1028 GetStdHandle = ctypes.WINFUNCTYPE(
1029 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
1030 (b"GetStdHandle", ctypes.windll.kernel32))
1031 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
1033 WriteConsoleW = ctypes.WINFUNCTYPE(
1034 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
1035 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
1036 ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
1037 written = ctypes.wintypes.DWORD(0)
1039 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
1040 FILE_TYPE_CHAR = 0x0002
1041 FILE_TYPE_REMOTE = 0x8000
1042 GetConsoleMode = ctypes.WINFUNCTYPE(
1043 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
1044 ctypes.POINTER(ctypes.wintypes.DWORD))(
1045 (b"GetConsoleMode", ctypes.windll.kernel32))
1046 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
1048 def not_a_console(handle):
1049 if handle == INVALID_HANDLE_VALUE or handle is None:
1051 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
1052 GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
1054 if not_a_console(h):
1057 def next_nonbmp_pos(s):
1059 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
1060 except StopIteration:
1064 count = min(next_nonbmp_pos(s), 1024)
1066 ret = WriteConsoleW(
1067 h, s, count if count else 2, ctypes.byref(written), None)
1069 raise OSError('Failed to write string')
1070 if not count: # We just wrote a non-BMP character
1071 assert written.value == 2
1074 assert written.value > 0
1075 s = s[written.value:]
1079 def write_string(s, out=None, encoding=None):
1082 assert type(s) == compat_str
1084 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
1085 if _windows_write_string(s, out):
1088 if ('b' in getattr(out, 'mode', '') or
1089 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
1090 byt = s.encode(encoding or preferredencoding(), 'ignore')
1092 elif hasattr(out, 'buffer'):
1093 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1094 byt = s.encode(enc, 'ignore')
1095 out.buffer.write(byt)
def bytes_to_intlist(bs):
    """Convert a bytes (py3) or str (py2) buffer to a list of int byte values."""
    # Visible code indexed bs[0] without an empty guard and had no return
    # on the Python 3 branch.
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3: indexing bytes yields ints
        return list(bs)
    else:
        # Python 2: indexing a str yields 1-char strings
        return [ord(c) for c in bs]
def intlist_to_bytes(xs):
    """Convert a list of integer byte values (0-255) back into bytes."""
    # Empty-input guard: avoid formatting a '0B' struct format string.
    if not xs:
        return b''
    return struct_pack('%dB' % len(xs), *xs)
1116 # Cross-platform file locking
1117 if sys.platform == 'win32':
1118 import ctypes.wintypes
1121 class OVERLAPPED(ctypes.Structure):
1123 ('Internal', ctypes.wintypes.LPVOID),
1124 ('InternalHigh', ctypes.wintypes.LPVOID),
1125 ('Offset', ctypes.wintypes.DWORD),
1126 ('OffsetHigh', ctypes.wintypes.DWORD),
1127 ('hEvent', ctypes.wintypes.HANDLE),
1130 kernel32 = ctypes.windll.kernel32
1131 LockFileEx = kernel32.LockFileEx
1132 LockFileEx.argtypes = [
1133 ctypes.wintypes.HANDLE, # hFile
1134 ctypes.wintypes.DWORD, # dwFlags
1135 ctypes.wintypes.DWORD, # dwReserved
1136 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1137 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1138 ctypes.POINTER(OVERLAPPED) # Overlapped
1140 LockFileEx.restype = ctypes.wintypes.BOOL
1141 UnlockFileEx = kernel32.UnlockFileEx
1142 UnlockFileEx.argtypes = [
1143 ctypes.wintypes.HANDLE, # hFile
1144 ctypes.wintypes.DWORD, # dwReserved
1145 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1146 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1147 ctypes.POINTER(OVERLAPPED) # Overlapped
1149 UnlockFileEx.restype = ctypes.wintypes.BOOL
1150 whole_low = 0xffffffff
1151 whole_high = 0x7fffffff
1153 def _lock_file(f, exclusive):
1154 overlapped = OVERLAPPED()
1155 overlapped.Offset = 0
1156 overlapped.OffsetHigh = 0
1157 overlapped.hEvent = 0
1158 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1159 handle = msvcrt.get_osfhandle(f.fileno())
1160 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
1161 whole_low, whole_high, f._lock_file_overlapped_p):
1162 raise OSError('Locking file failed: %r' % ctypes.FormatError())
    def _unlock_file(f):
        """Release the LockFileEx lock previously taken on f (Windows)."""
        # _lock_file_overlapped_p is stashed on the file object by _lock_file.
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1174 def _lock_file(f, exclusive):
1175 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
    def _unlock_file(f):
        """Drop any advisory flock held on f."""
        fcntl.flock(f, fcntl.LOCK_UN)
1181 class locked_file(object):
1182 def __init__(self, filename, mode, encoding=None):
1183 assert mode in ['r', 'a', 'w']
1184 self.f = io.open(filename, mode, encoding=encoding)
1187 def __enter__(self):
1188 exclusive = self.mode != 'r'
1190 _lock_file(self.f, exclusive)
1196 def __exit__(self, etype, value, traceback):
1198 _unlock_file(self.f)
    def write(self, *args):
        # Delegate straight to the wrapped file object.
        return self.f.write(*args)
    def read(self, *args):
        # Delegate straight to the wrapped file object.
        return self.f.read(*args)
def get_filesystem_encoding():
    """Return the filesystem encoding, falling back to 'utf-8' when unknown."""
    enc = sys.getfilesystemencoding()
    if enc is None:
        return 'utf-8'
    return enc
1217 def shell_quote(args):
1219 encoding = get_filesystem_encoding()
1221 if isinstance(a, bytes):
1222 # We may get a filename encoded with 'encodeFilename'
1223 a = a.decode(encoding)
1224 quoted_args.append(pipes.quote(a))
1225 return ' '.join(quoted_args)
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # The payload rides in the URL fragment, which is never sent to servers.
    sdata = compat_urllib_parse.urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return '%s#%s' % (url, sdata)
def unsmuggle_url(smug_url, default=None):
    """Inverse of smuggle_url(): return (url, data), or (smug_url, default)
    when no smuggled payload is present.
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    data = json.loads(jsond)
    # Visible code computed data but never returned it.
    return url, data
def format_bytes(bytes):
    """Format a byte count as a human-readable string, e.g. 1536 -> '1.50KiB'.

    Accepts ints, floats or numeric strings; returns 'N/A' for None.
    """
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    # Fix: guard against math.log(0), which raises ValueError.
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
    suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
def parse_filesize(s):
    """Parse a human-readable file size (e.g. '5 MiB') into a byte count."""
    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    # NOTE(review): _UNIT_TABLE (the unit-name -> multiplier mapping), the
    # `m = re.match(` header and the no-match guard are not visible in this
    # chunk; the remaining lines are kept verbatim.
    units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
    r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
    # European-style decimal commas are normalised to dots before parsing.
    num_str = m.group('num').replace(',', '.')
    mult = _UNIT_TABLE[m.group('unit')]
    return int(float(num_str) * mult)
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """
    # Fix: list.index raises ValueError for unknown names; return None
    # instead so callers can treat "unrecognised" as a soft failure.
    try:
        return ENGLISH_MONTH_NAMES.index(name) + 1
    except ValueError:
        return None
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
    abbreviation (e.g. 'Jan' -> 1), or None when unrecognised. """
    try:
        return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
    except ValueError:
        return None
def fix_xml_ampersands(xml_str):
    """Replace all stray '&' by '&amp;' in XML"""
    # NOTE(review): the `return re.sub(` header and the replacement/target
    # arguments are not visible in this chunk. The pattern below matches
    # an '&' only when it is NOT already the start of a character entity.
    r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
def setproctitle(title):
    """Set the process name shown by ps/top, via glibc prctl()."""
    assert isinstance(title, compat_str)
    # NOTE(review): the try/except framing around LoadLibrary and around the
    # prctl call is not fully visible in this chunk.
    libc = ctypes.cdll.LoadLibrary("libc.so.6")
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
    # 15 == PR_SET_NAME (see prctl(2)); sets the calling thread's comm name.
    libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
def remove_start(s, start):
    """Return s with the prefix `start` removed when present, else s unchanged."""
    if s.startswith(start):
        return s[len(start):]
    # Fix: the fall-through previously returned None for non-matching input.
    return s
def remove_end(s, end):
    """Return s with the suffix `end` removed when present, else s unchanged."""
    # Fix: the strip was unconditional, and an empty suffix produced
    # s[:-0] == '' which truncated the whole string.
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
def url_basename(url):
    """Return the last path component of a URL, e.g. '.../a/b.mp4' -> 'b.mp4'."""
    parsed_path = compat_urlparse.urlparse(url).path
    components = parsed_path.strip('/').split('/')
    return components[-1]
class HEADRequest(compat_urllib_request.Request):
    # Request subclass used to issue HTTP HEAD requests.
    def get_method(self):
        # NOTE(review): the method body is not visible in this chunk;
        # presumably it returns 'HEAD' — confirm against the full source.
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to an int, or return default when v is None.

    When get_attr is given, the named attribute of v is converted instead.
    The result is multiplied by invscale and floor-divided by scale.
    """
    # Fix: getattr was called unconditionally; with the default
    # get_attr=None that raises TypeError (attribute name must be string).
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    return default if v is None else (int(v) * invscale // scale)
def str_or_none(v, default=None):
    """Stringify v, or return default when v is None."""
    if v is None:
        return default
    return compat_str(v)
def str_to_int(int_str):
    """ A more relaxed version of int_or_none: strips ',', '.' and '+'
    separators before converting; returns None for None input. """
    if int_str is None:
        return None
    int_str = re.sub(r'[,\.\+]', '', int_str)
    # Fix: the final conversion/return was missing.
    return int(int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to a float scaled by invscale/scale, or default when v is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
def parse_duration(s):
    """Parse a textual duration ('12:34', '1h 20m', '90 seconds', ...) into seconds."""
    if not isinstance(s, compat_basestring):
    # NOTE(review): several lines (the early return, the verbose-regex
    # `m = re.match(r'''(?x)...` header, the alternation framing, the
    # `res = 0` accumulator and the final `return res`) are not visible in
    # this chunk; the fragment below is kept verbatim.
    (?P<only_mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*|
    (?P<only_hours>[0-9.]+)\s*(?:hours?)|
    \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?\.?|minutes?)\s*|
    (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
    (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
    (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
    # Single-unit forms ('90 mins', '1.5 hours') convert directly to seconds.
    if m.group('only_mins'):
        return float_or_none(m.group('only_mins'), invscale=60)
    if m.group('only_hours'):
        return float_or_none(m.group('only_hours'), invscale=60 * 60)
    # Composite forms accumulate into `res`, one unit at a time.
    res += int(m.group('secs'))
    if m.group('mins_reversed'):
        res += int(m.group('mins_reversed')) * 60
    res += int(m.group('mins')) * 60
    if m.group('hours'):
        res += int(m.group('hours')) * 60 * 60
    if m.group('hours_reversed'):
        res += int(m.group('hours_reversed')) * 60 * 60
    res += int(m.group('days')) * 24 * 60 * 60
    res += float(m.group('ms'))
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert `ext` before the file's extension: 'a.mp4' -> 'a.<ext>.mp4'.

    When expected_real_ext is given and does not match the file's actual
    extension, `ext` is appended after the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    # Fix: the `return (` header of this conditional expression was missing,
    # leaving a dangling expression with no return.
    return (
        '{0}.{1}{2}'.format(name, ext, real_ext)
        if not expected_real_ext or real_ext[1:] == expected_real_ext
        else '{0}.{1}'.format(filename, ext))
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the file's extension with `ext`: 'a.mp4' -> 'a.<ext>'.

    When expected_real_ext is given and does not match the file's actual
    extension, `ext` is appended after the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    # Fix: the format call was truncated — the trailing `ext)` argument
    # was missing.
    return '{0}.{1}'.format(
        name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
        ext)
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    # Fix: the try/except and the success return were missing, so a missing
    # binary raised OSError and a present one returned None.
    try:
        subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        return False
    return exe
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    # Fix: the Popen call was not guarded, so a missing executable raised
    # OSError instead of returning False.
    try:
        out, _ = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
    except OSError:
        return False
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from an executable's --version output.

    Returns the first group of version_re (default: 'version X.Y...'),
    or `unrecognized` when no version can be found.
    """
    assert isinstance(output, compat_str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
    # Fix: the result handling was missing — the function returned None
    # regardless of the match.
    if m:
        return m.group(1)
    else:
        return unrecognized
class PagedList(object):
    # Abstract base for paginated result lists; subclasses implement
    # getslice(start, end).
    # NOTE(review): the `def __len__(self):` header (and a `getslice` stub)
    # are not visible in this chunk.
    # This is only useful for tests
    return len(self.getslice())
class OnDemandPagedList(PagedList):
    """Paged list that fetches pages lazily from `pagefunc` as slices are read."""

    def __init__(self, pagefunc, pagesize):
        # pagefunc: callable mapping a page number to an iterable of entries.
        self._pagefunc = pagefunc
        # pagesize: number of entries per page.
        self._pagesize = pagesize

    # NOTE(review): several lines of getslice (the `res = []` accumulator,
    # `break` statements, the `startv = (` / `endv = (` expression headers
    # and the final `return res`) are not visible in this chunk; the
    # remaining lines are kept verbatim.
    def getslice(self, start=0, end=None):
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
            page_results = list(self._pagefunc(pagenum))
            start % self._pagesize
            if firstid <= start < nextfirstid
            ((end - 1) % self._pagesize) + 1
            if (end is not None and firstid <= end <= nextfirstid)
            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
class InAdvancePagedList(PagedList):
    """Paged list where the total page count is known up front."""

    def __init__(self, pagefunc, pagecount, pagesize):
        # pagefunc: callable mapping a page number to an iterable of entries.
        self._pagefunc = pagefunc
        # pagecount: total number of available pages.
        self._pagecount = pagecount
        # pagesize: number of entries per page.
        self._pagesize = pagesize

    # NOTE(review): several lines of getslice (the `res = []` accumulator,
    # the `end_page = min(` header, the per-page append/extend and the final
    # `return res`) are not visible in this chunk; the remaining lines are
    # kept verbatim.
    def getslice(self, start=0, end=None):
        start_page = start // self._pagesize
        self._pagecount if end is None else (end // self._pagesize + 1))
        # Entries of the first page that fall before `start` are skipped.
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            page = page[skip_elems:]
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                page = page[:only_more]
def uppercase_escape(s):
    """Decode \\UXXXXXXXX escape sequences in s; other text is left untouched."""
    unicode_escape = codecs.getdecoder('unicode_escape')
    # Fix: the `return re.sub(` header and the trailing `s)` argument
    # were missing.
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: unicode_escape(m.group(0))[0],
        s)
def lowercase_escape(s):
    """Decode \\uXXXX escape sequences in s; other text is left untouched."""
    unicode_escape = codecs.getdecoder('unicode_escape')
    # Fix: the `return re.sub(` header and the trailing `s)` argument
    # were missing.
    return re.sub(
        r'\\u[0-9a-fA-F]{4}',
        lambda m: unicode_escape(m.group(0))[0],
        s)
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # Python 2's urllib cannot percent-quote unicode directly, so feed it
    # UTF-8 bytes there.
    if sys.version_info < (3, 0) and isinstance(s, compat_str):
        s = s.encode('utf-8')
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe_chars)
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    url_parsed = compat_urllib_parse_urlparse(url)
    # Fix: the trailing `).geturl()` was missing, so the function returned
    # None instead of reassembling the escaped URL.
    return url_parsed._replace(
        path=escape_rfc3986(url_parsed.path),
        params=escape_rfc3986(url_parsed.params),
        query=escape_rfc3986(url_parsed.query),
        fragment=escape_rfc3986(url_parsed.fragment)
    ).geturl()
# Feature probe: on Python 2.6 (and some 2.7 builds) struct.pack rejects
# unicode format strings, so wrappers that encode the spec to ASCII are
# installed; otherwise the stdlib functions are used directly.
# NOTE(review): the `try:` / `except TypeError:` / `else:` framing of this
# probe is not visible in this chunk.
struct.pack('!I', 0)
# In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
def struct_pack(spec, *args):
    if isinstance(spec, compat_str):
        spec = spec.encode('ascii')
    return struct.pack(spec, *args)

def struct_unpack(spec, *args):
    if isinstance(spec, compat_str):
        spec = spec.encode('ascii')
    return struct.unpack(spec, *args)

struct_pack = struct.pack
struct_unpack = struct.unpack
def read_batch_urls(batch_fd):
    """Read URLs from an open batch file, skipping comments and blank lines.

    The file object is closed afterwards. Lines starting with '#', ';' or
    ']' are treated as comments; a leading UTF-8 BOM is stripped.
    """
    # Fix: the inner helper's `def fixup(url):` header and its return
    # statements were missing, leaving orphaned statements.
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = '\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
def encode_dict(d, encoding='utf-8'):
    """Return a copy of d with every key and value encoded to bytes."""
    return {k.encode(encoding): v.encode(encoding) for k, v in d.items()}
# Feature-detect Element.iter (added in Python 2.7); older interpreters
# fall back to findall('.//*'), which walks all descendants.
# NOTE(review): the opening `try:` of this probe is not visible in this chunk.
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    etree_iter = lambda n: n.findall('.//*')
# NOTE(review): the lines below appear to be the body of a parse_xml(s)
# helper whose `def` line is not visible in this chunk.
class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
    def doctype(self, name, pubid, system):
        pass  # Ignore doctypes

parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
# The custom-parser keyword only exists from Python 2.7 on.
kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
# Fix up XML parser in Python 2.x
if sys.version_info < (3, 0):
    for n in etree_iter(tree):
        if n.text is not None:
            if not isinstance(n.text, compat_str):
                n.text = n.text.decode('utf-8')
def parse_age_limit(s):
    """Parse an age-limit string like '18' or '18+' into an int.

    Returns None for None input; falls back to the US_RATINGS table for
    non-numeric strings (e.g. MPAA ratings).
    """
    # Fix: the None guard was missing, so re.match raised TypeError for
    # None input.
    if s is None:
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    return int(m.group('age')) if m else US_RATINGS.get(s, None)
def strip_jsonp(code):
    """Strip a JSONP callback wrapper, returning the bare JSON payload."""
    # Fix: the `return re.sub(` header was missing, leaving a dangling
    # expression fragment.
    return re.sub(
        r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
def js_to_json(code):
    """Convert a JavaScript object literal into (mostly) valid JSON."""
    # NOTE(review): the inner `def fix_kv(m):` helper header, parts of its
    # body (the `v = m.group(0)` binding, the escape-replacement mapping and
    # its return statements) and the final `return res` are not visible in
    # this chunk; the remaining lines are kept verbatim.
    # JS keywords that are already valid JSON pass through unchanged.
    if v in ('true', 'false', 'null'):
    if v.startswith('"'):
    if v.startswith("'"):
        # Re-escape single-quoted string contents for double quotes.
        v = re.sub(r"\\\\|\\'|\"", lambda m: {

    # Match string literals and bare identifiers; fix_kv normalises each.
    res = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
        [a-zA-Z_][.a-zA-Z_0-9]*
    # Drop trailing commas before closing brackets/braces.
    res = re.sub(r',(\s*[\]}])', lambda m: m.group(1), res)
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    # Fix: the inner function header and the `return q` were missing.
    def q(qid):
        try:
            # Higher index in quality_ids == better quality.
            return quality_ids.index(qid)
        except ValueError:
            # Unknown qualities sort below every known one.
            return -1
    return q
# Default output filename template: "<title>-<video id>.<extension>".
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    # Fix: the None guard, the ELLIPSES constant and the length check were
    # missing, leaving an unconditional (and undefined) truncation.
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) > length:
        return s[:length - len(ELLIPSES)] + ELLIPSES
    return s
def version_tuple(v):
    """Split a version string like '2015.01.23-1' into a tuple of ints."""
    return tuple(map(int, re.split(r'[-.]', v)))
def is_outdated_version(version, limit, assume_new=True):
    """Return True when `version` is older than `limit`.

    Unparsable or missing versions are treated as new (or old, when
    assume_new is False).
    """
    # Fix: the guard condition and the try/except framing were missing,
    # leaving three bare return statements.
    if not version:
        return not assume_new
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return not assume_new
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # Updateable when running from a zip bundle or a frozen executable.
    running_from_zip = isinstance(globals().get('__loader__'), zipimporter)
    return running_from_zip or hasattr(sys, 'frozen')
def args_to_str(args):
    # Get a short string representation for a subprocess command
    quoted = [shlex_quote(a) for a in args]
    return ' '.join(quoted)
def mimetype2ext(mt):
    """Guess a file extension from a MIME type, e.g. 'video/mp4' -> 'mp4'.

    Unmapped subtypes are returned unchanged.
    """
    _, _, res = mt.rpartition('/')
    # Fix: the mapping literal was truncated to a dangling entry; restore
    # the lookup around the one entry visible here.
    return {
        'x-mp4-fragmented': 'mp4',
    }.get(res, res)
def urlhandle_detect_ext(url_handle):
    """Guess a file extension for an HTTP response object."""
    # NOTE(review): the `try:` framing around the header accessor and the
    # guards (`if cd:`, `if m:`, `if e:` / `return e`) are not visible in
    # this chunk; the remaining lines are kept verbatim.
    getheader = lambda h: url_handle.headers[h]
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader

    # Prefer the filename advertised in Content-Disposition, if any.
    cd = getheader('Content-Disposition')
    m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
    e = determine_ext(m.group('filename'), default_ext=None)
    # Fall back to mapping the Content-Type MIME type.
    return mimetype2ext(getheader('Content-Type'))
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    if age_limit is None:  # No limit set
        # Fix: this branch had no body — without a limit nothing is blocked.
        return False
    if content_limit is None:
        return False  # Content available for everyone
    return age_limit < content_limit
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    # Fix: the BOMS list header/footer and the for/else framing were
    # missing. Longer BOMs come first so UTF-32 is not mistaken for UTF-16.
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            s = first_bytes[len(bom):].decode(enc, 'replace')
            break
    else:
        # No BOM: assume UTF-8, replacing undecodable bytes.
        s = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', s)
def determine_protocol(info_dict):
    """Determine the download protocol for a format dict."""
    # An explicit protocol wins over URL-based guessing.
    protocol = info_dict.get('protocol')
    if protocol is not None:

    # NOTE(review): the return statements for the explicit-protocol branch,
    # the scheme-prefix branches and the extension checks (f4m/m3u8) are
    # not visible in this chunk; the remaining lines are kept verbatim.
    url = info_dict['url']
    if url.startswith('rtmp'):
    elif url.startswith('mms'):
    elif url.startswith('rtsp'):

    ext = determine_ext(url)
    # Fallback: use the URL scheme itself (http, https, ftp, ...).
    return compat_urllib_parse_urlparse(url).scheme
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    rows = [header_row] + data
    # Width of each column = its longest cell (computed over all rows).
    widths = [max(len(compat_str(cell)) for cell in column)
              for column in zip(*rows)]
    # Left-align every column but the last; one space of padding between.
    fmt = ' '.join('%-' + compat_str(w + 1) + 's' for w in widths[:-1]) + '%s'
    return '\n'.join(fmt % tuple(row) for row in rows)
def _match_one(filter_part, dct):
    """Evaluate a single filter expression (e.g. 'height>=720') against dct."""
    # NOTE(review): many lines of this function are not visible in this
    # chunk (the comparison-operator table entries, parts of the verbose
    # regexes, the `if m:` guards, the try/except around int() and two
    # `raise ValueError(` headers); remaining lines are kept verbatim.
    COMPARISON_OPERATORS = {
    # Binary form: <key> <op> [?] <int-with-optional-unit | bare-string>
    operator_rex = re.compile(r'''(?x)\s*
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
        (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    op = COMPARISON_OPERATORS[m.group('op')]
    if m.group('strval') is not None:
        # Only equality/inequality make sense for strings.
        if m.group('op') not in ('=', '!='):
            'Operator %s does not support string values!' % m.group('op'))
        comparison_value = m.group('strval')
    comparison_value = int(m.group('intval'))
    # Non-plain integers are retried as file sizes ('500KiB', then '500KB').
    comparison_value = parse_filesize(m.group('intval'))
    if comparison_value is None:
        comparison_value = parse_filesize(m.group('intval') + 'B')
    if comparison_value is None:
        'Invalid integer value %r in filter part %r' % (
            m.group('intval'), filter_part))
    actual_value = dct.get(m.group('key'))
    if actual_value is None:
        # The trailing '?' makes a missing key pass the filter.
        return m.group('none_inclusive')
    return op(actual_value, comparison_value)

    # Unary form: '<key>' (present) or '!<key>' (absent).
    '': lambda v: v is not None,
    '!': lambda v: v is None,
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    op = UNARY_OPERATORS[m.group('op')]
    actual_value = dct.get(m.group('key'))
    return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
    # Fix: the `return all(` header was missing, so the function always
    # returned None. '&'-separated parts must all match.
    return all(
        _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
def match_filter_func(filter_str):
    """Build a --match-filter callback: returns None to accept a video,
    or a human-readable skip message to reject it."""
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            # Fix: the accept branch had no `return None` and the reject
            # branch lost its `else:` framing.
            return None
        else:
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    # Fix: the closure was never returned.
    return _match_func
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP/TTML time expression ('1.5s' or 'HH:MM:SS.mmm') into seconds.

    Returns None for empty/unparsable input.
    """
    # Fix: the empty-input guard and the `if mobj:` checks were missing,
    # so group() was called on a possibly-None match.
    if not time_expr:
        return

    mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if mobj:
        return float(mobj.group('time_offset'))

    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:\.\d+)?)$', time_expr)
    if mobj:
        return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3))
def srt_subtitles_timecode(seconds):
    """Format a number of seconds as an SRT timecode: HH:MM:SS,mmm."""
    hrs = seconds / 3600
    mins = (seconds % 3600) / 60
    secs = seconds % 60
    msecs = (seconds % 1) * 1000
    return '%02d:%02d:%02d,%03d' % (hrs, mins, secs, msecs)
def dfxp2srt(dfxp_data):
    """Convert DFXP/TTML subtitle markup into SRT format."""
    # Helper that qualifies tag names with the TTML namespaces.
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',

    # NOTE(review): several lines of this function (parse_node's guards,
    # its child loop header and `return out`, the `out = []` accumulator,
    # the `if not paras:` check, index formatting and the final join) are
    # not visible in this chunk; remaining lines are kept verbatim.
    def parse_node(node):
        str_or_empty = functools.partial(str_or_none, default='')

        out = str_or_empty(node.text)

        # <br> becomes a newline; <span> contents are flattened recursively;
        # anything else is serialised as raw XML.
        if child.tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
            out += '\n' + str_or_empty(child.tail)
        elif child.tag in (_x('ttml:span'), _x('ttaf1:span'), 'span'):
            out += str_or_empty(parse_node(child))
            out += str_or_empty(xml.etree.ElementTree.tostring(child))

    dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8'))

    # Paragraphs may live under either TTML namespace or none at all.
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p')
    raise ValueError('Invalid dfxp/TTML subtitle')

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib['begin'])
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        end_time = begin_time + parse_dfxp_time_expr(para.attrib['dur'])
        out.append('%d\n%s --> %s\n%s\n\n' % (
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
def cli_option(params, command_option, param):
    """Build ['<option>', <value>] for an external tool, or [] when unset."""
    value = params.get(param)
    if value is None:
        return []
    return [command_option, value]
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Build a boolean option for an external tool.

    Emits ['opt', 'true'/'false'], or ['opt=true'/'opt=false'] when a
    separator is given. The parameter must be present and boolean.
    """
    param = params.get(param)
    assert isinstance(param, bool)
    # Fix: the `if separator:` guard was missing, so the two return
    # statements were unconditional (the second unreachable).
    if separator:
        return [command_option + separator + (true_value if param else false_value)]
    return [command_option, true_value if param else false_value]
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Emit [command_option] when params[param] equals expected_value, else []."""
    matches = params.get(param) == expected_value
    return [command_option] if matches else []
def cli_configuration_args(params, param, default=[]):
    """Return the extra-args list stored under `param`, or `default` when unset."""
    ex_args = params.get(param)
    # Fix: the None check and both returns were missing.
    if ex_args is None:
        return default
    assert isinstance(ex_args, list)
    return ex_args
class ISO639Utils(object):
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    # NOTE(review): the _lang_map table (ISO 639-1 -> 639-2/T) and the
    # @classmethod decorators on the two methods below are not visible in
    # this chunk — confirm against the full source.
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        # Only the first two characters are looked up, tolerating
        # locale-style inputs such as 'en-US'.
        return cls._lang_map.get(code[:2])

    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        # Reverse lookup over the map; returns the matching short code.
        for short_name, long_name in cls._lang_map.items():
            if long_name == code:
class ISO3166Utils(object):
    # From http://data.okfn.org/data/core/country-list
    # NOTE(review): the `_country_map = {` opening, its closing brace, most
    # entries of the table and the @classmethod decorator on short2full are
    # not visible in this chunk; the visible entries are kept verbatim.
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AS': 'American Samoa',
        'AG': 'Antigua and Barbuda',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BV': 'Bouvet Island',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BF': 'Burkina Faso',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CI': 'Côte d\'Ivoire',
        'CZ': 'Czech Republic',
        'DO': 'Dominican Republic',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GW': 'Guinea-Bissau',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'IR': 'Iran, Islamic Republic of',
        'IM': 'Isle of Man',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'LA': 'Lao People\'s Democratic Republic',
        'LI': 'Liechtenstein',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MH': 'Marshall Islands',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'PS': 'Palestine, State of',
        'PG': 'Papua New Guinea',
        'PH': 'Philippines',
        'PR': 'Puerto Rico',
        'RU': 'Russian Federation',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SL': 'Sierra Leone',
        'SX': 'Sint Maarten (Dutch part)',
        'SB': 'Solomon Islands',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'SJ': 'Svalbard and Jan Mayen',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TZ': 'Tanzania, United Republic of',
        'TL': 'Timor-Leste',
        'TT': 'Trinidad and Tobago',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',

    def short2full(cls, code):
        """Convert an ISO 3166-2 country code to the corresponding full name"""
        # Upper-cased before lookup so lowercase input also resolves.
        return cls._country_map.get(code.upper())
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler that lets each request override the default proxy via
    the internal 'Ytdl-request-proxy' header."""

    def __init__(self, proxies=None):
        # Set default handlers
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            # Fix: the per-request override was read and the header deleted,
            # but `proxy` was never reassigned — the override had no effect.
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)