2 # -*- coding: utf-8 -*-
4 from __future__ import unicode_literals
32 import xml.etree.ElementTree
42 compat_socket_create_connection,
46 compat_urllib_parse_urlparse,
47 compat_urllib_request,
# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))
# NOTE(review): the std_headers dict opener is not visible in this view;
# these default HTTP request headers are reproduced unchanged.
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
# English month names in calendar order; used for locale-independent date
# parsing (see month_by_name / month_by_abbreviation below).
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    # NOTE(review): the try/except fallback and return appear elided here.
    pref = locale.getpreferredencoding()
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """
    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        path_basename = lambda f: os.path.basename(fn).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
        # NOTE(review): an `else:` branch appears elided here — on Python 3
        # (or win32) the plain os.path helpers are used directly.
        path_basename = os.path.basename
        path_dirname = os.path.dirname
        # Temp file is created next to the target so the final os.rename
        # below stays on one filesystem (atomic where the OS supports it).
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
    if sys.platform == 'win32':
        # Need to remove existing file on Windows, else os.rename raises
        # WindowsError or FileExistsError.
        os.rename(tf.name, fn)
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] """
        # Sanity-check key/val so the built XPath expression is well formed.
        assert re.match(r'^[a-zA-Z_-]+$', key)
            assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
        expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
        return node.find(expr)
    # NOTE(review): the Python 2.6 fallback below normally sits under an
    # `else:` header that is not visible in this view.
    def find_xpath_attr(node, xpath, key, val=None):
        # Here comes the crazy part: In 2.6, if the xpath is a unicode,
        # .//node does not match if a node is a direct child of . !
        if isinstance(xpath, compat_str):
            xpath = xpath.encode('ascii')
        # Scan matching nodes by hand and compare the attribute value
        # (val=None matches any node that merely has the attribute).
        for f in node.findall(xpath):
            if key not in f.attrib:
            if val is None or f.attrib.get(key) == val:
# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter
def xpath_with_ns(path, ns_map):
    # Split each path component into (namespace-prefix, tag) pairs and
    # expand prefixes via ns_map into ElementTree's {uri}tag form.
    components = [c.split(':') for c in path.split('/')]
                replaced.append(c[0])
                replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    # Find a sub-element; on a miss return `default` if given, raise if
    # fatal, otherwise fall through (remaining branches elided in this view).
    if sys.version_info < (2, 7):  # Crazy 2.6
        xpath = xpath.encode('ascii')
        if default is not NO_DEFAULT:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)


def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    # Like xpath_element but returns the element's text content.
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        if default is not NO_DEFAULT:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)


def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    # Like xpath_element but returns the value of attribute `key`.
    n = find_xpath_attr(node, xpath, key)
        if default is not NO_DEFAULT:
            name = '%s[@%s]' % (xpath, key) if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # An id lookup is just an attribute lookup with attribute name "id".
    return get_element_by_attribute("id", id, html)


def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    # Verbose regex: opening tag with any attributes, the wanted
    # attribute=value pair, the content, then the matching close tag.
    m = re.search(r'''(?xs)
        (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
        (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
    ''' % (re.escape(attribute), re.escape(value)), html)
    res = m.group('content')
    # Strip surrounding quotes when the captured content is itself quoted.
    if res.startswith('"') or res.startswith("'"):
        return unescapeHTML(res)
def extract_attributes(attributes_str, attributes_regex=r'(?s)\s*([^\s=]+)\s*=\s*["\']([^"\']+)["\']'):
    """Parse an HTML-ish attribute string into a {name: value} dict.

    Duplicate attribute names keep the last occurrence (same as the
    previous dict-comprehension); unmatched input yields an empty dict.
    """
    # re.findall returns (name, value) tuples straight from the two
    # capture groups, which dict() consumes directly.
    return dict(re.findall(attributes_regex, attributes_str))
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    if html is None:  # Convenience for sanitizing descriptions etc.
    # Normalize raw newlines, turn <br> and paragraph boundaries into
    # newlines, then strip every remaining tag.
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()

    It returns the tuple (stream, definitive_file_name).
    """
            # '-' means stdout; on Windows switch it to binary mode first.
            if sys.platform == 'win32':
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # Permission errors are not fixable by renaming; re-raise path
        # appears elided in this view.
        if err.errno in (errno.EACCES,):
        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp.

    Returns the POSIX timestamp as an int/float, or None when the
    string cannot be parsed.
    """
    # Defect fixed: the computed timestamp was never returned (the
    # function fell off the end and always yielded None).
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        # Map each character to a safe replacement; several branches are
        # elided in this view (return statements for '"' / ':' cases etc.).
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return '' if restricted else '\''
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
        if restricted and ord(char) > 127:
    # Keep timestamps like 12:34:56 readable by using underscores.
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(map(replace_insane, s))
        # Collapse runs of underscores and trim leading/trailing ones.
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')


def sanitize_path(s):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform != 'win32':
    drive_or_unc, _ = os.path.splitdrive(s)
    if sys.version_info < (2, 7) and not drive_or_unc:
        drive_or_unc, _ = os.path.splitunc(s)
    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
        # Replace characters Windows forbids in path components, keeping
        # '.' and '..' untouched.
        path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|\.$)', '#', path_part)
        for path_part in norm_path]
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    return os.path.join(*sanitized_path)
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable, preserving the
    order of first occurrence. Returns a list. """
    # Defect fixed: the function body was missing entirely (implicitly
    # returned None). Linear membership test keeps first-seen order.
    res = []
    for el in iterable:
        if el not in res:
            res.append(el)
    return res
def _htmlentity_transform(entity):
    """Transforms an HTML entity to a character."""
    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])
    # Numeric character reference: decimal (#160) or hexadecimal (#xA0).
    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            # Prefix with '0' so int(..., 16) accepts the '0x..' form.
            numstr = '0%s' % numstr
        # NOTE(review): the base-10/base-16 selection lines are elided here.
        return compat_chr(int(numstr, base))
    # Unknown entity in name, return its literal representation
    return ('&%s;' % entity)
# NOTE(review): the unescapeHTML() def/header lines are not visible in this
# view; the two lines below belong to its body.
    assert type(s) == compat_str
        r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
def get_subprocess_encoding():
    # Pick the byte encoding used when talking to subprocesses.
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
        encoding = sys.getfilesystemencoding()


def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    """
    assert type(s) == compat_str
    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
    # Pass '' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
    return s.encode(get_subprocess_encoding(), 'ignore')


def decodeFilename(b, for_subprocess=False):
    # Inverse of encodeFilename: pass-through on Python 3 / non-bytes input.
    if sys.version_info >= (3, 0):
    if not isinstance(b, bytes):
    return b.decode(get_subprocess_encoding(), 'ignore')
def encodeArgument(s):
    """Encode a single command-line argument for a subprocess call."""
    # Legacy code may still hand us byte strings; decode to text first.
    # (The original TODO: assert compat_str once all post processors are
    # fixed — left as a comment, matching upstream.)
    arg = s if isinstance(s, compat_str) else s.decode('ascii')
    return encodeFilename(arg, True)
def decodeArgument(b):
    """Decode a subprocess argument back to text (counterpart of encodeArgument)."""
    return decodeFilename(b, True)
def decodeOption(optval):
    """Decode a command-line option value to text (compat_str).

    None is passed through unchanged; bytes are decoded with the
    locale's preferred encoding.
    """
    # Defect fixed: the function fell off the end without returning, so
    # every decoded option evaluated to None.
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval
def formatSeconds(secs):
    """Format a duration in seconds as 'H:MM:SS', 'M:SS' or plain seconds.

    Defect fixed: the two return statements sat at the same level, making
    the second one unreachable; the hour/minute branching was missing.
    """
    if secs > 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs > 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
def make_HTTPS_handler(params, **kwargs):
    # Build an HTTPS handler honouring the 'nocheckcertificate' option,
    # degrading gracefully across Python/ssl versions.
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
            # (create_default_context present but HTTPSHandler has no context=)
    if sys.version_info < (3, 2):
        # Old Pythons: no SSLContext support in HTTPSHandler at all.
        return YoutubeDLHTTPSHandler(params, **kwargs)
        context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)


def bug_reports_message():
    # Standard suffix appended to unexpected-error messages.
    if ytdl_is_updateable():
        update_cmd = 'type youtube-dl -U to update'
        update_cmd = 'see https://yt-dl.org/update on how to update'
    msg = '; please report this issue on https://yt-dl.org/bug .'
    msg += ' Make sure you are using the latest version; %s.' % update_cmd
    msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """
        # Network-level failures are never youtube-dl bugs.
        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
        if video_id is not None:
            msg = video_id + ': ' + msg
            msg += ' (caused by %r)' % cause
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)
        self.exc_info = sys.exc_info()  # preserve original exception
        self.video_id = video_id

    def format_traceback(self):
        # Render the stored traceback, if any, as a printable string.
        if self.traceback is None:
        return ''.join(traceback.format_tb(self.traceback))


class UnsupportedError(ExtractorError):
    """Raised for URLs no extractor claims (always an expected error)."""
    def __init__(self, url):
        super(UnsupportedError, self).__init__(
            'Unsupported URL: %s' % url, expected=True)


class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""


class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info


class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """


class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        """Store the post-processing error message."""


class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """


class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """


class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Byte counts: what actually arrived vs. the announced length.
        self.downloaded = downloaded
        self.expected = expected
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/rg3/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs['strict'] = True
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')
    if source_address is not None:
        # Bind outgoing connections to the user-requested local address.
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        # NOTE(review): the pre-2.7 fallback below monkey-patches connect()
        # by hand since source_address is not supported natively.
            def _hc_connect(self, *args, **kwargs):
                sock = compat_socket_create_connection(
                    (self.host, self.port), self.timeout, sa)
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
            hc.connect = functools.partial(_hc_connect, hc)
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        return self.do_open(functools.partial(
            _create_http_connection, self, compat_http_client.HTTPConnection, False),
    # NOTE(review): the deflate() helper's def/try lines are not visible in
    # this view; raw-deflate is attempted first, then zlib-wrapped data.
            return zlib.decompress(data, -zlib.MAX_WBITS)
            return zlib.decompress(data)

    def addinfourl_wrapper(stream, headers, url, code):
        # Compatibility shim: older addinfourl has no getcode/code support.
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
                url_escaped, data=req.data, headers=req.headers,
                origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
            new_req.timeout = req.timeout

        # Add the default headers unless the caller already set them.
        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

    def http_response(self, req, resp):
        # Transparently decompress gzip responses.
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk add the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                        raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # Transparently decompress deflate responses.
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/rg3/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped

    https_request = http_request
    https_response = http_response
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler that supports a custom connection class and SSL context."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        # Forward context/check_hostname only when the base handler has them.
        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname
        return self.do_open(functools.partial(
            _create_http_connection, self, self._https_conn_class, True),


class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor; see the commented-out workaround below."""

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #         if set_cookie != set_cookie_escaped:
        #             del response.headers[set_cookie_header]
        #             response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """
    # Detect a trailing 'Z' or +HH:MM / -HHMM offset when no timezone was
    # supplied; surrounding branch headers are elided in this view.
            r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
            timezone = datetime.timedelta()
            date_str = date_str[:-len(m.group(0))]
            if not m.group('sign'):
                timezone = datetime.timedelta()
                sign = 1 if m.group('sign') == '+' else -1
                timezone = datetime.timedelta(
                    hours=sign * int(m.group('hours')),
                    minutes=sign * int(m.group('minutes')))
    date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    dt = datetime.datetime.strptime(date_str, date_format) - timezone
    return calendar.timegm(dt.timetuple())


def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
        date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Candidate strptime formats tried in order (list partially elided).
    format_expressions = [
        '%b %dst %Y %I:%M%p',
        '%b %dnd %Y %I:%M%p',
        '%b %dth %Y %I:%M%p',
        '%Y-%m-%d %H:%M:%S.%f',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S.%f',
        # Day-first vs month-first extensions depend on `day_first`.
        format_expressions.extend([
        format_expressions.extend([
    for expression in format_expressions:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
    if upload_date is None:
        # Last resort: RFC 2822 style dates.
        timetuple = email.utils.parsedate_tz(date_str)
            upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
def determine_ext(url, default_ext='unknown_video'):
    """Guess a file extension from a URL path; fall back to default_ext.

    Defect fixed: the final condition dangled with no return statements,
    so the function always returned None (and crashed on url=None).
    """
    if url is None:
        return default_ext
    # Drop the query string, then take whatever follows the last dot.
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    else:
        return default_ext
def subtitles_filename(filename, sub_lang, sub_format):
    """Build a subtitle file name: <base>.<language>.<format>."""
    # Strip the media extension (everything after the last dot, if any),
    # then join base, language code and subtitle format with dots.
    base = filename.rsplit('.', 1)[0]
    return '.'.join((base, sub_lang, sub_format))
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        unit = match.group('unit')
        # A bad approximation? (month/year are converted to days elsewhere;
        # those branches are elided in this view.)
        delta = datetime.timedelta(**{unit: time})
    # Plain YYYYMMDD form.
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format.

    Input that does not match the pattern is returned unchanged.
    """
    # Defect fixed: the non-matching fallback return was missing, so any
    # string not in YYYYMMDD form came back as None.
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is not None:
        return '-'.join(match.groups())
    else:
        return date_str
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        if start is not None:
            self.start = date_from_str(start)
            # Open-ended ranges default to the representable min/max dates.
            self.start = datetime.datetime.min.date()
            self.end = date_from_str(end)
            self.end = datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
        """Returns a range that only contains the given day"""

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end
        # NOTE(review): the line below belongs to __str__, whose def line
        # is not visible in this view.
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())


# NOTE(review): the platform_name() def line is not visible in this view.
    """ Returns the platform name as a compat_str """
    res = platform.platform()
    if isinstance(res, bytes):
        res = res.decode(preferredencoding())
    assert isinstance(res, compat_str)
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070
    import ctypes.wintypes
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
    if fileno not in WIN_OUTPUT_IDS:

    # Resolve the kernel32 entry points needed for console detection and
    # native wide-character output.
    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b"GetStdHandle", ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b"GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # A handle is only a real console if it is a character device
        # and GetConsoleMode succeeds on it.
        if handle == INVALID_HANDLE_VALUE or handle is None:
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):

    def next_nonbmp_pos(s):
        # Index of the first character outside the Basic Multilingual Plane.
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:

        # Write in chunks, stopping before any non-BMP character which must
        # be written as a surrogate pair (count == 0 case below).
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            assert written.value > 0
            s = s[written.value:]
def write_string(s, out=None, encoding=None):
    # Write text to `out` (stderr by default; that branch is elided here),
    # coping with Windows consoles and byte-mode streams.
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):

    if ('b' in getattr(out, 'mode', '') or
            sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        byt = s.encode(encoding or preferredencoding(), 'ignore')
    elif hasattr(out, 'buffer'):
        # Text stream with an underlying binary buffer: encode explicitly.
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        byt = s.encode(enc, 'ignore')
        out.buffer.write(byt)


def bytes_to_intlist(bs):
    # Convert a bytes/str object into a list of integer byte values.
    if isinstance(bs[0], int):  # Python 3
        return [ord(c) for c in bs]


def intlist_to_bytes(xs):
    # Inverse of bytes_to_intlist: pack integers back into a byte string.
    return struct_pack('%dB' % len(xs), *xs)
# Cross-platform file locking
if sys.platform == 'win32':
    import ctypes.wintypes

    class OVERLAPPED(ctypes.Structure):
        # ctypes mirror of the Win32 OVERLAPPED struct used by LockFileEx.
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Lock the whole file: max low dword / max positive high dword.
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # 0x2 == LOCKFILE_EXCLUSIVE_LOCK; 0x0 requests a shared lock.
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

    # NOTE(review): the POSIX branch below normally sits under an `else:`
    # with `import fcntl`, neither of which is visible in this view.
    def _lock_file(f, exclusive):
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        fcntl.flock(f, fcntl.LOCK_UN)
class locked_file(object):
    """File wrapper that holds an advisory lock for the `with` block."""

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)

    def __enter__(self):
        # Readers take a shared lock, writers an exclusive one.
        exclusive = self.mode != 'r'
            _lock_file(self.f, exclusive)

    def __exit__(self, etype, value, traceback):
            _unlock_file(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
def get_filesystem_encoding():
    """Return the filesystem encoding, defaulting to UTF-8 when unknown."""
    enc = sys.getfilesystemencoding()
    if enc is None:
        return 'utf-8'
    return enc
def shell_quote(args):
    # Quote each argument for safe display as a shell command line
    # (the accumulator/loop header lines are elided in this view).
    encoding = get_filesystem_encoding()
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(pipes.quote(a))
    return ' '.join(quoted_args)


def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # Data is JSON-encoded and appended as a URL fragment, which servers
    # never see; unsmuggle_url() reverses this.
    sdata = compat_urllib_parse.urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return url + '#' + sdata


def unsmuggle_url(smug_url, default=None):
    # Extract (url, data) previously packed by smuggle_url(); `default`
    # is returned as the data for un-smuggled URLs.
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    data = json.loads(jsond)
def format_bytes(bytes):
    """Format a byte count as a human-readable string, e.g. '1.00KiB'.

    Accepts int/float/str; returns 'N/A' for None.
    """
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    # math.log(0) is undefined, so pin the exponent for a zero size.
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
    suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
def parse_filesize(s):
    """Parse a human filesize string ('5 MB', '1,2GiB', ...) into bytes."""
    # The lower-case forms are of course incorrect and inofficial,
    # but we support those too
    # NOTE(review): the _UNIT_TABLE mapping, the None guard and the
    # enclosing re.search(...) call / match check are not visible in this
    # excerpt.
    units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
    # Accept European decimal commas by normalising to a dot.
    num_str = m.group('num').replace(',', '.')
    mult = _UNIT_TABLE[m.group('unit')]
    return int(float(num_str) * mult)
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """
    try:
        return ENGLISH_MONTH_NAMES.index(name) + 1
    except ValueError:
        # Unknown month name: signal with None rather than raising.
        return None
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        three-letter abbreviation (e.g. 'Jan' -> 1) """
    try:
        return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
    except ValueError:
        # Unknown abbreviation: signal with None rather than raising.
        return None
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML, leaving existing entities alone."""
    # The negative lookahead skips '&' that already start a valid entity
    # (named or numeric character reference).
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
        '&amp;', xml_str)
def setproctitle(title):
    """Best-effort attempt to set the process title via glibc prctl(2)."""
    assert isinstance(title, compat_str)
    # NOTE(review): the try: that guards LoadLibrary (failing on non-glibc
    # systems) and the one around prctl are not visible in this excerpt.
    libc = ctypes.cdll.LoadLibrary("libc.so.6")
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
        # 15 is PR_SET_NAME on Linux.
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
def remove_start(s, start):
    """Return *s* with the prefix *start* removed, or *s* unchanged."""
    if s.startswith(start):
        return s[len(start):]
    # No prefix match: return the input untouched (previously fell through
    # and implicitly returned None).
    return s
def remove_end(s, end):
    """Return *s* with the suffix *end* removed, or *s* unchanged."""
    if s.endswith(end):
        # len(s) - len(end) (rather than -len(end)) keeps an empty *end*
        # from slicing with [:-0] and wiping the whole string.
        return s[:len(s) - len(end)]
    return s
def url_basename(url):
    """Return the final path segment of *url* ('' when the path is empty)."""
    path = compat_urlparse.urlparse(url).path
    segments = path.strip('/').split('/')
    return segments[-1]
class HEADRequest(compat_urllib_request.Request):
    """Request subclass whose HTTP method is HEAD instead of GET."""
    def get_method(self):
        # urllib chooses GET/POST from get_method(); force HEAD here.
        return 'HEAD'
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert *v* to int (optionally reading attribute *get_attr* first),
    scaled by invscale // scale; return *default* when the value is None."""
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    return default if v is None else (int(v) * invscale // scale)
def str_or_none(v, default=None):
    """Return *v* coerced to compat_str, or *default* when it is None."""
    if v is None:
        return default
    return compat_str(v)
def str_to_int(int_str):
    """ A more relaxed version of int_or_none: strips thousands separators
    (commas/dots) and '+' before converting; None passes through. """
    if int_str is None:
        return None
    int_str = re.sub(r'[,\.\+]', '', int_str)
    return int(int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert *v* to float scaled by invscale / scale; *default* when None."""
    if v is None:
        return default
    return float(v) * invscale / scale
def parse_duration(s):
    # Parse a duration spec such as '123', '1:23', '1h 2m 3.5s' into seconds.
    # NOTE(review): several lines of this function are not visible in this
    # excerpt: the early-return for non-string input, the enclosing
    # re.match(r'''(?x) ... ''', s) wrapper around the pattern fragments
    # below, the `res = 0` accumulator setup, the group presence guards and
    # the final `return res`.
    if not isinstance(s, compat_basestring):
                (?P<only_mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*|
                (?P<only_hours>[0-9.]+)\s*(?:hours?)|
                \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?\.?|minutes?)\s*|
                (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
                (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
                (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
                (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
    if m.group('only_mins'):
        return float_or_none(m.group('only_mins'), invscale=60)
    if m.group('only_hours'):
        return float_or_none(m.group('only_hours'), invscale=60 * 60)
        res += int(m.group('secs'))
    if m.group('mins_reversed'):
        res += int(m.group('mins_reversed')) * 60
        res += int(m.group('mins')) * 60
    if m.group('hours'):
        res += int(m.group('hours')) * 60 * 60
    if m.group('hours_reversed'):
        res += int(m.group('hours_reversed')) * 60 * 60
        res += int(m.group('days')) * 24 * 60 * 60
        res += float(m.group('ms'))
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert *ext* before the real extension: 'a.mp4' -> 'a.temp.mp4'.

    If *expected_real_ext* is given and does not match, append instead.
    """
    name, real_ext = os.path.splitext(filename)
    return (
        '{0}.{1}{2}'.format(name, ext, real_ext)
        if not expected_real_ext or real_ext[1:] == expected_real_ext
        else '{0}.{1}'.format(filename, ext))
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the extension of *filename* with *ext*.

    If *expected_real_ext* is given and the current extension differs,
    append *ext* to the whole filename instead of replacing.
    """
    name, real_ext = os.path.splitext(filename)
    return '{0}.{1}'.format(
        name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
        ext)
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    try:
        subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        # Binary not found (or not executable): report absence with False.
        return False
    return exe
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    # NOTE(review): the try/except OSError guard around Popen (returning
    # False when the binary is missing) is not visible in this excerpt.
    out, _ = subprocess.Popen(
        [encodeArgument(exe)] + args,
        stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from *output* using *version_re*;
    return *unrecognized* when no version can be found."""
    assert isinstance(output, compat_str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
    if m:
        return m.group(1)
    return unrecognized
class PagedList(object):
    """Abstract base for lazily-fetched, page-based result lists."""
    # NOTE(review): the `def __len__(self):` header for the body below is
    # not visible in this excerpt.
    # This is only useful for tests
        return len(self.getslice())
class OnDemandPagedList(PagedList):
    """PagedList that fetches each page on demand via pagefunc(pagenum)."""

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        # NOTE(review): the `res = []` setup, the continue/break statements,
        # the startv/endv assignment wrappers and the final `return res`
        # are not visible in this excerpt.
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
            page_results = list(self._pagefunc(pagenum))
                start % self._pagesize
                if firstid <= start < nextfirstid
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
class InAdvancePagedList(PagedList):
    """PagedList whose total page count is known in advance."""

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        # NOTE(review): the `res = []` setup, the min(...) wrapper computing
        # end_page, the skip_elems guard/reset, the break statement and the
        # final `return res` are not visible in this excerpt.
        start_page = start // self._pagesize
            self._pagecount if end is None else (end // self._pagesize + 1))
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
                page = page[skip_elems:]
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                    page = page[:only_more]
def uppercase_escape(s):
    """Resolve literal '\\UXXXXXXXX' escape sequences in *s* to characters."""
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: unicode_escape(m.group(0))[0],
        s)
def lowercase_escape(s):
    """Resolve literal '\\uXXXX' escape sequences in *s* to characters."""
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\u[0-9a-fA-F]{4}',
        lambda m: unicode_escape(m.group(0))[0],
        s)
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # On Python 2, quote() needs a byte string, so encode unicode input.
    if sys.version_info < (3, 0) and isinstance(s, compat_str):
        s = s.encode('utf-8')
    # The safe set keeps RFC 3986 reserved and sub-delimiter characters.
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    url_parsed = compat_urllib_parse_urlparse(url)
    # Re-escape each escapable component and reassemble the URL.
    return url_parsed._replace(
        path=escape_rfc3986(url_parsed.path),
        params=escape_rfc3986(url_parsed.params),
        query=escape_rfc3986(url_parsed.query),
        fragment=escape_rfc3986(url_parsed.fragment)
    ).geturl()
# Feature probe: does struct.pack() accept unicode format strings?
# NOTE(review): the try:/except TypeError:/else: scaffolding around this
# probe is not visible in this excerpt.
    struct.pack('!I', 0)
    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
    # Wrappers that encode a unicode spec to ASCII bytes before delegating.
    def struct_pack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.pack(spec, *args)

    def struct_unpack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.unpack(spec, *args)

    # Native struct handles unicode specs fine; use it directly.
    struct_pack = struct.pack
    struct_unpack = struct.unpack
def read_batch_urls(batch_fd):
    """Read URLs from a batch file object.

    Strips a UTF-8 BOM, surrounding whitespace, blank lines and comment
    lines (starting with '#', ';' or ']'); closes the file object.
    """
    def fixup(url):
        # Python 2 file objects may yield byte strings; decode them first.
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = '\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
def encode_dict(d, encoding='utf-8'):
    """Encode every key and value of *d* to bytes using *encoding*."""
    return {key.encode(encoding): value.encode(encoding)
            for key, value in d.items()}
# Feature probe: Element.iter only exists on Python >= 2.7.
# NOTE(review): the opening try: of this probe and the `def parse_xml(s):`
# header the code below belongs to are not visible in this excerpt.
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    etree_iter = lambda n: n.findall('.//*')

    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
    # The `parser` kwarg is only accepted from Python 2.7 on.
    kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
    tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for n in etree_iter(tree):
            if n.text is not None:
                if not isinstance(n.text, compat_str):
                    n.text = n.text.decode('utf-8')
def parse_age_limit(s):
    """Parse an age-limit string like '18' or '18+' into an int.

    Falls back to the US_RATINGS table for MPAA-style ratings; None in,
    None out.
    """
    if s is None:
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    return int(m.group('age')) if m else US_RATINGS.get(s, None)
def strip_jsonp(code):
    """Strip the JSONP callback wrapper (name, parens, trailing ';' and
    '//' comments) and return the bare JSON payload."""
    return re.sub(
        r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
def js_to_json(code):
    """Convert a JavaScript object literal into strict JSON text."""
    # NOTE(review): the `def fix_kv(m):` helper header, its return
    # statements, the escape-map dict body, the fix_kv argument to re.sub
    # and the final `return res` are not visible in this excerpt.
        if v in ('true', 'false', 'null'):
        if v.startswith('"'):
        if v.startswith("'"):
            v = re.sub(r"\\\\|\\'|\"", lambda m: {

    res = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
        [a-zA-Z_][.a-zA-Z_0-9]*
    # Drop trailing commas before closing brackets/braces.
    res = re.sub(r',(\s*[\]}])', lambda m: m.group(1), res)
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        try:
            return quality_ids.index(qid)
        except ValueError:
            # Unknown quality ids rank below every known one.
            return -1
    return q
# Default output filename template: '<title>-<id>.<ext>'.
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) > length:
        # Keep the total length at *length*, ellipses included.
        return s[:length - len(ELLIPSES)] + ELLIPSES
    return s
def version_tuple(v):
    """Split a dotted/dashed version string into a tuple of ints."""
    parts = re.split(r'[-.]', v)
    return tuple(map(int, parts))
def is_outdated_version(version, limit, assume_new=True):
    """Return True when *version* is older than *limit*.

    An empty/missing or unparsable version yields `not assume_new`.
    """
    if not version:
        return not assume_new
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return not assume_new
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    loaded_from_zip = isinstance(globals().get('__loader__'), zipimporter)
    frozen = hasattr(sys, 'frozen')
    return loaded_from_zip or frozen
def args_to_str(args):
    # Get a short string representation for a subprocess command
    quoted = [shlex_quote(arg) for arg in args]
    return ' '.join(quoted)
def mimetype2ext(mt):
    """Map a MIME type to a file extension."""
    # Everything after the last '/' is the subtype.
    _, _, res = mt.rpartition('/')
    # NOTE(review): the enclosing `return { ... }.get(res, res)` mapping is
    # only partially visible in this excerpt.
        'x-mp4-fragmented': 'mp4',
def urlhandle_detect_ext(url_handle):
    """Guess a file extension from a urllib response's headers."""
    # NOTE(review): the try: around the headers access and the `if cd:` /
    # `if m:` / `if e:` guards are not visible in this excerpt.
        getheader = lambda h: url_handle.headers[h]
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader

    # Prefer a filename from Content-Disposition when one is attached.
    cd = getheader('Content-Disposition')
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
            e = determine_ext(m.group('filename'), default_ext=None)

    # Otherwise fall back to the Content-Type MIME mapping.
    return mimetype2ext(getheader('Content-Type'))
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    if age_limit is None:  # No limit set
        return False
    if content_limit is None:
        return False  # Content available for everyone
    return age_limit < content_limit
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    # Try each known BOM; decode with its encoding when one matches.
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            s = first_bytes[len(bom):].decode(enc, 'replace')
            break
    else:
        s = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', s)
def determine_protocol(info_dict):
    """Work out the download protocol for an info dict (rtmp/mms/rtsp/...)."""
    protocol = info_dict.get('protocol')
    if protocol is not None:
    # NOTE(review): the `return protocol` for the explicit case, the return
    # values of the scheme branches below and the m3u8/f4m extension
    # handling are not visible in this excerpt.

    url = info_dict['url']
    if url.startswith('rtmp'):
    elif url.startswith('mms'):
    elif url.startswith('rtsp'):

    ext = determine_ext(url)

    # Fall back to the URL scheme (http/https/ftp/...).
    return compat_urllib_parse_urlparse(url).scheme
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    rows = [header_row] + data
    col_widths = [max(len(compat_str(cell)) for cell in column)
                  for column in zip(*rows)]
    fmt = ' '.join('%-' + compat_str(width + 1) + 's' for width in col_widths[:-1]) + '%s'
    return '\n'.join(fmt % tuple(row) for row in rows)
def _match_one(filter_part, dct):
    """Evaluate one '&'-separated filter clause against dict *dct*."""
    # NOTE(review): the COMPARISON_OPERATORS dict body, the `if m:` guards,
    # the ValueError raise wrappers, the int() try/except, the
    # `UNARY_OPERATORS = {` assignment and its closing brace are not
    # visible in this excerpt.
    COMPARISON_OPERATORS = {
    operator_rex = re.compile(r'''(?x)\s*
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
        (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
        op = COMPARISON_OPERATORS[m.group('op')]
        if m.group('strval') is not None:
            if m.group('op') not in ('=', '!='):
                    'Operator %s does not support string values!' % m.group('op'))
            comparison_value = m.group('strval')
                comparison_value = int(m.group('intval'))
                # Accept suffixed sizes like '500k' or '1.2MiB'.
                comparison_value = parse_filesize(m.group('intval'))
                if comparison_value is None:
                    comparison_value = parse_filesize(m.group('intval') + 'B')
                if comparison_value is None:
                        'Invalid integer value %r in filter part %r' % (
                            m.group('intval'), filter_part))
        actual_value = dct.get(m.group('key'))
        if actual_value is None:
            return m.group('none_inclusive')
        return op(actual_value, comparison_value)

        '': lambda v: v is not None,
        '!': lambda v: v is None,
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
        op = UNARY_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
    # All '&'-separated clauses must hold for the dict to pass.
    return all(
        _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
def match_filter_func(filter_str):
    """Build a --match-filter callback: returns None when the video passes,
    otherwise a human-readable skip message."""
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        else:
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP/TTML time expression ('12.3s' or 'HH:MM:SS.mmm') to
    seconds; returns None for empty or unrecognised input."""
    if not time_expr:
        return

    mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if mobj:
        return float(mobj.group('time_offset'))

    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:\.\d+)?)$', time_expr)
    if mobj:
        return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3))
def srt_subtitles_timecode(seconds):
    """Format a second offset as an SRT timecode 'HH:MM:SS,mmm'."""
    hours = seconds / 3600
    minutes = (seconds % 3600) / 60
    secs = seconds % 60
    millis = (seconds % 1) * 1000
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
def dfxp2srt(dfxp_data):
    """Convert DFXP/TTML subtitle markup into SRT text."""
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
    # NOTE(review): the closing of the ns_map literal, the `return out` of
    # parse_node, the child-iteration loop header, the `out = []`
    # accumulator and the final join/return are not visible in this
    # excerpt.

    def parse_node(node):
        str_or_empty = functools.partial(str_or_none, default='')

        out = str_or_empty(node.text)

            if child.tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
                out += '\n' + str_or_empty(child.tail)
            elif child.tag in (_x('ttml:span'), _x('ttaf1:span'), 'span'):
                out += str_or_empty(parse_node(child))
                out += str_or_empty(xml.etree.ElementTree.tostring(child))

    dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8'))

    # Accept both TTML namespaces as well as namespace-less documents.
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p')
        raise ValueError('Invalid dfxp/TTML subtitle')

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib['begin'])
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
            end_time = begin_time + parse_dfxp_time_expr(para.attrib['dur'])
        out.append('%d\n%s --> %s\n%s\n\n' % (
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
def cli_option(params, command_option, param):
    """Return [command_option, value] when params[param] is set, else []."""
    value = params.get(param)
    if value is None:
        return []
    return [command_option, value]
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Render a boolean param as ['--opt', 'true'] or ['--opt=true'] when
    *separator* is given; the param must be present and a bool."""
    param = params.get(param)
    assert isinstance(param, bool)
    if separator:
        return [command_option + separator + (true_value if param else false_value)]
    return [command_option, true_value if param else false_value]
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Emit [command_option] (no value) when params[param] == expected_value."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
def cli_configuration_args(params, param, default=[]):
    """Return the extra-args list stored under *param*, or *default* when
    absent; the stored value must be a list."""
    ex_args = params.get(param)
    if ex_args is None:
        return default
    assert isinstance(ex_args, list)
    return ex_args
class ISO639Utils(object):
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    # NOTE(review): the `_lang_map = { ... }` table and the @classmethod
    # decorators on the methods below are not visible in this excerpt.

    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        return cls._lang_map.get(code[:2])

    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        # NOTE(review): the `return short_name` body of the match below is
        # not visible in this excerpt.
        for short_name, long_name in cls._lang_map.items():
            if long_name == code:
class ISO3166Utils(object):
    # From http://data.okfn.org/data/core/country-list
    # NOTE(review): the `_country_map = {` opener, many intermediate
    # entries, the closing brace and the @classmethod decorator on
    # short2full are not visible in this excerpt.
        'AF': 'Afghanistan',
        # NOTE(review): 'Ã…land' below looks mojibake-encoded (should be
        # 'Åland') — fix the source encoding separately.
        'AX': 'Ã…land Islands',
        'AS': 'American Samoa',
        'AG': 'Antigua and Barbuda',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BV': 'Bouvet Island',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BF': 'Burkina Faso',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CI': 'Côte d\'Ivoire',
        'CZ': 'Czech Republic',
        'DO': 'Dominican Republic',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GW': 'Guinea-Bissau',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'IR': 'Iran, Islamic Republic of',
        'IM': 'Isle of Man',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'LA': 'Lao People\'s Democratic Republic',
        'LI': 'Liechtenstein',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MH': 'Marshall Islands',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'PS': 'Palestine, State of',
        'PG': 'Papua New Guinea',
        'PH': 'Philippines',
        'PR': 'Puerto Rico',
        'RU': 'Russian Federation',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SL': 'Sierra Leone',
        'SX': 'Sint Maarten (Dutch part)',
        'SB': 'Solomon Islands',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'SJ': 'Svalbard and Jan Mayen',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TZ': 'Tanzania, United Republic of',
        'TL': 'Timor-Leste',
        'TT': 'Trinidad and Tobago',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',

    def short2full(cls, code):
        """Convert an ISO 3166-2 country code to the corresponding full name"""
        return cls._country_map.get(code.upper())
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler that honours a per-request 'Ytdl-request-proxy' header."""

    def __init__(self, proxies=None):
        # Set default handlers
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            # NOTE(review): the assignment `proxy = req_proxy` that should
            # precede deleting the header is not visible in this excerpt —
            # confirm against upstream.
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)