2 # -*- coding: utf-8 -*-
4 from __future__ import unicode_literals
34 import xml.etree.ElementTree
41 compat_etree_fromstring,
46 compat_socket_create_connection,
50 compat_urllib_parse_urlparse,
51 compat_urllib_request,
# The type of a compiled regular expression object. The re module does not
# expose this type under a stable public name on all supported Python
# versions, so it is derived from an actual compiled pattern.
compiled_regex_type = type(re.compile(''))
62 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/44.0 (Chrome)',
63 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
64 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
65 'Accept-Encoding': 'gzip, deflate',
66 'Accept-Language': 'en-us,en;q=0.5',
# English month names in calendar order; index + 1 gives the month number.
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']
77 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
78 'flv', 'f4v', 'f4a', 'f4b',
79 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
89 'f4f', 'f4m', 'm3u8', 'smil')
92 def preferredencoding():
93 """Get preferred encoding.
95 Returns the best encoding scheme for the system, based on
96 locale.getpreferredencoding() and some further tweaks.
99 pref = locale.getpreferredencoding()
107 def write_json_file(obj, fn):
108 """ Encode obj as JSON and write it to fn, atomically if possible """
110 fn = encodeFilename(fn)
111 if sys.version_info < (3, 0) and sys.platform != 'win32':
112 encoding = get_filesystem_encoding()
113 # os.path.basename returns a bytes object, but NamedTemporaryFile
114 # will fail if the filename contains non ascii characters unless we
115 # use a unicode object
116 path_basename = lambda f: os.path.basename(fn).decode(encoding)
117 # the same for os.path.dirname
118 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
120 path_basename = os.path.basename
121 path_dirname = os.path.dirname
125 'prefix': path_basename(fn) + '.',
126 'dir': path_dirname(fn),
130 # In Python 2.x, json.dump expects a bytestream.
131 # In Python 3.x, it writes to a character stream
132 if sys.version_info < (3, 0):
140 tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
145 if sys.platform == 'win32':
146 # Need to remove existing file on Windows, else os.rename raises
147 # WindowsError or FileExistsError.
152 os.rename(tf.name, fn)
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """Return the first element under *xpath* carrying attribute *key*
        (and, when *val* is given, having key equal to val), or None."""
        # key is interpolated into an XPath predicate, so restrict it to a
        # safe character set.
        assert re.match(r'^[a-zA-Z_-]+$', key)
        if val is None:
            predicate = '[@%s]' % key
        else:
            predicate = "[@%s='%s']" % (key, val)
        return node.find(xpath + predicate)
168 def find_xpath_attr(node, xpath, key, val=None):
169 for f in node.findall(compat_xpath(xpath)):
170 if key not in f.attrib:
172 if val is None or f.attrib.get(key) == val:
176 # On python2.6 the xml.etree.ElementTree.Element methods don't support
177 # the namespace parameter
180 def xpath_with_ns(path, ns_map):
181 components = [c.split(':') for c in path.split('/')]
185 replaced.append(c[0])
188 replaced.append('{%s}%s' % (ns_map[ns], tag))
189 return '/'.join(replaced)
192 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
193 def _find_xpath(xpath):
194 return node.find(compat_xpath(xpath))
196 if isinstance(xpath, (str, compat_str)):
197 n = _find_xpath(xpath)
205 if default is not NO_DEFAULT:
208 name = xpath if name is None else name
209 raise ExtractorError('Could not find XML element %s' % name)
215 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
216 n = xpath_element(node, xpath, name, fatal=fatal, default=default)
217 if n is None or n == default:
220 if default is not NO_DEFAULT:
223 name = xpath if name is None else name
224 raise ExtractorError('Could not find XML element\'s text %s' % name)
230 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
231 n = find_xpath_attr(node, xpath, key)
233 if default is not NO_DEFAULT:
236 name = '%s[@%s]' % (xpath, key) if name is None else name
237 raise ExtractorError('Could not find XML attribute %s' % name)
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # Thin convenience wrapper over get_element_by_attribute for the common
    # id="..." case. NOTE: the parameter name 'id' shadows the builtin; it is
    # kept for interface compatibility with existing callers.
    return get_element_by_attribute('id', id, html)
248 def get_element_by_attribute(attribute, value, html):
249 """Return the content of the tag with the specified attribute in the passed HTML document"""
251 m = re.search(r'''(?xs)
253 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
255 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
259 ''' % (re.escape(attribute), re.escape(value)), html)
263 res = m.group('content')
265 if res.startswith('"') or res.startswith("'"):
268 return unescapeHTML(res)
271 class HTMLAttributeParser(compat_HTMLParser):
272 """Trivial HTML parser to gather the attributes for a single element"""
275 compat_HTMLParser.__init__(self)
277 def handle_starttag(self, tag, attrs):
278 self.attrs = dict(attrs)
281 def extract_attributes(html_element):
282 """Given a string for an HTML element such as
284 a="foo" B="bar" c="&98;az" d=boz
285 empty= noval entity="&"
288 Decode and return a dictionary of attributes.
290 'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
291 'empty': '', 'noval': None, 'entity': '&',
292 'sq': '"', 'dq': '\''
294 NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
295 but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
297 parser = HTMLAttributeParser()
298 parser.feed(html_element)
303 def clean_html(html):
304 """Clean an HTML snippet into a readable string"""
306 if html is None: # Convenience for sanitizing descriptions etc.
310 html = html.replace('\n', ' ')
311 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
312 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
314 html = re.sub('<.*?>', '', html)
315 # Replace html entities
316 html = unescapeHTML(html)
320 def sanitize_open(filename, open_mode):
321 """Try to open the given filename, and slightly tweak it if this fails.
323 Attempts to open the given filename. If this fails, it tries to change
324 the filename slightly, step by step, until it's either able to open it
325 or it fails and raises a final exception, like the standard open()
328 It returns the tuple (stream, definitive_file_name).
332 if sys.platform == 'win32':
334 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
335 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
336 stream = open(encodeFilename(filename), open_mode)
337 return (stream, filename)
338 except (IOError, OSError) as err:
339 if err.errno in (errno.EACCES,):
342 # In case of error, try to remove win32 forbidden chars
343 alt_filename = sanitize_path(filename)
344 if alt_filename == filename:
347 # An exception here should be caught in the caller
348 stream = open(encodeFilename(alt_filename), open_mode)
349 return (stream, alt_filename)
352 def timeconvert(timestr):
353 """Convert RFC 2822 defined time string into system timestamp"""
355 timetuple = email.utils.parsedate_tz(timestr)
356 if timetuple is not None:
357 timestamp = email.utils.mktime_tz(timetuple)
361 def sanitize_filename(s, restricted=False, is_id=False):
362 """Sanitizes a string so it could be used as part of a filename.
363 If restricted is set, use a stricter subset of allowed characters.
364 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
366 def replace_insane(char):
367 if char == '?' or ord(char) < 32 or ord(char) == 127:
370 return '' if restricted else '\''
372 return '_-' if restricted else ' -'
373 elif char in '\\/|*<>':
375 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
377 if restricted and ord(char) > 127:
382 s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
383 result = ''.join(map(replace_insane, s))
385 while '__' in result:
386 result = result.replace('__', '_')
387 result = result.strip('_')
388 # Common case of "Foreign band name - English song title"
389 if restricted and result.startswith('-_'):
391 if result.startswith('-'):
392 result = '_' + result[len('-'):]
393 result = result.lstrip('.')
399 def sanitize_path(s):
400 """Sanitizes and normalizes path on Windows"""
401 if sys.platform != 'win32':
403 drive_or_unc, _ = os.path.splitdrive(s)
404 if sys.version_info < (2, 7) and not drive_or_unc:
405 drive_or_unc, _ = os.path.splitunc(s)
406 norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
410 path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part)
411 for path_part in norm_path]
413 sanitized_path.insert(0, drive_or_unc + os.path.sep)
414 return os.path.join(*sanitized_path)
417 # Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
418 # unwanted failures due to missing protocol
def sanitized_Request(url, *args, **kwargs):
    """Build a Request, prepending 'http:' to scheme-relative ('//...') URLs
    so that urllib does not fail on a missing protocol."""
    if url.startswith('//'):
        url = 'http:%s' % url
    return compat_urllib_request.Request(url, *args, **kwargs)
424 def orderedSet(iterable):
425 """ Remove all duplicates from the input iterable """
433 def _htmlentity_transform(entity):
434 """Transforms an HTML entity to a character."""
435 # Known non-numeric HTML entity
436 if entity in compat_html_entities.name2codepoint:
437 return compat_chr(compat_html_entities.name2codepoint[entity])
439 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
441 numstr = mobj.group(1)
442 if numstr.startswith('x'):
444 numstr = '0%s' % numstr
447 # See https://github.com/rg3/youtube-dl/issues/7518
449 return compat_chr(int(numstr, base))
453 # Unknown entity in name, return its literal representation
454 return '&%s;' % entity
460 assert type(s) == compat_str
463 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
466 def get_subprocess_encoding():
467 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
468 # For subprocess calls, encode with locale encoding
469 # Refer to http://stackoverflow.com/a/9951851/35070
470 encoding = preferredencoding()
472 encoding = sys.getfilesystemencoding()
478 def encodeFilename(s, for_subprocess=False):
480 @param s The name of the file
483 assert type(s) == compat_str
485 # Python 3 has a Unicode API
486 if sys.version_info >= (3, 0):
489 # Pass '' directly to use Unicode APIs on Windows 2000 and up
490 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
491 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
492 if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
495 # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
496 if sys.platform.startswith('java'):
499 return s.encode(get_subprocess_encoding(), 'ignore')
502 def decodeFilename(b, for_subprocess=False):
504 if sys.version_info >= (3, 0):
507 if not isinstance(b, bytes):
510 return b.decode(get_subprocess_encoding(), 'ignore')
def encodeArgument(s):
    """Encode a subprocess command-line argument via encodeFilename.

    Byte-string input is first decoded as ASCII for legacy callers; new code
    is expected to pass text (compat_str) only.
    """
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, True)
def decodeArgument(b):
    """Decode a subprocess argument back to text (for_subprocess variant of
    decodeFilename)."""
    return decodeFilename(b, True)
526 def decodeOption(optval):
529 if isinstance(optval, bytes):
530 optval = optval.decode(preferredencoding())
532 assert isinstance(optval, compat_str)
536 def formatSeconds(secs):
538 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
540 return '%d:%02d' % (secs // 60, secs % 60)
545 def make_HTTPS_handler(params, **kwargs):
546 opts_no_check_certificate = params.get('nocheckcertificate', False)
547 if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9
548 context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
549 if opts_no_check_certificate:
550 context.check_hostname = False
551 context.verify_mode = ssl.CERT_NONE
553 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
556 # (create_default_context present but HTTPSHandler has no context=)
559 if sys.version_info < (3, 2):
560 return YoutubeDLHTTPSHandler(params, **kwargs)
562 context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
563 context.verify_mode = (ssl.CERT_NONE
564 if opts_no_check_certificate
565 else ssl.CERT_REQUIRED)
566 context.set_default_verify_paths()
567 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
570 def bug_reports_message():
571 if ytdl_is_updateable():
572 update_cmd = 'type youtube-dl -U to update'
574 update_cmd = 'see https://yt-dl.org/update on how to update'
575 msg = '; please report this issue on https://yt-dl.org/bug .'
576 msg += ' Make sure you are using the latest version; %s.' % update_cmd
577 msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
581 class ExtractorError(Exception):
582 """Error during info extraction."""
584 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
585 """ tb, if given, is the original traceback (so that it can be printed out).
586 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
589 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
591 if video_id is not None:
592 msg = video_id + ': ' + msg
594 msg += ' (caused by %r)' % cause
596 msg += bug_reports_message()
597 super(ExtractorError, self).__init__(msg)
600 self.exc_info = sys.exc_info() # preserve original exception
602 self.video_id = video_id
604 def format_traceback(self):
605 if self.traceback is None:
607 return ''.join(traceback.format_tb(self.traceback))
class UnsupportedError(ExtractorError):
    """ExtractorError raised for URLs youtube-dl cannot handle.

    Marked expected=True so it is reported as a normal error, not a bug.
    """

    def __init__(self, url):
        super(UnsupportedError, self).__init__(
            'Unsupported URL: %s' % url, expected=True)
617 class RegexNotFoundError(ExtractorError):
618 """Error when a regex didn't match"""
622 class DownloadError(Exception):
623 """Download Error exception.
625 This exception may be thrown by FileDownloader objects if they are not
626 configured to continue on errors. They will contain the appropriate
630 def __init__(self, msg, exc_info=None):
631 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
632 super(DownloadError, self).__init__(msg)
633 self.exc_info = exc_info
636 class SameFileError(Exception):
637 """Same File exception.
639 This exception will be thrown by FileDownloader objects if they detect
640 multiple files would have to be downloaded to the same file on disk.
645 class PostProcessingError(Exception):
646 """Post Processing exception.
648 This exception may be raised by PostProcessor's .run() method to
649 indicate an error in the postprocessing task.
652 def __init__(self, msg):
656 class MaxDownloadsReached(Exception):
657 """ --max-downloads limit has been reached. """
661 class UnavailableVideoError(Exception):
662 """Unavailable Format exception.
664 This exception will be thrown when a video is requested
665 in a format that is not available for that video.
670 class ContentTooShortError(Exception):
671 """Content Too Short exception.
673 This exception may be raised by FileDownloader objects when a file they
674 download is too small for what the server announced first, indicating
675 the connection was probably interrupted.
678 def __init__(self, downloaded, expected):
680 self.downloaded = downloaded
681 self.expected = expected
684 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
685 # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
686 # expected HTTP responses to meet HTTP/1.0 or later (see also
687 # https://github.com/rg3/youtube-dl/issues/6727)
688 if sys.version_info < (3, 0):
689 kwargs[b'strict'] = True
690 hc = http_class(*args, **kwargs)
691 source_address = ydl_handler._params.get('source_address')
692 if source_address is not None:
693 sa = (source_address, 0)
694 if hasattr(hc, 'source_address'): # Python 2.7+
695 hc.source_address = sa
697 def _hc_connect(self, *args, **kwargs):
698 sock = compat_socket_create_connection(
699 (self.host, self.port), self.timeout, sa)
701 self.sock = ssl.wrap_socket(
702 sock, self.key_file, self.cert_file,
703 ssl_version=ssl.PROTOCOL_TLSv1)
706 hc.connect = functools.partial(_hc_connect, hc)
def handle_youtubedl_headers(headers):
    """Strip the internal 'Youtubedl-no-compression' marker from *headers*.

    When the marker is present, every 'Accept-Encoding' header (matched
    case-insensitively) is dropped as well, so the server sends an
    uncompressed response; a new dict is returned in that case. Without the
    marker the original mapping is returned unchanged.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers
    filtered = {k: v for k, v in headers.items() if k.lower() != 'accept-encoding'}
    del filtered['Youtubedl-no-compression']
    return filtered
721 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
722 """Handler for HTTP requests and responses.
724 This class, when installed with an OpenerDirector, automatically adds
725 the standard headers to every HTTP request and handles gzipped and
726 deflated responses from web servers. If compression is to be avoided in
727 a particular request, the original request in the program code only has
728 to include the HTTP header "Youtubedl-no-compression", which will be
729 removed before making the real request.
731 Part of this code was copied from:
733 http://techknack.net/python-urllib2-handlers/
735 Andrew Rowls, the author of that code, agreed to release it to the
def __init__(self, params, *args, **kwargs):
    # Initialize the underlying HTTPHandler and keep the youtube-dl
    # parameter dict for use by the connection factory.
    compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
    self._params = params
743 def http_open(self, req):
744 return self.do_open(functools.partial(
745 _create_http_connection, self, compat_http_client.HTTPConnection, False),
751 return zlib.decompress(data, -zlib.MAX_WBITS)
753 return zlib.decompress(data)
756 def addinfourl_wrapper(stream, headers, url, code):
757 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
758 return compat_urllib_request.addinfourl(stream, headers, url, code)
759 ret = compat_urllib_request.addinfourl(stream, headers, url)
763 def http_request(self, req):
764 # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
765 # always respected by websites, some tend to give out URLs with non percent-encoded
766 # non-ASCII characters (see telemb.py, ard.py [#3412])
767 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
768 # To work around aforementioned issue we will replace request's original URL with
769 # percent-encoded one
770 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
771 # the code of this workaround has been moved here from YoutubeDL.urlopen()
772 url = req.get_full_url()
773 url_escaped = escape_url(url)
775 # Substitute URL if any change after escaping
776 if url != url_escaped:
777 req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
779 url_escaped, data=req.data, headers=req.headers,
780 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
781 new_req.timeout = req.timeout
784 for h, v in std_headers.items():
785 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
786 # The dict keys are capitalized because of this bug by urllib
787 if h.capitalize() not in req.headers:
790 req.headers = handle_youtubedl_headers(req.headers)
792 if sys.version_info < (2, 7) and '#' in req.get_full_url():
793 # Python 2.6 is brain-dead when it comes to fragments
794 req._Request__original = req._Request__original.partition('#')[0]
795 req._Request__r_type = req._Request__r_type.partition('#')[0]
799 def http_response(self, req, resp):
802 if resp.headers.get('Content-encoding', '') == 'gzip':
803 content = resp.read()
804 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
806 uncompressed = io.BytesIO(gz.read())
807 except IOError as original_ioerror:
808 # There may be junk add the end of the file
809 # See http://stackoverflow.com/q/4928560/35070 for details
810 for i in range(1, 1024):
812 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
813 uncompressed = io.BytesIO(gz.read())
818 raise original_ioerror
819 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
820 resp.msg = old_resp.msg
821 del resp.headers['Content-encoding']
823 if resp.headers.get('Content-encoding', '') == 'deflate':
824 gz = io.BytesIO(self.deflate(resp.read()))
825 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
826 resp.msg = old_resp.msg
827 del resp.headers['Content-encoding']
828 # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
829 # https://github.com/rg3/youtube-dl/issues/6457).
830 if 300 <= resp.code < 400:
831 location = resp.headers.get('Location')
833 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
834 if sys.version_info >= (3, 0):
835 location = location.encode('iso-8859-1').decode('utf-8')
836 location_escaped = escape_url(location)
837 if location != location_escaped:
838 del resp.headers['Location']
839 resp.headers['Location'] = location_escaped
842 https_request = http_request
843 https_response = http_response
846 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
847 def __init__(self, params, https_conn_class=None, *args, **kwargs):
848 compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
849 self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
850 self._params = params
852 def https_open(self, req):
854 if hasattr(self, '_context'): # python > 2.6
855 kwargs['context'] = self._context
856 if hasattr(self, '_check_hostname'): # python 3.x
857 kwargs['check_hostname'] = self._check_hostname
858 return self.do_open(functools.partial(
859 _create_http_connection, self, self._https_conn_class, True),
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """HTTPCookieProcessor that also handles cookies for HTTPS traffic.

    Delegates all real work to the stdlib HTTPCookieProcessor; the only
    addition is wiring https_request/https_response to the same handlers.
    """

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # The workaround below is kept disabled (commented out); currently the
        # response is passed through to the parent unchanged.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #         if set_cookie != set_cookie_escaped:
        #             del response.headers[set_cookie_header]
        #             response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
887 def parse_iso8601(date_str, delimiter='T', timezone=None):
888 """ Return a UNIX timestamp from the given date """
893 date_str = re.sub(r'\.[0-9]+', '', date_str)
897 r'(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
900 timezone = datetime.timedelta()
902 date_str = date_str[:-len(m.group(0))]
903 if not m.group('sign'):
904 timezone = datetime.timedelta()
906 sign = 1 if m.group('sign') == '+' else -1
907 timezone = datetime.timedelta(
908 hours=sign * int(m.group('hours')),
909 minutes=sign * int(m.group('minutes')))
911 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
912 dt = datetime.datetime.strptime(date_str, date_format) - timezone
913 return calendar.timegm(dt.timetuple())
918 def unified_strdate(date_str, day_first=True):
919 """Return a string with the date in the format YYYYMMDD"""
925 date_str = date_str.replace(',', ' ')
926 # %z (UTC offset) is only supported in python>=3.2
927 if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
928 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
929 # Remove AM/PM + timezone
930 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
932 format_expressions = [
945 '%Y-%m-%d %H:%M:%S.%f',
948 '%Y-%m-%dT%H:%M:%SZ',
949 '%Y-%m-%dT%H:%M:%S.%fZ',
950 '%Y-%m-%dT%H:%M:%S.%f0Z',
952 '%Y-%m-%dT%H:%M:%S.%f',
956 format_expressions.extend([
964 format_expressions.extend([
971 for expression in format_expressions:
973 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
976 if upload_date is None:
977 timetuple = email.utils.parsedate_tz(date_str)
979 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
980 if upload_date is not None:
981 return compat_str(upload_date)
984 def determine_ext(url, default_ext='unknown_video'):
987 guess = url.partition('?')[0].rpartition('.')[2]
988 if re.match(r'^[A-Za-z0-9]+$', guess):
990 # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
991 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
992 return guess.rstrip('/')
def subtitles_filename(filename, sub_lang, sub_format):
    """Derive a subtitle file name from the media *filename* by replacing its
    extension with '<sub_lang>.<sub_format>'."""
    base = filename.rsplit('.', 1)[0]
    return '%s.%s.%s' % (base, sub_lang, sub_format)
1001 def date_from_str(date_str):
1003 Return a datetime object from a string in the format YYYYMMDD or
1004 (now|today)[+-][0-9](day|week|month|year)(s)?"""
1005 today = datetime.date.today()
1006 if date_str in ('now', 'today'):
1008 if date_str == 'yesterday':
1009 return today - datetime.timedelta(days=1)
1010 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
1011 if match is not None:
1012 sign = match.group('sign')
1013 time = int(match.group('time'))
1016 unit = match.group('unit')
1017 # A bad approximation?
1021 elif unit == 'year':
1025 delta = datetime.timedelta(**{unit: time})
1026 return today + delta
1027 return datetime.datetime.strptime(date_str, '%Y%m%d').date()
1030 def hyphenate_date(date_str):
1032 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1033 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1034 if match is not None:
1035 return '-'.join(match.groups())
1040 class DateRange(object):
1041 """Represents a time interval between two dates"""
1043 def __init__(self, start=None, end=None):
1044 """start and end must be strings in the format accepted by date"""
1045 if start is not None:
1046 self.start = date_from_str(start)
1048 self.start = datetime.datetime.min.date()
1050 self.end = date_from_str(end)
1052 self.end = datetime.datetime.max.date()
1053 if self.start > self.end:
1054 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
1058 """Returns a range that only contains the given day"""
1059 return cls(day, day)
def __contains__(self, date):
    """Check if the date is in the range"""
    if not isinstance(date, datetime.date):
        # Non-date input is parsed with date_from_str (YYYYMMDD or relative
        # forms such as 'now-1week').
        date = date_from_str(date)
    # Both endpoints are inclusive.
    return self.start <= date <= self.end
1068 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
1071 def platform_name():
1072 """ Returns the platform name as a compat_str """
1073 res = platform.platform()
1074 if isinstance(res, bytes):
1075 res = res.decode(preferredencoding())
1077 assert isinstance(res, compat_str)
1081 def _windows_write_string(s, out):
1082 """ Returns True if the string was written using special methods,
1083 False if it has yet to be written out."""
1084 # Adapted from http://stackoverflow.com/a/3259271/35070
1087 import ctypes.wintypes
1095 fileno = out.fileno()
1096 except AttributeError:
1097 # If the output stream doesn't have a fileno, it's virtual
1099 except io.UnsupportedOperation:
1100 # Some strange Windows pseudo files?
1102 if fileno not in WIN_OUTPUT_IDS:
1105 GetStdHandle = ctypes.WINFUNCTYPE(
1106 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
1107 (b'GetStdHandle', ctypes.windll.kernel32))
1108 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
1110 WriteConsoleW = ctypes.WINFUNCTYPE(
1111 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
1112 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
1113 ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
1114 written = ctypes.wintypes.DWORD(0)
1116 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
1117 FILE_TYPE_CHAR = 0x0002
1118 FILE_TYPE_REMOTE = 0x8000
1119 GetConsoleMode = ctypes.WINFUNCTYPE(
1120 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
1121 ctypes.POINTER(ctypes.wintypes.DWORD))(
1122 (b'GetConsoleMode', ctypes.windll.kernel32))
1123 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
1125 def not_a_console(handle):
1126 if handle == INVALID_HANDLE_VALUE or handle is None:
1128 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
1129 GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
1131 if not_a_console(h):
1134 def next_nonbmp_pos(s):
1136 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
1137 except StopIteration:
1141 count = min(next_nonbmp_pos(s), 1024)
1143 ret = WriteConsoleW(
1144 h, s, count if count else 2, ctypes.byref(written), None)
1146 raise OSError('Failed to write string')
1147 if not count: # We just wrote a non-BMP character
1148 assert written.value == 2
1151 assert written.value > 0
1152 s = s[written.value:]
1156 def write_string(s, out=None, encoding=None):
1159 assert type(s) == compat_str
1161 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
1162 if _windows_write_string(s, out):
1165 if ('b' in getattr(out, 'mode', '') or
1166 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
1167 byt = s.encode(encoding or preferredencoding(), 'ignore')
1169 elif hasattr(out, 'buffer'):
1170 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1171 byt = s.encode(enc, 'ignore')
1172 out.buffer.write(byt)
1178 def bytes_to_intlist(bs):
1181 if isinstance(bs[0], int): # Python 3
1184 return [ord(c) for c in bs]
1187 def intlist_to_bytes(xs):
1190 return struct_pack('%dB' % len(xs), *xs)
1193 # Cross-platform file locking
1194 if sys.platform == 'win32':
1195 import ctypes.wintypes
1198 class OVERLAPPED(ctypes.Structure):
1200 ('Internal', ctypes.wintypes.LPVOID),
1201 ('InternalHigh', ctypes.wintypes.LPVOID),
1202 ('Offset', ctypes.wintypes.DWORD),
1203 ('OffsetHigh', ctypes.wintypes.DWORD),
1204 ('hEvent', ctypes.wintypes.HANDLE),
1207 kernel32 = ctypes.windll.kernel32
1208 LockFileEx = kernel32.LockFileEx
1209 LockFileEx.argtypes = [
1210 ctypes.wintypes.HANDLE, # hFile
1211 ctypes.wintypes.DWORD, # dwFlags
1212 ctypes.wintypes.DWORD, # dwReserved
1213 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1214 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1215 ctypes.POINTER(OVERLAPPED) # Overlapped
1217 LockFileEx.restype = ctypes.wintypes.BOOL
1218 UnlockFileEx = kernel32.UnlockFileEx
1219 UnlockFileEx.argtypes = [
1220 ctypes.wintypes.HANDLE, # hFile
1221 ctypes.wintypes.DWORD, # dwReserved
1222 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1223 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1224 ctypes.POINTER(OVERLAPPED) # Overlapped
1226 UnlockFileEx.restype = ctypes.wintypes.BOOL
1227 whole_low = 0xffffffff
1228 whole_high = 0x7fffffff
def _lock_file(f, exclusive):
    """Lock the whole file *f* via Win32 LockFileEx (exclusive or shared)."""
    overlapped = OVERLAPPED()
    # Lock starting at offset 0; the OVERLAPPED struct carries the position.
    overlapped.Offset = 0
    overlapped.OffsetHigh = 0
    overlapped.hEvent = 0
    # Keep a reference on the file object: the same pointer must be passed
    # to UnlockFileEx later.
    f._lock_file_overlapped_p = ctypes.pointer(overlapped)
    handle = msvcrt.get_osfhandle(f.fileno())
    # 0x2 == LOCKFILE_EXCLUSIVE_LOCK; whole_low/whole_high span the file.
    if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                      whole_low, whole_high, f._lock_file_overlapped_p):
        raise OSError('Locking file failed: %r' % ctypes.FormatError())
def _unlock_file(f):
    """Release a lock previously taken by _lock_file on file *f*."""
    # _lock_file must have stored the OVERLAPPED pointer used for locking.
    assert f._lock_file_overlapped_p
    handle = msvcrt.get_osfhandle(f.fileno())
    if not UnlockFileEx(handle, 0,
                        whole_low, whole_high, f._lock_file_overlapped_p):
        raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1249 # Some platforms, such as Jython, is missing fcntl
1253 def _lock_file(f, exclusive):
1254 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1256 def _unlock_file(f):
1257 fcntl.flock(f, fcntl.LOCK_UN)
1259 UNSUPPORTED_MSG = 'file locking is not supported on this platform'
1261 def _lock_file(f, exclusive):
1262 raise IOError(UNSUPPORTED_MSG)
1264 def _unlock_file(f):
1265 raise IOError(UNSUPPORTED_MSG)
class locked_file(object):
    """File wrapper that holds an OS-level lock while used as a context manager."""

    def __init__(self, filename, mode, encoding=None):
        # Only plain read/append/write modes are supported.
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        # NOTE(review): __enter__ reads self.mode but its assignment is
        # elided in this view; presumably "self.mode = mode" here — confirm.

    def __enter__(self):
        # Writers take an exclusive lock, readers a shared one.
        exclusive = self.mode != 'r'
        _lock_file(self.f, exclusive)
        # NOTE(review): the try/except around locking and the "return self"
        # are elided in this view.

    def __exit__(self, etype, value, traceback):
        _unlock_file(self.f)
        # NOTE(review): closing self.f (try/finally) is elided in this view.

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
def get_filesystem_encoding():
    """Return the filesystem encoding, falling back to UTF-8 when unknown."""
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        # Python 2 can report None here; assume UTF-8 in that case.
        encoding = 'utf-8'
    return encoding
def shell_quote(args):
    """Render a list of arguments as a single shell-quoted command line."""
    encoding = get_filesystem_encoding()
    quoted_args = []
    for arg in args:
        if isinstance(arg, bytes):
            # We may get a filename encoded with 'encodeFilename'
            arg = arg.decode(encoding)
        quoted_args.append(pipes.quote(arg))
    return ' '.join(quoted_args)
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # The payload rides in the fragment so servers never see it.
    smuggled = compat_urllib_parse.urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return '%s#%s' % (url, smuggled)
def unsmuggle_url(smug_url, default=None):
    """Extract data hidden by smuggle_url(); returns (clean_url, data)."""
    if '#__youtubedl_smuggle' not in smug_url:
        # Nothing smuggled; hand back the URL untouched.
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    return url, json.loads(jsond)
def format_bytes(bytes):
    """Format a byte count as a human-readable string, e.g. '1.50KiB'.

    Accepts int, float or numeric str; returns 'N/A' for None.
    """
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    if bytes == 0.0:
        # math.log(0) is undefined; zero bytes is simply '0.00B'.
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
    suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
def lookup_unit_table(unit_table, s):
    """Parse a leading '<number><unit>' in s using unit_table multipliers.

    Returns the product as int, or None when no known unit matches.
    """
    units_re = '|'.join(re.escape(unit) for unit in unit_table)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
    if m is None:
        return None
    # Accept ',' as a decimal separator (e.g. "1,5 MiB").
    amount = float(m.group('num').replace(',', '.'))
    return int(amount * unit_table[m.group('unit')])
def parse_filesize(s):
    """Parse a human-readable file size ('5 MiB', '1.2GB', ...) into bytes."""
    # NOTE(review): the None-guard and the large _UNIT_TABLE mapping of unit
    # suffixes to byte multipliers are elided in this view.
    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    return lookup_unit_table(_UNIT_TABLE, s)
    # NOTE(review): the following lines belong to a separate count-parsing
    # helper (its "def" line, guards and its own _UNIT_TABLE are elided in
    # this view) — it parses view counts like '1.5M'.
    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)
    return lookup_unit_table(_UNIT_TABLE, s)
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """
    try:
        return ENGLISH_MONTH_NAMES.index(name) + 1
    except ValueError:
        # Unknown names yield None instead of propagating list.index's error.
        return None
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        three-letter abbreviation (e.g. 'Jan' -> 1) """
    try:
        return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
    except ValueError:
        # Unknown abbreviations yield None instead of raising.
        return None
def fix_xml_ampersands(xml_str):
    """Replace every bare '&' by '&amp;' in XML, leaving entities intact."""
    # The negative lookahead skips named entities and numeric character
    # references that are already well-formed.
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
        '&amp;',
        xml_str)
def setproctitle(title):
    """Set the process name shown by ps/top (Linux, via libc prctl)."""
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        # NOTE(review): the early return for Jython is elided in this view.

    # NOTE(review): the try/except OSError guard around loading libc is
    # elided in this view.
    libc = ctypes.cdll.LoadLibrary('libc.so.6')
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
    try:
        # 15 is presumably PR_SET_NAME — confirm against prctl(2).
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
def remove_start(s, start):
    """Return s without the prefix `start`; s unchanged if it lacks it."""
    if s.startswith(start):
        return s[len(start):]
    # Without this, the function silently returned None for non-matches.
    return s
def remove_end(s, end):
    """Return s without the suffix `end`; s unchanged if it lacks it."""
    # Guard against end == '': s[:-0] would wrongly return ''.
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
def remove_quotes(s):
    """Strip one matching pair of surrounding single or double quotes."""
    if s is None or len(s) < 2:
        return s
    for quote in ('"', "'", ):
        if s[0] == quote and s[-1] == quote:
            return s[1:-1]
    # Mismatched or absent quotes: return the string untouched.
    return s
def url_basename(url):
    """Return the last path component of `url`, ignoring query and fragment."""
    path = compat_urlparse.urlparse(url).path
    components = path.strip('/').split('/')
    return components[-1]
class HEADRequest(compat_urllib_request.Request):
    """A urllib Request that issues HEAD instead of GET."""
    def get_method(self):
        return 'HEAD'
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to int scaled by invscale/scale; `default` when v is None.

    When get_attr is given, the attribute of that name is read from v first.
    """
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    return default if v is None else int(v) * invscale // scale
def str_or_none(v, default=None):
    """Stringify v via compat_str, passing None through as `default`."""
    if v is None:
        return default
    return compat_str(v)
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if int_str is None:
        return None
    # Strip thousands separators ('.'/',') and stray '+' before converting.
    int_str = re.sub(r'[,\.\+]', '', int_str)
    return int(int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to float scaled by invscale/scale; `default` when v is None."""
    # Without this guard the advertised `default` parameter was dead code
    # and None input crashed float().
    if v is None:
        return default
    return float(v) * invscale / scale
def parse_duration(s):
    """Parse a human duration ('1:02:03', '2 hours', '3 min 4.5 s') to seconds."""
    if not isinstance(s, compat_basestring):
        # NOTE(review): the early "return None" and the opening of the
        # verbose "m = re.match(r'''(?x)" pattern are elided in this view.
        (?P<only_mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*|
        (?P<only_hours>[0-9.]+)\s*(?:hours?)|
        \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?\.?|minutes?)\s*|
        (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
        (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
        (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
        (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
    # Whole-unit shortcuts: "5 min" / "2 hours" return immediately.
    if m.group('only_mins'):
        return float_or_none(m.group('only_mins'), invscale=60)
    if m.group('only_hours'):
        return float_or_none(m.group('only_hours'), invscale=60 * 60)
    # Accumulate seconds from the component groups.
    # NOTE(review): "res = 0" and the per-group "if" guards around several of
    # the following additions are elided in this view.
    res += int(m.group('secs'))
    if m.group('mins_reversed'):
        res += int(m.group('mins_reversed')) * 60
    res += int(m.group('mins')) * 60
    if m.group('hours'):
        res += int(m.group('hours')) * 60 * 60
    if m.group('hours_reversed'):
        res += int(m.group('hours_reversed')) * 60 * 60
    res += int(m.group('days')) * 24 * 60 * 60
    res += float(m.group('ms'))
    # NOTE(review): the final "return res" is elided in this view.
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert `ext` before the filename's real extension.

    If expected_real_ext is given and the actual extension differs, `ext` is
    appended after the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}{2}'.format(name, ext, real_ext)
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the filename's extension with `ext`.

    If expected_real_ext is given and the actual extension differs, `ext` is
    appended to the full filename instead of replacing anything.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        base = filename
    else:
        base = name
    return '{0}.{1}'.format(base, ext)
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    # NOTE: `args` is never mutated, so the mutable default is harmless here.
    try:
        subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        # The docstring promises a falsy result for missing binaries.
        return False
    return exe
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    try:
        out, _ = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
    except OSError:
        # Missing binary: honour the documented "False if not present".
        return False
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from --version output via `version_re`.

    Returns `unrecognized` when the pattern does not match.
    """
    assert isinstance(output, compat_str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
    if m:
        return m.group(1)
    else:
        return unrecognized
class PagedList(object):
    # Abstract base for lazily paged result lists; subclasses implement
    # getslice(start, end).
    # NOTE(review): the getslice() stub and the "def __len__(self):" line are
    # elided in this view; the body below belongs to __len__.
        # This is only useful for tests
        return len(self.getslice())
class OnDemandPagedList(PagedList):
    """Paged list that fetches pages on demand via pagefunc(pagenum)."""

    def __init__(self, pagefunc, pagesize, use_cache=False):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache
        # NOTE(review): initialisation of self._cache is elided in this view.

    def getslice(self, start=0, end=None):
        # NOTE(review): "res = []" is elided in this view.
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                # Page lies entirely before the requested slice.
                # NOTE(review): the "continue" is elided in this view.

            # Fetch the page, consulting the cache when enabled.
            # NOTE(review): the "if self._use_cache:" guard is elided.
            page_results = self._cache.get(pagenum)
            if page_results is None:
                page_results = list(self._pagefunc(pagenum))
                self._cache[pagenum] = page_results

            # The next four lines are fragments of the conditional
            # expressions computing startv/endv — the slice bounds of the
            # interesting elements within this page.
            # NOTE(review): the "startv = (" / "else 0)" and "endv = (" /
            # "else None)" delimiters are elided in this view.
                start % self._pagesize
                if firstid <= start < nextfirstid
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                # NOTE(review): the "break" is elided in this view.

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                # NOTE(review): the "break" and final "return res" are elided.
class InAdvancePagedList(PagedList):
    """Paged list whose total page count is known up front."""

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        # NOTE(review): "res = []" is elided in this view.
        start_page = start // self._pagesize
        # Clamp the final page to the known page count.
        # NOTE(review): the "end_page = min(" opener is elided in this view.
            self._pagecount if end is None else (end // self._pagesize + 1))
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            # Drop elements before `start` on the first page only.
            # NOTE(review): the "if skip_elems:" guard and the reset of
            # skip_elems are elided in this view.
            page = page[skip_elems:]
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                # NOTE(review): the "else:" branch opener is elided.
                    page = page[:only_more]
                    # NOTE(review): "res.extend(page)" / "break" and the
                    # final "return res" are elided in this view.
def uppercase_escape(s):
    """Decode literal \\UXXXXXXXX escape sequences embedded in s."""
    decode = codecs.getdecoder('unicode_escape')

    def _expand(m):
        return decode(m.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', _expand, s)
def lowercase_escape(s):
    """Decode literal \\uXXXX escape sequences embedded in s."""
    decode = codecs.getdecoder('unicode_escape')

    def _expand(m):
        return decode(m.group(0))[0]

    return re.sub(r'\\u[0-9a-fA-F]{4}', _expand, s)
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    if sys.version_info < (3, 0) and isinstance(s, compat_str):
        s = s.encode('utf-8')
    # These characters are URL delimiters/sub-delims and stay unescaped.
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe_chars)
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parsed = compat_urllib_parse_urlparse(url)
    # Escape each URL component separately, then reassemble.
    escaped = parsed._replace(
        path=escape_rfc3986(parsed.path),
        params=escape_rfc3986(parsed.params),
        query=escape_rfc3986(parsed.query),
        fragment=escape_rfc3986(parsed.fragment)
    )
    return escaped.geturl()
# Feature-detect struct.pack: on Python 2.6 (and some 2.7 builds) it rejects
# unicode format strings, so wrappers encode the spec to bytes first.
# NOTE(review): the surrounding "try: / except TypeError: / else:" scaffolding
# selecting between the wrappers and the plain aliases is elided in this view.
struct.pack('!I', 0)

# In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
def struct_pack(spec, *args):
    if isinstance(spec, compat_str):
        spec = spec.encode('ascii')
    return struct.pack(spec, *args)

def struct_unpack(spec, *args):
    if isinstance(spec, compat_str):
        spec = spec.encode('ascii')
    return struct.unpack(spec, *args)

# Modern interpreters accept text format strings; use struct directly.
struct_pack = struct.pack
struct_unpack = struct.unpack
def read_batch_urls(batch_fd):
    """Read a batch file object and return its usable URLs as a list."""
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = '\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        url = url.strip()
        # Lines starting with '#', ';' or ']' are comments.
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """urlencode the arguments and return ASCII bytes fit for a POST body."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
def update_url_query(url, query):
    """Return `url` with the pairs in `query` merged into its query string."""
    parsed_url = compat_urlparse.urlparse(url)
    qs = compat_parse_qs(parsed_url.query)
    # Values from `query` override existing parameters of the same name.
    qs.update(query)
    qs = encode_dict(qs)
    new_query = compat_urllib_parse.urlencode(qs, True)
    return compat_urlparse.urlunparse(parsed_url._replace(query=new_query))
def encode_dict(d, encoding='utf-8'):
    """Encode every textual key and value of d to bytes using `encoding`."""
    def _encode(value):
        if isinstance(value, compat_basestring):
            return value.encode(encoding)
        return value

    return dict((_encode(k), _encode(v)) for k, v in d.items())
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Look up one key — or the first usable of several keys — in dict d.

    A candidate is skipped when it is missing, None, or (with
    skip_false_values) falsy.
    """
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)
    for key in key_or_keys:
        if key not in d:
            continue
        value = d[key]
        if value is None:
            continue
        if skip_false_values and not value:
            continue
        return value
    return default
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Coerce to compat_str, decoding byte strings with `encoding`.

    Note: the default encoding is evaluated once, at import time.
    """
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
def parse_age_limit(s):
    """Parse an age limit like '18' or '18+', or a US rating key, to an int.

    Returns None for None input or an unrecognized rating.
    """
    if s is None:
        # Without this guard re.match(None) raises TypeError.
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    return int(m.group('age')) if m else US_RATINGS.get(s)
def strip_jsonp(code):
    """Strip a JSONP wrapper like "callback({...});", keeping the payload."""
    jsonp_pattern = r'(?s)^[a-zA-Z0-9_.]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$'
    return re.sub(jsonp_pattern, r'\1', code)
def js_to_json(code):
    """Convert a JavaScript object literal into valid JSON text."""
    # NOTE(review): the "def fix_kv(m):" opener and "v = m.group(0)" are
    # elided in this view.
        if v in ('true', 'false', 'null'):
            # Bare JS keywords are already valid JSON (return elided).
        if v.startswith('"'):
            # Unescape \' inside double-quoted strings; JSON forbids it.
            v = re.sub(r"\\'", "'", v[1:-1])
        elif v.startswith("'"):
            # Re-escape single-quoted strings as double-quoted JSON strings.
            v = re.sub(r"\\\\|\\'|\"", lambda m: {
            # NOTE(review): the escape-mapping dict, its closing and the
            # '"%s"' % v style return are elided in this view.

    # Rewrite string literals and bare identifiers through fix_kv.
    res = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
        [a-zA-Z_][.a-zA-Z_0-9]*
    # NOTE(review): the "''', fix_kv, code)" closing is elided in this view.
    # Drop trailing commas before ] or }.
    res = re.sub(r',(\s*[\]}])', lambda m: m.group(1), res)
    # NOTE(review): the final "return res" is elided in this view.
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        try:
            return quality_ids.index(qid)
        except ValueError:
            # Unknown qualities sort below every known one.
            return -1
    return q
1878 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) > length:
        # Reserve room for the ellipses inside the length budget.
        return s[:length - len(ELLIPSES)] + ELLIPSES
    return s
def version_tuple(v):
    """Split a version like '2016.03.14-1' into a tuple of ints."""
    # Both '.' and '-' separate numeric components.
    parts = re.split(r'[-.]', v)
    return tuple(map(int, parts))
def is_outdated_version(version, limit, assume_new=True):
    """Return True when `version` is older than `limit`.

    Missing or unparseable versions yield `not assume_new`.
    """
    if not version:
        return not assume_new
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        # Non-numeric components cannot be compared; fall back to the default.
        return not assume_new
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # Updatable when running from the zip bundle or a frozen (exe) build.
    loader = globals().get('__loader__')
    return isinstance(loader, zipimporter) or hasattr(sys, 'frozen')
def args_to_str(args):
    # Get a short string representation for a subprocess command
    quoted = [shlex_quote(arg) for arg in args]
    return ' '.join(quoted)
def error_to_compat_str(err):
    """Return str(err), decoded from the locale encoding on Python 2."""
    err_str = str(err)
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    if sys.version_info[0] < 3:
        err_str = err_str.decode(preferredencoding())
    return err_str
def mimetype2ext(mt):
    """Map a MIME type to a filename extension."""
    # NOTE(review): a None-guard and a table of full-MIME-type special cases
    # are elided in this view.
    _, _, res = mt.rpartition('/')
    # Lookup table for subtypes whose extension is not simply the subtype
    # itself; falls back to the subtype otherwise.
    # NOTE(review): the "return {" opener, most entries and the closing
    # "}.get(res, res)" are elided in this view.
    'smptett+xml': 'tt',
    'x-mp4-fragmented': 'mp4',
def urlhandle_detect_ext(url_handle):
    """Guess a file extension from a urllib response's headers."""
    # NOTE(review): the "try:" opener pairing with the except below is elided.
    getheader = lambda h: url_handle.headers[h]
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader

    # Prefer the filename carried in Content-Disposition, when any.
    cd = getheader('Content-Disposition')
    # NOTE(review): the "if cd:" guard is elided in this view.
    m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
    # NOTE(review): the "if m:" guard is elided in this view.
    e = determine_ext(m.group('filename'), default_ext=None)
    # NOTE(review): "if e: return e" is elided in this view.

    # Fall back to mapping the MIME type.
    return mimetype2ext(getheader('Content-Type'))
def encode_data_uri(data, mime_type):
    """Embed binary `data` in an RFC 2397 data: URI of the given MIME type."""
    b64 = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, b64)
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    if age_limit is None:  # No limit set
        return False
    if content_limit is None:
        return False  # Content available for everyone
    return age_limit < content_limit
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    # Known byte-order marks and the encodings they imply.
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    text = None
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            text = first_bytes[len(bom):].decode(enc, 'replace')
            break
    if text is None:
        text = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', text)
def determine_protocol(info_dict):
    """Return the download protocol for a format dict (rtmp, mms, http...)."""
    protocol = info_dict.get('protocol')
    if protocol is not None:
        # NOTE(review): "return protocol" is elided in this view.

    url = info_dict['url']
    # Infer the protocol from the URL scheme.
    if url.startswith('rtmp'):
        # NOTE(review): "return 'rtmp'" is elided in this view.
    elif url.startswith('mms'):
        # NOTE(review): "return 'mms'" is elided in this view.
    elif url.startswith('rtsp'):
        # NOTE(review): "return 'rtsp'" is elided in this view.

    ext = determine_ext(url)
    # NOTE(review): the m3u8/f4m extension special-casing is elided.

    # Fall back to the plain URL scheme (http, https, ftp, ...).
    return compat_urllib_parse_urlparse(url).scheme
def render_table(header_row, data):
    """Render a header plus data rows as an aligned plain-text table."""
    rows = [header_row] + data
    # Width of each column is the longest cell it contains.
    widths = [max(len(compat_str(cell)) for cell in column)
              for column in zip(*rows)]
    # Left-justify every column but the last to width+1 characters.
    fmt = ' '.join('%-' + compat_str(width + 1) + 's' for width in widths[:-1]) + '%s'
    return '\n'.join(fmt % tuple(row) for row in rows)
def _match_one(filter_part, dct):
    """Evaluate one '&'-separated clause of a --match-filter expression."""
    # Token -> comparison function mapping.
    # NOTE(review): the entries of this dict are elided in this view.
    COMPARISON_OPERATORS = {
    operator_rex = re.compile(r'''(?x)\s*
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
        (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    # NOTE(review): "(?P<key>...)" and the alternation delimiters of the
    # value group are elided from the pattern above in this view.
    m = operator_rex.search(filter_part)
    # NOTE(review): the "if m:" guard is elided in this view.
    op = COMPARISON_OPERATORS[m.group('op')]
    if m.group('strval') is not None:
        if m.group('op') not in ('=', '!='):
            # Only (in)equality makes sense for string comparisons.
            # NOTE(review): the "raise ValueError(" opener is elided.
            'Operator %s does not support string values!' % m.group('op'))
        comparison_value = m.group('strval')
    # NOTE(review): "else: try:" is elided in this view.
        comparison_value = int(m.group('intval'))
    # On ValueError, fall back to parsing a human-readable file size.
        comparison_value = parse_filesize(m.group('intval'))
        if comparison_value is None:
            comparison_value = parse_filesize(m.group('intval') + 'B')
        if comparison_value is None:
            # NOTE(review): the "raise ValueError(" opener is elided.
            'Invalid integer value %r in filter part %r' % (
                m.group('intval'), filter_part))
    actual_value = dct.get(m.group('key'))
    if actual_value is None:
        # Missing keys match only when the '?' suffix was supplied.
        return m.group('none_inclusive')
    return op(actual_value, comparison_value)

    # Unary operators: '' tests key presence, '!' absence.
    # NOTE(review): the "UNARY_OPERATORS = {" opener is elided.
    '': lambda v: v is not None,
    '!': lambda v: v is None,
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    # NOTE(review): the "if m:" guard is elided in this view.
    op = UNARY_OPERATORS[m.group('op')]
    actual_value = dct.get(m.group('key'))
    return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
    # Every '&'-separated clause must hold.
    clauses = filter_str.split('&')
    return all(_match_one(clause, dct) for clause in clauses)
def match_filter_func(filter_str):
    """Build a --match-filter callback: None accepts, a message skips."""
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        video_title = info_dict.get('title', info_dict.get('id', 'video'))
        return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP/TTML time expression into seconds (float), or None.

    Supports plain offsets ('12', '12.5s') and clock times ('HH:MM:SS.mmm';
    a final ':FF' component is treated as a decimal fraction).
    """
    if not time_expr:
        # Guard: re.match(None) would raise TypeError.
        return

    mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if mobj:
        return float(mobj.group('time_offset'))

    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if mobj:
        return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
def srt_subtitles_timecode(seconds):
    """Format a seconds value as an SRT timestamp, HH:MM:SS,mmm."""
    hours, rem = divmod(seconds, 3600)
    mins, secs = divmod(rem, 60)
    millis = (seconds % 1) * 1000
    return '%02d:%02d:%02d,%03d' % (hours, mins, secs, millis)
def dfxp2srt(dfxp_data):
    """Convert DFXP/TTML subtitle markup into SRT text."""
    # Helper resolving tag names in either supported TTML namespace.
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
    # NOTE(review): the closing of the ns_map dict/call is elided.

    class TTMLPElementParser(object):
        # Streaming parser target that flattens one <p> element to text.
        # NOTE(review): the "out = ''" accumulator initialisation, the end()
        # method and parts of data() are elided in this view.

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
                # <br/> becomes a newline (the append is elided in this view).

        def data(self, data):
            # NOTE(review): accumulation into self.out is elided; the return
            # below belongs to a close() method whose def line is elided.
            return self.out.strip()

    def parse_node(node):
        # Re-serialize the node and feed it through the flattening target.
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
    # Accept both TTML namespaces as well as un-namespaced <p> tags.
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p')
    # NOTE(review): "out = []" and the "if not paras:" guard are elided.
        raise ValueError('Invalid dfxp/TTML subtitle')

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            # Paragraphs without a start time are skipped (continue elided).
        # NOTE(review): the "if end_time is None:" guard is elided; the next
        # line derives the end time from the duration in that case.
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            # NOTE(review): the index argument is elided in this view.
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            # NOTE(review): the parse_node(para) argument and the closing of
            # this call, plus the final "return ''.join(out)", are elided.
def cli_option(params, command_option, param):
    """Map params[param] to ['--opt', value]; an unset param yields []."""
    value = params.get(param)
    if value is None:
        return []
    return [command_option, value]
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Render a boolean param as ['--opt', 'true'] or, with a separator,
    as a single '--opt=true' style argument."""
    flag = params.get(param)
    assert isinstance(flag, bool)
    rendered = true_value if flag else false_value
    if separator:
        return [command_option + separator + rendered]
    return [command_option, rendered]
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Emit ['--opt'] when params[param] equals expected_value, else []."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
def cli_configuration_args(params, param, default=[]):
    """Return the list of extra CLI args stored under `param`, or `default`.

    Without the None guard the assert below fires for absent keys.
    """
    ex_args = params.get(param)
    if ex_args is None:
        return default
    assert isinstance(ex_args, list)
    return ex_args
2202 class ISO639Utils(object):
2203 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
2392 def short2long(cls, code):
2393 """Convert language code from ISO 639-1 to ISO 639-2/T"""
2394 return cls._lang_map.get(code[:2])
2397 def long2short(cls, code):
2398 """Convert language code from ISO 639-2/T to ISO 639-1"""
2399 for short_name, long_name in cls._lang_map.items():
2400 if long_name == code:
2404 class ISO3166Utils(object):
2405 # From http://data.okfn.org/data/core/country-list
2407 'AF': 'Afghanistan',
2408 'AX': 'Ã…land Islands',
2411 'AS': 'American Samoa',
2416 'AG': 'Antigua and Barbuda',
2433 'BO': 'Bolivia, Plurinational State of',
2434 'BQ': 'Bonaire, Sint Eustatius and Saba',
2435 'BA': 'Bosnia and Herzegovina',
2437 'BV': 'Bouvet Island',
2439 'IO': 'British Indian Ocean Territory',
2440 'BN': 'Brunei Darussalam',
2442 'BF': 'Burkina Faso',
2448 'KY': 'Cayman Islands',
2449 'CF': 'Central African Republic',
2453 'CX': 'Christmas Island',
2454 'CC': 'Cocos (Keeling) Islands',
2458 'CD': 'Congo, the Democratic Republic of the',
2459 'CK': 'Cook Islands',
2461 'CI': 'Côte d\'Ivoire',
2466 'CZ': 'Czech Republic',
2470 'DO': 'Dominican Republic',
2473 'SV': 'El Salvador',
2474 'GQ': 'Equatorial Guinea',
2478 'FK': 'Falkland Islands (Malvinas)',
2479 'FO': 'Faroe Islands',
2483 'GF': 'French Guiana',
2484 'PF': 'French Polynesia',
2485 'TF': 'French Southern Territories',
2500 'GW': 'Guinea-Bissau',
2503 'HM': 'Heard Island and McDonald Islands',
2504 'VA': 'Holy See (Vatican City State)',
2511 'IR': 'Iran, Islamic Republic of',
2514 'IM': 'Isle of Man',
2524 'KP': 'Korea, Democratic People\'s Republic of',
2525 'KR': 'Korea, Republic of',
2528 'LA': 'Lao People\'s Democratic Republic',
2534 'LI': 'Liechtenstein',
2538 'MK': 'Macedonia, the Former Yugoslav Republic of',
2545 'MH': 'Marshall Islands',
2551 'FM': 'Micronesia, Federated States of',
2552 'MD': 'Moldova, Republic of',
2563 'NL': 'Netherlands',
2564 'NC': 'New Caledonia',
2565 'NZ': 'New Zealand',
2570 'NF': 'Norfolk Island',
2571 'MP': 'Northern Mariana Islands',
2576 'PS': 'Palestine, State of',
2578 'PG': 'Papua New Guinea',
2581 'PH': 'Philippines',
2585 'PR': 'Puerto Rico',
2589 'RU': 'Russian Federation',
2591 'BL': 'Saint Barthélemy',
2592 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
2593 'KN': 'Saint Kitts and Nevis',
2594 'LC': 'Saint Lucia',
2595 'MF': 'Saint Martin (French part)',
2596 'PM': 'Saint Pierre and Miquelon',
2597 'VC': 'Saint Vincent and the Grenadines',
2600 'ST': 'Sao Tome and Principe',
2601 'SA': 'Saudi Arabia',
2605 'SL': 'Sierra Leone',
2607 'SX': 'Sint Maarten (Dutch part)',
2610 'SB': 'Solomon Islands',
2612 'ZA': 'South Africa',
2613 'GS': 'South Georgia and the South Sandwich Islands',
2614 'SS': 'South Sudan',
2619 'SJ': 'Svalbard and Jan Mayen',
2622 'CH': 'Switzerland',
2623 'SY': 'Syrian Arab Republic',
2624 'TW': 'Taiwan, Province of China',
2626 'TZ': 'Tanzania, United Republic of',
2628 'TL': 'Timor-Leste',
2632 'TT': 'Trinidad and Tobago',
2635 'TM': 'Turkmenistan',
2636 'TC': 'Turks and Caicos Islands',
2640 'AE': 'United Arab Emirates',
2641 'GB': 'United Kingdom',
2642 'US': 'United States',
2643 'UM': 'United States Minor Outlying Islands',
2647 'VE': 'Venezuela, Bolivarian Republic of',
2649 'VG': 'Virgin Islands, British',
2650 'VI': 'Virgin Islands, U.S.',
2651 'WF': 'Wallis and Futuna',
2652 'EH': 'Western Sahara',
2659 def short2full(cls, code):
2660 """Convert an ISO 3166-2 country code to the corresponding full name"""
2661 return cls._country_map.get(code.upper())
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler that honours a per-request 'Ytdl-request-proxy' header."""

    def __init__(self, proxies=None):
        # Set default handlers
        # Register http/https openers that funnel into proxy_open with a
        # '__noproxy__' sentinel when no handler-level proxy is configured.
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        # A per-request proxy header overrides the handler-level proxy.
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            # NOTE(review): "proxy = req_proxy" is elided in this view.
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''
    # Interpret the bytes little-endian as one big integer, then apply
    # textbook RSA: payload ** exponent mod modulus.
    payload = int(binascii.hexlify(data[::-1]), 16)
    return '%x' % pow(payload, exponent, modulus)
def encode_base_n(num, n, table=None):
    """Encode a non-negative integer in base `n`, using `table` as digits."""
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    digits = []
    while num:
        num, rem = divmod(num, n)
        digits.append(table[rem])
    return ''.join(reversed(digits))
2720 def decode_packed_codes(code):
2722 r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)",
2724 obfucasted_code, base, count, symbols = mobj.groups()
2727 symbols = symbols.split('|')
2732 base_n_count = encode_base_n(count, base)
2733 symbol_table[base_n_count] = symbols[count] or base_n_count
2736 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],