2 # -*- coding: utf-8 -*-
4 from __future__ import unicode_literals
34 import xml.etree.ElementTree
41 compat_etree_fromstring,
46 compat_socket_create_connection,
50 compat_urllib_parse_urlencode,
51 compat_urllib_parse_urlparse,
52 compat_urllib_request,
59 # This is not clearly defined otherwise
60 compiled_regex_type = type(re.compile(''))
63 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/44.0 (Chrome)',
64 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
65 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
66 'Accept-Encoding': 'gzip, deflate',
67 'Accept-Language': 'en-us,en;q=0.5',
73 ENGLISH_MONTH_NAMES = [
74 'January', 'February', 'March', 'April', 'May', 'June',
75 'July', 'August', 'September', 'October', 'November', 'December']
78 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
79 'flv', 'f4v', 'f4a', 'f4b',
80 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
90 'f4f', 'f4m', 'm3u8', 'smil')
93 def preferredencoding():
94 """Get preferred encoding.
96 Returns the best encoding scheme for the system, based on
97 locale.getpreferredencoding() and some further tweaks.
100 pref = locale.getpreferredencoding()
108 def write_json_file(obj, fn):
109 """ Encode obj as JSON and write it to fn, atomically if possible """
111 fn = encodeFilename(fn)
112 if sys.version_info < (3, 0) and sys.platform != 'win32':
113 encoding = get_filesystem_encoding()
114 # os.path.basename returns a bytes object, but NamedTemporaryFile
115 # will fail if the filename contains non ascii characters unless we
116 # use a unicode object
117 path_basename = lambda f: os.path.basename(fn).decode(encoding)
118 # the same for os.path.dirname
119 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
121 path_basename = os.path.basename
122 path_dirname = os.path.dirname
126 'prefix': path_basename(fn) + '.',
127 'dir': path_dirname(fn),
131 # In Python 2.x, json.dump expects a bytestream.
132 # In Python 3.x, it writes to a character stream
133 if sys.version_info < (3, 0):
141 tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
146 if sys.platform == 'win32':
147 # Need to remove existing file on Windows, else os.rename raises
148 # WindowsError or FileExistsError.
153 os.rename(tf.name, fn)
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """Find the first element matching xpath that carries attribute key.

        If val is given, the attribute value must equal val exactly.
        Returns the matching element, or None when nothing matches.
        """
        assert re.match(r'^[a-zA-Z_-]+$', key)
        # Do NOT build an XPath predicate such as "[@key='val']" by string
        # interpolation: a value containing a quote (e.g. "O'Brien") cannot
        # be embedded in an XPath string literal and makes ElementTree raise
        # SyntaxError. Scan the candidate elements manually instead.
        for f in node.findall(xpath):
            if key not in f.attrib:
                continue
            if val is None or f.attrib.get(key) == val:
                return f
        return None
169 def find_xpath_attr(node, xpath, key, val=None):
170 for f in node.findall(compat_xpath(xpath)):
171 if key not in f.attrib:
173 if val is None or f.attrib.get(key) == val:
177 # On python2.6 the xml.etree.ElementTree.Element methods don't support
178 # the namespace parameter
181 def xpath_with_ns(path, ns_map):
182 components = [c.split(':') for c in path.split('/')]
186 replaced.append(c[0])
189 replaced.append('{%s}%s' % (ns_map[ns], tag))
190 return '/'.join(replaced)
193 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
194 def _find_xpath(xpath):
195 return node.find(compat_xpath(xpath))
197 if isinstance(xpath, (str, compat_str)):
198 n = _find_xpath(xpath)
206 if default is not NO_DEFAULT:
209 name = xpath if name is None else name
210 raise ExtractorError('Could not find XML element %s' % name)
216 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
217 n = xpath_element(node, xpath, name, fatal=fatal, default=default)
218 if n is None or n == default:
221 if default is not NO_DEFAULT:
224 name = xpath if name is None else name
225 raise ExtractorError('Could not find XML element\'s text %s' % name)
231 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
232 n = find_xpath_attr(node, xpath, key)
234 if default is not NO_DEFAULT:
237 name = '%s[@%s]' % (xpath, key) if name is None else name
238 raise ExtractorError('Could not find XML attribute %s' % name)
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # An id lookup is simply an attribute lookup on the 'id' attribute.
    return get_element_by_attribute('id', id, html)
249 def get_element_by_attribute(attribute, value, html):
250 """Return the content of the tag with the specified attribute in the passed HTML document"""
252 m = re.search(r'''(?xs)
254 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
256 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
260 ''' % (re.escape(attribute), re.escape(value)), html)
264 res = m.group('content')
266 if res.startswith('"') or res.startswith("'"):
269 return unescapeHTML(res)
272 class HTMLAttributeParser(compat_HTMLParser):
273 """Trivial HTML parser to gather the attributes for a single element"""
276 compat_HTMLParser.__init__(self)
278 def handle_starttag(self, tag, attrs):
279 self.attrs = dict(attrs)
282 def extract_attributes(html_element):
283 """Given a string for an HTML element such as
285 a="foo" B="bar" c="&98;az" d=boz
286 empty= noval entity="&"
289 Decode and return a dictionary of attributes.
291 'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
292 'empty': '', 'noval': None, 'entity': '&',
293 'sq': '"', 'dq': '\''
295 NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
296 but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
298 parser = HTMLAttributeParser()
299 parser.feed(html_element)
304 def clean_html(html):
305 """Clean an HTML snippet into a readable string"""
307 if html is None: # Convenience for sanitizing descriptions etc.
311 html = html.replace('\n', ' ')
312 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
313 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
315 html = re.sub('<.*?>', '', html)
316 # Replace html entities
317 html = unescapeHTML(html)
321 def sanitize_open(filename, open_mode):
322 """Try to open the given filename, and slightly tweak it if this fails.
324 Attempts to open the given filename. If this fails, it tries to change
325 the filename slightly, step by step, until it's either able to open it
326 or it fails and raises a final exception, like the standard open()
329 It returns the tuple (stream, definitive_file_name).
333 if sys.platform == 'win32':
335 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
336 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
337 stream = open(encodeFilename(filename), open_mode)
338 return (stream, filename)
339 except (IOError, OSError) as err:
340 if err.errno in (errno.EACCES,):
343 # In case of error, try to remove win32 forbidden chars
344 alt_filename = sanitize_path(filename)
345 if alt_filename == filename:
348 # An exception here should be caught in the caller
349 stream = open(encodeFilename(alt_filename), open_mode)
350 return (stream, alt_filename)
353 def timeconvert(timestr):
354 """Convert RFC 2822 defined time string into system timestamp"""
356 timetuple = email.utils.parsedate_tz(timestr)
357 if timetuple is not None:
358 timestamp = email.utils.mktime_tz(timetuple)
362 def sanitize_filename(s, restricted=False, is_id=False):
363 """Sanitizes a string so it could be used as part of a filename.
364 If restricted is set, use a stricter subset of allowed characters.
365 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
367 def replace_insane(char):
368 if char == '?' or ord(char) < 32 or ord(char) == 127:
371 return '' if restricted else '\''
373 return '_-' if restricted else ' -'
374 elif char in '\\/|*<>':
376 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
378 if restricted and ord(char) > 127:
383 s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
384 result = ''.join(map(replace_insane, s))
386 while '__' in result:
387 result = result.replace('__', '_')
388 result = result.strip('_')
389 # Common case of "Foreign band name - English song title"
390 if restricted and result.startswith('-_'):
392 if result.startswith('-'):
393 result = '_' + result[len('-'):]
394 result = result.lstrip('.')
400 def sanitize_path(s):
401 """Sanitizes and normalizes path on Windows"""
402 if sys.platform != 'win32':
404 drive_or_unc, _ = os.path.splitdrive(s)
405 if sys.version_info < (2, 7) and not drive_or_unc:
406 drive_or_unc, _ = os.path.splitunc(s)
407 norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
411 path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part)
412 for path_part in norm_path]
414 sanitized_path.insert(0, drive_or_unc + os.path.sep)
415 return os.path.join(*sanitized_path)
418 # Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
419 # unwanted failures due to missing protocol
def sanitize_url(url):
    """Prepend the 'http:' scheme to scheme-relative ('//host/...') URLs.

    Mitigates failures caused by URLs that omit a protocol; every other
    URL is returned unchanged.
    """
    if url.startswith('//'):
        return 'http:%s' % url
    return url
def sanitized_Request(url, *args, **kwargs):
    """Build a urllib Request, first normalizing scheme-relative URLs."""
    return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs)
428 def orderedSet(iterable):
429 """ Remove all duplicates from the input iterable """
437 def _htmlentity_transform(entity):
438 """Transforms an HTML entity to a character."""
439 # Known non-numeric HTML entity
440 if entity in compat_html_entities.name2codepoint:
441 return compat_chr(compat_html_entities.name2codepoint[entity])
443 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
445 numstr = mobj.group(1)
446 if numstr.startswith('x'):
448 numstr = '0%s' % numstr
451 # See https://github.com/rg3/youtube-dl/issues/7518
453 return compat_chr(int(numstr, base))
457 # Unknown entity in name, return its literal representation
458 return '&%s;' % entity
464 assert type(s) == compat_str
467 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
470 def get_subprocess_encoding():
471 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
472 # For subprocess calls, encode with locale encoding
473 # Refer to http://stackoverflow.com/a/9951851/35070
474 encoding = preferredencoding()
476 encoding = sys.getfilesystemencoding()
482 def encodeFilename(s, for_subprocess=False):
484 @param s The name of the file
487 assert type(s) == compat_str
489 # Python 3 has a Unicode API
490 if sys.version_info >= (3, 0):
493 # Pass '' directly to use Unicode APIs on Windows 2000 and up
494 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
495 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
496 if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
499 # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
500 if sys.platform.startswith('java'):
503 return s.encode(get_subprocess_encoding(), 'ignore')
506 def decodeFilename(b, for_subprocess=False):
508 if sys.version_info >= (3, 0):
511 if not isinstance(b, bytes):
514 return b.decode(get_subprocess_encoding(), 'ignore')
def encodeArgument(s):
    """Encode a program argument for handing to a subprocess."""
    if isinstance(s, compat_str):
        return encodeFilename(s, True)
    # Legacy code paths still pass byte strings; decode them first.
    # Uncomment the following line after fixing all post processors:
    # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    return encodeFilename(s.decode('ascii'), True)
def decodeArgument(b):
    """Decode a subprocess argument back to text (counterpart of encodeArgument)."""
    return decodeFilename(b, True)
530 def decodeOption(optval):
533 if isinstance(optval, bytes):
534 optval = optval.decode(preferredencoding())
536 assert isinstance(optval, compat_str)
540 def formatSeconds(secs):
542 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
544 return '%d:%02d' % (secs // 60, secs % 60)
549 def make_HTTPS_handler(params, **kwargs):
550 opts_no_check_certificate = params.get('nocheckcertificate', False)
551 if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9
552 context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
553 if opts_no_check_certificate:
554 context.check_hostname = False
555 context.verify_mode = ssl.CERT_NONE
557 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
560 # (create_default_context present but HTTPSHandler has no context=)
563 if sys.version_info < (3, 2):
564 return YoutubeDLHTTPSHandler(params, **kwargs)
566 context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
567 context.verify_mode = (ssl.CERT_NONE
568 if opts_no_check_certificate
569 else ssl.CERT_REQUIRED)
570 context.set_default_verify_paths()
571 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
574 def bug_reports_message():
575 if ytdl_is_updateable():
576 update_cmd = 'type youtube-dl -U to update'
578 update_cmd = 'see https://yt-dl.org/update on how to update'
579 msg = '; please report this issue on https://yt-dl.org/bug .'
580 msg += ' Make sure you are using the latest version; %s.' % update_cmd
581 msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
585 class ExtractorError(Exception):
586 """Error during info extraction."""
588 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
589 """ tb, if given, is the original traceback (so that it can be printed out).
590 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
593 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
595 if video_id is not None:
596 msg = video_id + ': ' + msg
598 msg += ' (caused by %r)' % cause
600 msg += bug_reports_message()
601 super(ExtractorError, self).__init__(msg)
604 self.exc_info = sys.exc_info() # preserve original exception
606 self.video_id = video_id
608 def format_traceback(self):
609 if self.traceback is None:
611 return ''.join(traceback.format_tb(self.traceback))
class UnsupportedError(ExtractorError):
    """Raised when no extractor can handle the given URL."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        # expected=True: this is a normal condition, not a youtube-dl bug.
        super(UnsupportedError, self).__init__(message, expected=True)
621 class RegexNotFoundError(ExtractorError):
622 """Error when a regex didn't match"""
626 class DownloadError(Exception):
627 """Download Error exception.
629 This exception may be thrown by FileDownloader objects if they are not
630 configured to continue on errors. They will contain the appropriate
634 def __init__(self, msg, exc_info=None):
635 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
636 super(DownloadError, self).__init__(msg)
637 self.exc_info = exc_info
640 class SameFileError(Exception):
641 """Same File exception.
643 This exception will be thrown by FileDownloader objects if they detect
644 multiple files would have to be downloaded to the same file on disk.
649 class PostProcessingError(Exception):
650 """Post Processing exception.
652 This exception may be raised by PostProcessor's .run() method to
653 indicate an error in the postprocessing task.
656 def __init__(self, msg):
660 class MaxDownloadsReached(Exception):
661 """ --max-downloads limit has been reached. """
665 class UnavailableVideoError(Exception):
666 """Unavailable Format exception.
668 This exception will be thrown when a video is requested
669 in a format that is not available for that video.
674 class ContentTooShortError(Exception):
675 """Content Too Short exception.
677 This exception may be raised by FileDownloader objects when a file they
678 download is too small for what the server announced first, indicating
679 the connection was probably interrupted.
682 def __init__(self, downloaded, expected):
684 self.downloaded = downloaded
685 self.expected = expected
688 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
689 # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
690 # expected HTTP responses to meet HTTP/1.0 or later (see also
691 # https://github.com/rg3/youtube-dl/issues/6727)
692 if sys.version_info < (3, 0):
693 kwargs[b'strict'] = True
694 hc = http_class(*args, **kwargs)
695 source_address = ydl_handler._params.get('source_address')
696 if source_address is not None:
697 sa = (source_address, 0)
698 if hasattr(hc, 'source_address'): # Python 2.7+
699 hc.source_address = sa
701 def _hc_connect(self, *args, **kwargs):
702 sock = compat_socket_create_connection(
703 (self.host, self.port), self.timeout, sa)
705 self.sock = ssl.wrap_socket(
706 sock, self.key_file, self.cert_file,
707 ssl_version=ssl.PROTOCOL_TLSv1)
710 hc.connect = functools.partial(_hc_connect, hc)
def handle_youtubedl_headers(headers):
    """Translate internal Youtubedl-* pseudo-headers into real behavior.

    'Youtubedl-no-compression' is not a real HTTP header: when present,
    any Accept-Encoding header is dropped (so the response is not
    compressed) and the marker itself is removed. The input mapping is
    left untouched; a new dict is returned only when filtering occurs.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers
    filtered = dict((name, value) for name, value in headers.items()
                    if name.lower() != 'accept-encoding')
    del filtered['Youtubedl-no-compression']
    return filtered
725 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
726 """Handler for HTTP requests and responses.
728 This class, when installed with an OpenerDirector, automatically adds
729 the standard headers to every HTTP request and handles gzipped and
730 deflated responses from web servers. If compression is to be avoided in
731 a particular request, the original request in the program code only has
732 to include the HTTP header "Youtubedl-no-compression", which will be
733 removed before making the real request.
735 Part of this code was copied from:
737 http://techknack.net/python-urllib2-handlers/
739 Andrew Rowls, the author of that code, agreed to release it to the
743 def __init__(self, params, *args, **kwargs):
744 compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
745 self._params = params
747 def http_open(self, req):
748 return self.do_open(functools.partial(
749 _create_http_connection, self, compat_http_client.HTTPConnection, False),
755 return zlib.decompress(data, -zlib.MAX_WBITS)
757 return zlib.decompress(data)
760 def addinfourl_wrapper(stream, headers, url, code):
761 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
762 return compat_urllib_request.addinfourl(stream, headers, url, code)
763 ret = compat_urllib_request.addinfourl(stream, headers, url)
767 def http_request(self, req):
768 # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
769 # always respected by websites, some tend to give out URLs with non percent-encoded
770 # non-ASCII characters (see telemb.py, ard.py [#3412])
771 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
772 # To work around aforementioned issue we will replace request's original URL with
773 # percent-encoded one
774 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
775 # the code of this workaround has been moved here from YoutubeDL.urlopen()
776 url = req.get_full_url()
777 url_escaped = escape_url(url)
779 # Substitute URL if any change after escaping
780 if url != url_escaped:
781 req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
783 url_escaped, data=req.data, headers=req.headers,
784 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
785 new_req.timeout = req.timeout
788 for h, v in std_headers.items():
789 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
790 # The dict keys are capitalized because of this bug by urllib
791 if h.capitalize() not in req.headers:
794 req.headers = handle_youtubedl_headers(req.headers)
796 if sys.version_info < (2, 7) and '#' in req.get_full_url():
797 # Python 2.6 is brain-dead when it comes to fragments
798 req._Request__original = req._Request__original.partition('#')[0]
799 req._Request__r_type = req._Request__r_type.partition('#')[0]
803 def http_response(self, req, resp):
806 if resp.headers.get('Content-encoding', '') == 'gzip':
807 content = resp.read()
808 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
810 uncompressed = io.BytesIO(gz.read())
811 except IOError as original_ioerror:
812 # There may be junk add the end of the file
813 # See http://stackoverflow.com/q/4928560/35070 for details
814 for i in range(1, 1024):
816 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
817 uncompressed = io.BytesIO(gz.read())
822 raise original_ioerror
823 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
824 resp.msg = old_resp.msg
825 del resp.headers['Content-encoding']
827 if resp.headers.get('Content-encoding', '') == 'deflate':
828 gz = io.BytesIO(self.deflate(resp.read()))
829 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
830 resp.msg = old_resp.msg
831 del resp.headers['Content-encoding']
832 # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
833 # https://github.com/rg3/youtube-dl/issues/6457).
834 if 300 <= resp.code < 400:
835 location = resp.headers.get('Location')
837 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
838 if sys.version_info >= (3, 0):
839 location = location.encode('iso-8859-1').decode('utf-8')
840 location_escaped = escape_url(location)
841 if location != location_escaped:
842 del resp.headers['Location']
843 resp.headers['Location'] = location_escaped
846 https_request = http_request
847 https_response = http_response
850 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
851 def __init__(self, params, https_conn_class=None, *args, **kwargs):
852 compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
853 self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
854 self._params = params
856 def https_open(self, req):
858 if hasattr(self, '_context'): # python > 2.6
859 kwargs['context'] = self._context
860 if hasattr(self, '_check_hostname'): # python 3.x
861 kwargs['check_hostname'] = self._check_hostname
862 return self.do_open(functools.partial(
863 _create_http_connection, self, self._https_conn_class, True),
867 class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
868 def __init__(self, cookiejar=None):
869 compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)
871 def http_response(self, request, response):
872 # Python 2 will choke on next HTTP request in row if there are non-ASCII
873 # characters in Set-Cookie HTTP header of last response (see
874 # https://github.com/rg3/youtube-dl/issues/6769).
875 # In order to at least prevent crashing we will percent encode Set-Cookie
876 # header before HTTPCookieProcessor starts processing it.
877 # if sys.version_info < (3, 0) and response.headers:
878 # for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
879 # set_cookie = response.headers.get(set_cookie_header)
881 # set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
882 # if set_cookie != set_cookie_escaped:
883 # del response.headers[set_cookie_header]
884 # response.headers[set_cookie_header] = set_cookie_escaped
885 return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)
887 https_request = compat_urllib_request.HTTPCookieProcessor.http_request
888 https_response = http_response
891 def parse_iso8601(date_str, delimiter='T', timezone=None):
892 """ Return a UNIX timestamp from the given date """
897 date_str = re.sub(r'\.[0-9]+', '', date_str)
901 r'(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
904 timezone = datetime.timedelta()
906 date_str = date_str[:-len(m.group(0))]
907 if not m.group('sign'):
908 timezone = datetime.timedelta()
910 sign = 1 if m.group('sign') == '+' else -1
911 timezone = datetime.timedelta(
912 hours=sign * int(m.group('hours')),
913 minutes=sign * int(m.group('minutes')))
915 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
916 dt = datetime.datetime.strptime(date_str, date_format) - timezone
917 return calendar.timegm(dt.timetuple())
922 def unified_strdate(date_str, day_first=True):
923 """Return a string with the date in the format YYYYMMDD"""
929 date_str = date_str.replace(',', ' ')
930 # %z (UTC offset) is only supported in python>=3.2
931 if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
932 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
933 # Remove AM/PM + timezone
934 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
936 format_expressions = [
949 '%Y-%m-%d %H:%M:%S.%f',
952 '%Y-%m-%dT%H:%M:%SZ',
953 '%Y-%m-%dT%H:%M:%S.%fZ',
954 '%Y-%m-%dT%H:%M:%S.%f0Z',
956 '%Y-%m-%dT%H:%M:%S.%f',
960 format_expressions.extend([
968 format_expressions.extend([
975 for expression in format_expressions:
977 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
980 if upload_date is None:
981 timetuple = email.utils.parsedate_tz(date_str)
983 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
984 if upload_date is not None:
985 return compat_str(upload_date)
988 def determine_ext(url, default_ext='unknown_video'):
991 guess = url.partition('?')[0].rpartition('.')[2]
992 if re.match(r'^[A-Za-z0-9]+$', guess):
994 # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
995 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
996 return guess.rstrip('/')
def subtitles_filename(filename, sub_lang, sub_format):
    """Derive a subtitle file name from a media file name.

    'video.mp4', 'en', 'vtt' -> 'video.en.vtt'; the part after the last
    dot (if any) is replaced by '<lang>.<format>'.
    """
    stem = filename.rsplit('.', 1)[0]
    return '.'.join((stem, sub_lang, sub_format))
1005 def date_from_str(date_str):
1007 Return a datetime object from a string in the format YYYYMMDD or
1008 (now|today)[+-][0-9](day|week|month|year)(s)?"""
1009 today = datetime.date.today()
1010 if date_str in ('now', 'today'):
1012 if date_str == 'yesterday':
1013 return today - datetime.timedelta(days=1)
1014 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
1015 if match is not None:
1016 sign = match.group('sign')
1017 time = int(match.group('time'))
1020 unit = match.group('unit')
1021 # A bad approximation?
1025 elif unit == 'year':
1029 delta = datetime.timedelta(**{unit: time})
1030 return today + delta
1031 return datetime.datetime.strptime(date_str, '%Y%m%d').date()
1034 def hyphenate_date(date_str):
1036 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1037 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1038 if match is not None:
1039 return '-'.join(match.groups())
1044 class DateRange(object):
1045 """Represents a time interval between two dates"""
1047 def __init__(self, start=None, end=None):
1048 """start and end must be strings in the format accepted by date"""
1049 if start is not None:
1050 self.start = date_from_str(start)
1052 self.start = datetime.datetime.min.date()
1054 self.end = date_from_str(end)
1056 self.end = datetime.datetime.max.date()
1057 if self.start > self.end:
1058 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
1062 """Returns a range that only contains the given day"""
1063 return cls(day, day)
1065 def __contains__(self, date):
1066 """Check if the date is in the range"""
1067 if not isinstance(date, datetime.date):
1068 date = date_from_str(date)
1069 return self.start <= date <= self.end
1072 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
1075 def platform_name():
1076 """ Returns the platform name as a compat_str """
1077 res = platform.platform()
1078 if isinstance(res, bytes):
1079 res = res.decode(preferredencoding())
1081 assert isinstance(res, compat_str)
1085 def _windows_write_string(s, out):
1086 """ Returns True if the string was written using special methods,
1087 False if it has yet to be written out."""
1088 # Adapted from http://stackoverflow.com/a/3259271/35070
1091 import ctypes.wintypes
1099 fileno = out.fileno()
1100 except AttributeError:
1101 # If the output stream doesn't have a fileno, it's virtual
1103 except io.UnsupportedOperation:
1104 # Some strange Windows pseudo files?
1106 if fileno not in WIN_OUTPUT_IDS:
1109 GetStdHandle = ctypes.WINFUNCTYPE(
1110 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
1111 (b'GetStdHandle', ctypes.windll.kernel32))
1112 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
1114 WriteConsoleW = ctypes.WINFUNCTYPE(
1115 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
1116 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
1117 ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
1118 written = ctypes.wintypes.DWORD(0)
1120 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
1121 FILE_TYPE_CHAR = 0x0002
1122 FILE_TYPE_REMOTE = 0x8000
1123 GetConsoleMode = ctypes.WINFUNCTYPE(
1124 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
1125 ctypes.POINTER(ctypes.wintypes.DWORD))(
1126 (b'GetConsoleMode', ctypes.windll.kernel32))
1127 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
1129 def not_a_console(handle):
1130 if handle == INVALID_HANDLE_VALUE or handle is None:
1132 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
1133 GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
1135 if not_a_console(h):
1138 def next_nonbmp_pos(s):
1140 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
1141 except StopIteration:
1145 count = min(next_nonbmp_pos(s), 1024)
1147 ret = WriteConsoleW(
1148 h, s, count if count else 2, ctypes.byref(written), None)
1150 raise OSError('Failed to write string')
1151 if not count: # We just wrote a non-BMP character
1152 assert written.value == 2
1155 assert written.value > 0
1156 s = s[written.value:]
1160 def write_string(s, out=None, encoding=None):
1163 assert type(s) == compat_str
1165 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
1166 if _windows_write_string(s, out):
1169 if ('b' in getattr(out, 'mode', '') or
1170 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
1171 byt = s.encode(encoding or preferredencoding(), 'ignore')
1173 elif hasattr(out, 'buffer'):
1174 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1175 byt = s.encode(enc, 'ignore')
1176 out.buffer.write(byt)
1182 def bytes_to_intlist(bs):
1185 if isinstance(bs[0], int): # Python 3
1188 return [ord(c) for c in bs]
1191 def intlist_to_bytes(xs):
1194 return struct_pack('%dB' % len(xs), *xs)
# Cross-platform file locking: Windows uses LockFileEx/UnlockFileEx via
# ctypes; elsewhere fcntl.flock is used; where neither exists, the helpers
# raise IOError.  NOTE(review): the `else:` / `try: import fcntl` /
# `except ImportError:` scaffolding that selects between the three
# implementations appears elided from this chunk — confirm against the
# full source.
if sys.platform == 'win32':
    import ctypes.wintypes

    class OVERLAPPED(ctypes.Structure):
        # OVERLAPPED struct passed to LockFileEx/UnlockFileEx.
        # NOTE(review): the `_fields_ = [` opener and closing bracket for
        # these entries appear elided from this chunk.
        ('Internal', ctypes.wintypes.LPVOID),
        ('InternalHigh', ctypes.wintypes.LPVOID),
        ('Offset', ctypes.wintypes.DWORD),
        ('OffsetHigh', ctypes.wintypes.DWORD),
        ('hEvent', ctypes.wintypes.HANDLE),

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high dwords of the byte range to lock: effectively "the whole file".
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # 0x2 == LOCKFILE_EXCLUSIVE_LOCK; 0x0 requests a shared lock.
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

    # Some platforms, such as Jython, are missing fcntl
    def _lock_file(f, exclusive):
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        fcntl.flock(f, fcntl.LOCK_UN)

    UNSUPPORTED_MSG = 'file locking is not supported on this platform'

    def _lock_file(f, exclusive):
        raise IOError(UNSUPPORTED_MSG)

    def _unlock_file(f):
        raise IOError(UNSUPPORTED_MSG)
class locked_file(object):
    """File wrapper that holds an OS-level lock (via the module-level
    _lock_file/_unlock_file helpers) while used as a context manager.

    NOTE(review): several lines of this class (storing ``mode`` on self,
    ``return self`` in __enter__, try/finally around closing the file in
    __exit__) appear elided from this chunk — confirm against the full
    source.
    """

    def __init__(self, filename, mode, encoding=None):
        # Only plain read/append/write text modes are supported.
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)

    def __enter__(self):
        # Reads take a shared lock; 'a'/'w' modes take an exclusive one.
        exclusive = self.mode != 'r'
        _lock_file(self.f, exclusive)

    def __exit__(self, etype, value, traceback):
        _unlock_file(self.f)

    def write(self, *args):
        # Delegate to the underlying file object.
        return self.f.write(*args)

    def read(self, *args):
        # Delegate to the underlying file object.
        return self.f.read(*args)
def get_filesystem_encoding():
    """Return the filesystem encoding, falling back to UTF-8 when unknown."""
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        return 'utf-8'
    return encoding
def shell_quote(args):
    # Quote each argument so the list can be shown as a copy-pastable shell
    # command line.
    # NOTE(review): the `quoted_args = []` initialiser and the
    # `for a in args:` loop header appear elided from this chunk — confirm
    # against the full source.
    encoding = get_filesystem_encoding()
    if isinstance(a, bytes):
        # We may get a filename encoded with 'encodeFilename'
        a = a.decode(encoding)
    quoted_args.append(pipes.quote(a))
    return ' '.join(quoted_args)
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    payload = json.dumps(data)
    sdata = compat_urllib_parse_urlencode({'__youtubedl_smuggle': payload})
    return url + '#' + sdata
def unsmuggle_url(smug_url, default=None):
    """Reverse smuggle_url(): return (url, data) or (url, default) when no
    smuggled payload is present."""
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    data = json.loads(jsond)
    # The visible chunk lacked this return; without it the function returned None.
    return url, data
def format_bytes(bytes):
    # Human-readable byte count, e.g. 1536 -> '1.50KiB'.
    # NOTE(review): upstream guards against None and 0 before calling
    # math.log(); those lines appear elided from this chunk.
    if type(bytes) is str:
        bytes = float(bytes)
    # Largest power of 1024 that fits selects the suffix.
    exponent = int(math.log(bytes, 1024.0))
    suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
def lookup_unit_table(unit_table, s):
    """Parse '<number><unit>' from s using unit_table (unit -> multiplier).

    Returns the value in base units as an int, or None when s does not
    start with a recognizable number+unit.
    """
    units_re = '|'.join(re.escape(u) for u in unit_table)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
    if not m:
        return None
    # Accept ',' as a decimal separator too (e.g. '1,5MiB').
    num_str = m.group('num').replace(',', '.')
    mult = unit_table[m.group('unit')]
    return int(float(num_str) * mult)
def parse_filesize(s):
    # Parse a human-readable file size ('123 KiB', '1.5GB', ...) into bytes.
    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    # NOTE(review): the None-guard and the whole _UNIT_TABLE dict literal
    # appear elided from this chunk — confirm against the full source.
    return lookup_unit_table(_UNIT_TABLE, s)
    # NOTE(review): the lines below appear to belong to a separate
    # count-parsing helper whose `def` header (and its own _UNIT_TABLE)
    # are elided from this chunk.
    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)
    return lookup_unit_table(_UNIT_TABLE, s)
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """
    # NOTE(review): upstream wraps this in try/except ValueError and returns
    # None for unknown names; that wrapper appears elided from this chunk.
    return ENGLISH_MONTH_NAMES.index(name) + 1
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        three-letter abbreviation, e.g. 'Jan' -> 1. """
    # NOTE(review): upstream wraps this in try/except ValueError and returns
    # None for unknown abbreviations; that wrapper appears elided here.
    return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML, leaving existing entities alone."""
    # Negative lookahead skips ampersands that already begin an entity
    # (named or numeric character reference).
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
        '&amp;', xml_str)
def setproctitle(title):
    # Best-effort: set the process name via libc prctl(PR_SET_NAME).
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        # NOTE(review): the early-return body of this guard and the
        # try/except OSError around loading libc appear elided from this
        # chunk — confirm against the full source.
    libc = ctypes.cdll.LoadLibrary('libc.so.6')
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
    # 15 == PR_SET_NAME (see prctl(2)).
    libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
def remove_start(s, start):
    """Return s with the prefix `start` removed; s unchanged otherwise."""
    if s.startswith(start):
        return s[len(start):]
    # The visible chunk lacked this fall-through return, which made the
    # function return None for strings without the prefix.
    return s
def remove_end(s, end):
    """Return s with the suffix `end` removed; s unchanged otherwise."""
    # Guard on `end` being non-empty too: s[:-0] would wrongly yield ''
    # for an empty suffix.
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
def remove_quotes(s):
    """Strip one matching pair of surrounding single or double quotes.

    Returns s unchanged (including None) when there is no matching pair.
    """
    if s is None or len(s) < 2:
        return s
    for quote in ('"', "'", ):
        if s[0] == quote and s[-1] == quote:
            return s[1:-1]
    return s
def url_basename(url):
    """Return the last path component of `url`, ignoring query and fragment."""
    parsed_path = compat_urlparse.urlparse(url).path
    segments = parsed_path.strip('/').split('/')
    return segments[-1]
class HEADRequest(compat_urllib_request.Request):
    """urllib Request subclass that issues a HEAD instead of a GET."""

    def get_method(self):
        # The visible chunk lacked this return; urllib uses it to pick the verb.
        return 'HEAD'
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Coerce v to int, scaled by invscale/scale.

    When get_attr is given, v is replaced by getattr(v, get_attr, None)
    first.  Returns `default` when the value ends up None.
    """
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    if v is None:
        return default
    return int(v) * invscale // scale
def str_or_none(v, default=None):
    """Return compat_str(v), or `default` when v is None."""
    if v is None:
        return default
    return compat_str(v)
def str_to_int(int_str):
    """ A more relaxed version of int_or_none: strips separators first.

    Returns None for None input.
    """
    if int_str is None:
        return None
    # Drop thousands separators (',' or '.') and a leading '+'.
    int_str = re.sub(r'[,\.\+]', '', int_str)
    return int(int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Coerce v to float scaled by invscale/scale; `default` when v is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
def parse_duration(s):
    # Parse a duration expression ('3 min', '01:02:03.05', '1h 2m 3s', ...)
    # into a number of seconds using one big alternation regex.
    # NOTE(review): this chunk is heavily elided — the early return for
    # non-string input, the `m = re.match(r'''(?x) ... ''', s)` wrapper
    # around the pattern fragments below, the `res = 0` accumulator setup
    # and the final return are all missing; confirm against the full source.
    if not isinstance(s, compat_basestring):
    (?P<only_mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*|
    (?P<only_hours>[0-9.]+)\s*(?:hours?)|
    \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?\.?|minutes?)\s*|
    (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
    (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
    (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
    # Minutes-only and hours-only inputs are converted directly.
    if m.group('only_mins'):
        return float_or_none(m.group('only_mins'), invscale=60)
    if m.group('only_hours'):
        return float_or_none(m.group('only_hours'), invscale=60 * 60)
    res += int(m.group('secs'))
    if m.group('mins_reversed'):
        res += int(m.group('mins_reversed')) * 60
    res += int(m.group('mins')) * 60
    if m.group('hours'):
        res += int(m.group('hours')) * 60 * 60
    if m.group('hours_reversed'):
        res += int(m.group('hours_reversed')) * 60 * 60
    res += int(m.group('days')) * 24 * 60 * 60
    res += float(m.group('ms'))
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert `ext` before the filename's real extension.

    'video.mp4', 'temp' -> 'video.temp.mp4'.  When expected_real_ext is
    given and does not match, `ext` is appended instead.
    """
    name, real_ext = os.path.splitext(filename)
    return (
        '{0}.{1}{2}'.format(name, ext, real_ext)
        if not expected_real_ext or real_ext[1:] == expected_real_ext
        else '{0}.{1}'.format(filename, ext))
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the filename's extension with `ext`.

    When expected_real_ext is given and does not match, `ext` is appended
    to the full filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    return '{0}.{1}'.format(
        name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
        ext)
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    # NOTE: `args` is never mutated, so the mutable default is safe here.
    try:
        subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        # Binary not found (or not executable): report absence with False.
        return False
    return exe
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    # NOTE(review): upstream wraps the Popen call in try/except OSError and
    # returns False when the binary is missing; that wrapper appears elided
    # from this chunk — confirm against the full source.
    out, _ = subprocess.Popen(
        [encodeArgument(exe)] + args,
        stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from a tool's --version output.

    Returns the first group of version_re (default: 'version <token>'),
    or `unrecognized` when nothing matches.
    """
    assert isinstance(output, compat_str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
    if m:
        return m.group(1)
    else:
        return unrecognized
class PagedList(object):
    # Abstract base class: subclasses implement getslice(start, end).
    # NOTE(review): the `def __len__(self):` header for the body below (and
    # an abstract getslice) appear elided from this chunk — confirm against
    # the full source.

    # This is only useful for tests
    return len(self.getslice())
class OnDemandPagedList(PagedList):
    """Paged list that fetches pages lazily via pagefunc(pagenum)."""

    def __init__(self, pagefunc, pagesize, use_cache=False):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache
        # NOTE(review): the cache-dict initialisation appears elided here.

    def getslice(self, start=0, end=None):
        # NOTE(review): heavily elided — the `res = []` accumulator, the
        # `continue` for skipped pages, the startv/endv ternary assignments
        # and `break`/`return res` are all missing; confirm against the
        # full source.
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                # Page lies entirely before the requested slice.
            page_results = self._cache.get(pagenum)
            if page_results is None:
                page_results = list(self._pagefunc(pagenum))
            self._cache[pagenum] = page_results
            # Index of the first wanted item within this page:
            start % self._pagesize
            if firstid <= start < nextfirstid
            # Index just past the last wanted item within this page:
            ((end - 1) % self._pagesize) + 1
            if (end is not None and firstid <= end <= nextfirstid)
            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
class InAdvancePagedList(PagedList):
    """Paged list where the total page count is known up front."""

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        # NOTE(review): elided — the `res = []` accumulator, the
        # `end_page = min(` opener for the line below, the `if skip_elems:`
        # guard, the `else:` before the truncation and the final return are
        # missing; confirm against the full source.
        start_page = start // self._pagesize
        self._pagecount if end is None else (end // self._pagesize + 1))
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            page = page[skip_elems:]
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                page = page[:only_more]
def uppercase_escape(s):
    """Expand literal '\\UXXXXXXXX' escape sequences in s into characters."""
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: unicode_escape(m.group(0))[0],
        s)
def lowercase_escape(s):
    """Expand literal '\\uXXXX' escape sequences in s into characters."""
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\u[0-9a-fA-F]{4}',
        lambda m: unicode_escape(m.group(0))[0],
        s)
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # On Python 2, quote() needs a byte string, so encode first.
    if sys.version_info < (3, 0) and isinstance(s, compat_str):
        s = s.encode('utf-8')
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe_chars)
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    url_parsed = compat_urllib_parse_urlparse(url)
    # Rebuild the URL with an IDNA-encoded host and each component escaped;
    # the visible chunk lacked the closing `).geturl()` that turns the
    # ParseResult back into a string.
    return url_parsed._replace(
        netloc=url_parsed.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(url_parsed.path),
        params=escape_rfc3986(url_parsed.params),
        query=escape_rfc3986(url_parsed.query),
        fragment=escape_rfc3986(url_parsed.fragment)
    ).geturl()
# Probe whether struct.pack accepts a unicode format string on this
# interpreter.  NOTE(review): the `try:`/`except TypeError:` scaffolding
# around this probe (selecting between the wrapper functions and the plain
# aliases) appears elided from this chunk — confirm against the full source.
struct.pack('!I', 0)

# In Python 2.6 and 2.7.x < 2.7.7, struct requires a bytes argument
# See https://bugs.python.org/issue19099
def struct_pack(spec, *args):
    if isinstance(spec, compat_str):
        spec = spec.encode('ascii')
    return struct.pack(spec, *args)

def struct_unpack(spec, *args):
    if isinstance(spec, compat_str):
        spec = spec.encode('ascii')
    return struct.unpack(spec, *args)

# Fast path: struct handles unicode format strings natively.
struct_pack = struct.pack
struct_unpack = struct.unpack
def read_batch_urls(batch_fd):
    # Read a batch file and return its non-comment, non-empty URLs.
    # NOTE(review): the `def fixup(url):` helper header for the lines below
    # and its strip/empty-line handling appear elided from this chunk —
    # confirm against the full source.
    if not isinstance(url, compat_str):
        url = url.decode('utf-8', 'replace')
    # Strip a UTF-8 BOM that survived decoding.
    BOM_UTF8 = '\xef\xbb\xbf'
    if url.startswith(BOM_UTF8):
        url = url[len(BOM_UTF8):]
    # Lines starting with '#', ';' or ']' are comments.
    if url.startswith(('#', ';', ']')):
        # NOTE(review): the comment-line early return is elided here.

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes."""
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
def update_url_query(url, query):
    """Return `url` with the key/value pairs from `query` merged into its
    query string (existing keys are overwritten)."""
    parsed_url = compat_urlparse.urlparse(url)
    qs = compat_parse_qs(parsed_url.query)
    # The visible chunk lacked this merge, leaving `query` unused.
    qs.update(query)
    return compat_urlparse.urlunparse(parsed_url._replace(
        query=compat_urllib_parse_urlencode(qs, True)))
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Fetch the first usable value from d for one key or a list of keys.

    A value is skipped when missing, None, or (with skip_false_values)
    falsy.  Returns `default` when nothing qualifies.
    """
    if isinstance(key_or_keys, (list, tuple)):
        for key in key_or_keys:
            if key not in d or d[key] is None or skip_false_values and not d[key]:
                continue
            return d[key]
        return default
    return d.get(key_or_keys, default)
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return `string` as compat_str, decoding byte strings with `encoding`."""
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
def parse_age_limit(s):
    """Parse an age limit like '18' or '16+' into an int; fall back to the
    US_RATINGS table for rating strings.  Returns None for None input."""
    if s is None:
        # Guard: re.match on None would raise TypeError.
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    return int(m.group('age')) if m else US_RATINGS.get(s)
def strip_jsonp(code):
    """Strip a JSONP callback wrapper, returning the bare JSON payload."""
    return re.sub(
        r'(?s)^[a-zA-Z0-9_.]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
def js_to_json(code):
    # Convert JavaScript-ish object literals into valid JSON.
    # NOTE(review): heavily elided — the `def fix_kv(m):` helper header,
    # its `v = m.group(0)` setup, the replacement-table body of the lambda
    # below and the final `return res` are missing; confirm against the
    # full source.
    if v in ('true', 'false', 'null'):
    if v.startswith('"'):
        v = re.sub(r"\\'", "'", v[1:-1])
    elif v.startswith("'"):
        v = re.sub(r"\\\\|\\'|\"", lambda m: {
    # Rewrite string literals and bare identifiers via fix_kv.
    res = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
        [a-zA-Z_][.a-zA-Z_0-9]*
    # Drop trailing commas before closing brackets/braces.
    res = re.sub(r',(\s*[\]}])', lambda m: m.group(1), res)
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        try:
            return quality_ids.index(qid)
        except ValueError:
            # Unknown quality sorts below every known one.
            return -1
    return q
# Default output filename template: "<title>-<id>.<ext>".
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) > length:
        # Truncate so the result (including the ellipses) fits in `length`.
        return s[:length - len(ELLIPSES)] + ELLIPSES
    return s
def version_tuple(v):
    """Split a dotted/dashed version string into a tuple of ints."""
    parts = re.split(r'[-.]', v)
    return tuple(map(int, parts))
def is_outdated_version(version, limit, assume_new=True):
    """Compare dotted version strings; True when `version` < `limit`.

    On missing or malformed input, falls back to `assume_new` (True means
    "assume the build is new", i.e. not outdated).
    """
    if not version:
        return not assume_new
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        # Non-numeric component: cannot compare reliably.
        return not assume_new
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter
    loader = globals().get('__loader__')
    if isinstance(loader, zipimporter):
        return True
    return hasattr(sys, 'frozen')
def args_to_str(args):
    # Get a short string representation for a subprocess command
    quoted = [shlex_quote(a) for a in args]
    return ' '.join(quoted)
def error_to_compat_str(err):
    """Return str(err), decoded with the preferred encoding on Python 2."""
    err_str = str(err)
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    if sys.version_info[0] < 3:
        err_str = err_str.decode(preferredencoding())
    return err_str
def mimetype2ext(mt):
    # Map a MIME type to a file extension.
    # NOTE(review): elided — the None guard, the `return {` / dict wrapper
    # around the entries below and the final subtype-based fallback are
    # missing; confirm against the full source.
    _, _, res = mt.rpartition('/')
    'smptett+xml': 'tt',
    'x-mp4-fragmented': 'mp4',
def urlhandle_detect_ext(url_handle):
    # Guess a file extension from a response: Content-Disposition filename
    # first, then the Content-Type MIME type.
    # NOTE(review): elided — the `try:` opener for the probe below and the
    # `if cd:` / `if m:` / `if e:` guards with their returns are missing;
    # confirm against the full source.
    getheader = lambda h: url_handle.headers[h]
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader

    cd = getheader('Content-Disposition')
    m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
    e = determine_ext(m.group('filename'), default_ext=None)

    return mimetype2ext(getheader('Content-Type'))
def encode_data_uri(data, mime_type):
    """Build an RFC 2397 data: URI carrying `data` base64-encoded."""
    b64 = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, b64)
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    if age_limit is None:  # No limit set
        return False
    if content_limit is None:
        return False  # Content available for everyone
    return age_limit < content_limit
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    # Recognize a BOM to pick the right decoding; longest prefixes first.
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            s = first_bytes[len(bom):].decode(enc, 'replace')
            break
    else:
        # No BOM: assume UTF-8, replacing undecodable bytes.
        s = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', s)
def determine_protocol(info_dict):
    # Derive the download protocol for an info dict: explicit 'protocol'
    # key first, then URL-scheme heuristics.
    # NOTE(review): elided — the `return protocol` under the first guard,
    # the return statements of the rtmp/mms/rtsp branches and the
    # m3u8/f4m extension checks are missing; confirm against the full
    # source.
    protocol = info_dict.get('protocol')
    if protocol is not None:

    url = info_dict['url']
    if url.startswith('rtmp'):
    elif url.startswith('mms'):
    elif url.startswith('rtsp'):

    ext = determine_ext(url)

    return compat_urllib_parse_urlparse(url).scheme
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    table = [header_row] + data
    # Widest cell per column determines that column's width.
    widths = []
    for col in zip(*table):
        widths.append(max(len(compat_str(v)) for v in col))
    pieces = ['%-' + compat_str(w + 1) + 's' for w in widths[:-1]]
    format_str = ' '.join(pieces) + '%s'
    return '\n'.join(format_str % tuple(row) for row in table)
def _match_one(filter_part, dct):
    # Evaluate one '<key><op><value>' or unary '<op><key>' filter clause
    # against dct.
    # NOTE(review): heavily elided — the COMPARISON_OPERATORS dict entries,
    # the `(?P<key>...)` part of the regex, the `if m:` guards, the
    # int()/filesize try/except scaffolding and the ValueError raise
    # openers are missing; confirm against the full source.
    COMPARISON_OPERATORS = {
    operator_rex = re.compile(r'''(?x)\s*
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
        (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    op = COMPARISON_OPERATORS[m.group('op')]
    if m.group('strval') is not None:
        if m.group('op') not in ('=', '!='):
            'Operator %s does not support string values!' % m.group('op'))
        comparison_value = m.group('strval')
    comparison_value = int(m.group('intval'))
    # Fall back to parsing the value as a human file size ('500KiB', ...).
    comparison_value = parse_filesize(m.group('intval'))
    if comparison_value is None:
        comparison_value = parse_filesize(m.group('intval') + 'B')
    if comparison_value is None:
        'Invalid integer value %r in filter part %r' % (
            m.group('intval'), filter_part))
    actual_value = dct.get(m.group('key'))
    if actual_value is None:
        # '?' after the operator makes a missing key pass the filter.
        return m.group('none_inclusive')
    return op(actual_value, comparison_value)

    # Unary operators: '' tests presence, '!' tests absence.
    '': lambda v: v is not None,
    '!': lambda v: v is None,
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    op = UNARY_OPERATORS[m.group('op')]
    actual_value = dct.get(m.group('key'))
    return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
    # '&'-separated clauses must all hold; each is checked by _match_one.
    return all(
        _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
def match_filter_func(filter_str):
    """Build a match-filter callback: returns None when the video passes
    `filter_str`, otherwise a human-readable skip message."""
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        else:
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
def parse_dfxp_time_expr(time_expr):
    """Parse a TTML/DFXP time expression ('12.3s' or 'HH:MM:SS[.f]') into
    seconds; returns None for empty/unparseable input."""
    if not time_expr:
        return

    mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if mobj:
        return float(mobj.group('time_offset'))

    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if mobj:
        # Frame-style 'SS:ff' seconds are treated as a decimal fraction.
        return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
def srt_subtitles_timecode(seconds):
    """Format a number of seconds as an SRT timecode (HH:MM:SS,mmm)."""
    hours = seconds / 3600
    minutes = (seconds % 3600) / 60
    secs = seconds % 60
    millis = (seconds % 1) * 1000
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
def dfxp2srt(dfxp_data):
    # Convert DFXP/TTML subtitle XML into SRT text.
    # NOTE(review): heavily elided — the closing of the ns_map dict, the
    # TTMLPElementParser state (`out` buffer, end/close methods), the
    # `if not paras:` guard, the `out = []` accumulator, default/duration
    # handling and the final ''.join return are missing; confirm against
    # the full source.
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',

    class TTMLPElementParser(object):
        def start(self, tag, attrib):
            # <br> elements (in any recognized namespace) become newlines.
            if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):

        def data(self, data):

            return self.out.strip()

    def parse_node(node):
        # Re-feed the serialized node through the text-extracting parser.
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p')
        raise ValueError('Invalid dfxp/TTML subtitle')

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
def cli_option(params, command_option, param):
    """Return [command_option, value] when params[param] is set, else []."""
    value = params.get(param)
    if value is None:
        return []
    return [command_option, value]
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Render a boolean param as CLI args: either ['--opt', 'true'] or,
    with a separator, ['--opt=true']."""
    param = params.get(param)
    assert isinstance(param, bool)
    # The visible chunk lacked this guard, making the second return dead.
    if separator:
        return [command_option + separator + (true_value if param else false_value)]
    return [command_option, true_value if param else false_value]
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return [command_option] when params[param] equals expected_value."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
def cli_configuration_args(params, param, default=[]):
    """Return the extra-args list stored under `param`, or `default`.

    NOTE: `default` is a shared mutable default — callers must not mutate
    the returned list when it is the default.
    """
    ex_args = params.get(param)
    if ex_args is None:
        return default
    assert isinstance(ex_args, list)
    return ex_args
class ISO639Utils(object):
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    # NOTE(review): the `_lang_map = {...}` literal and the @classmethod
    # decorators for the methods below appear elided from this chunk —
    # confirm against the full source.

    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        return cls._lang_map.get(code[:2])

    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        for short_name, long_name in cls._lang_map.items():
            if long_name == code:
                # NOTE(review): `return short_name` appears elided here.
class ISO3166Utils(object):
    # From http://data.okfn.org/data/core/country-list
    # NOTE(review): the `_country_map = {` opener, most of the entries, the
    # closing brace and the @classmethod decorator below appear elided from
    # this chunk.  The 'AX' value also looks mojibake-damaged ('Ã…land'
    # for 'Åland') — confirm against the full source before changing the
    # literal.
    'AF': 'Afghanistan',
    'AX': 'Ã…land Islands',
    'AS': 'American Samoa',
    'AG': 'Antigua and Barbuda',
    'BO': 'Bolivia, Plurinational State of',
    'BQ': 'Bonaire, Sint Eustatius and Saba',
    'BA': 'Bosnia and Herzegovina',
    'BV': 'Bouvet Island',
    'IO': 'British Indian Ocean Territory',
    'BN': 'Brunei Darussalam',
    'BF': 'Burkina Faso',
    'KY': 'Cayman Islands',
    'CF': 'Central African Republic',
    'CX': 'Christmas Island',
    'CC': 'Cocos (Keeling) Islands',
    'CD': 'Congo, the Democratic Republic of the',
    'CK': 'Cook Islands',
    'CI': 'Côte d\'Ivoire',
    'CZ': 'Czech Republic',
    'DO': 'Dominican Republic',
    'SV': 'El Salvador',
    'GQ': 'Equatorial Guinea',
    'FK': 'Falkland Islands (Malvinas)',
    'FO': 'Faroe Islands',
    'GF': 'French Guiana',
    'PF': 'French Polynesia',
    'TF': 'French Southern Territories',
    'GW': 'Guinea-Bissau',
    'HM': 'Heard Island and McDonald Islands',
    'VA': 'Holy See (Vatican City State)',
    'IR': 'Iran, Islamic Republic of',
    'IM': 'Isle of Man',
    'KP': 'Korea, Democratic People\'s Republic of',
    'KR': 'Korea, Republic of',
    'LA': 'Lao People\'s Democratic Republic',
    'LI': 'Liechtenstein',
    'MK': 'Macedonia, the Former Yugoslav Republic of',
    'MH': 'Marshall Islands',
    'FM': 'Micronesia, Federated States of',
    'MD': 'Moldova, Republic of',
    'NL': 'Netherlands',
    'NC': 'New Caledonia',
    'NZ': 'New Zealand',
    'NF': 'Norfolk Island',
    'MP': 'Northern Mariana Islands',
    'PS': 'Palestine, State of',
    'PG': 'Papua New Guinea',
    'PH': 'Philippines',
    'PR': 'Puerto Rico',
    'RU': 'Russian Federation',
    'BL': 'Saint Barthélemy',
    'SH': 'Saint Helena, Ascension and Tristan da Cunha',
    'KN': 'Saint Kitts and Nevis',
    'LC': 'Saint Lucia',
    'MF': 'Saint Martin (French part)',
    'PM': 'Saint Pierre and Miquelon',
    'VC': 'Saint Vincent and the Grenadines',
    'ST': 'Sao Tome and Principe',
    'SA': 'Saudi Arabia',
    'SL': 'Sierra Leone',
    'SX': 'Sint Maarten (Dutch part)',
    'SB': 'Solomon Islands',
    'ZA': 'South Africa',
    'GS': 'South Georgia and the South Sandwich Islands',
    'SS': 'South Sudan',
    'SJ': 'Svalbard and Jan Mayen',
    'CH': 'Switzerland',
    'SY': 'Syrian Arab Republic',
    'TW': 'Taiwan, Province of China',
    'TZ': 'Tanzania, United Republic of',
    'TL': 'Timor-Leste',
    'TT': 'Trinidad and Tobago',
    'TM': 'Turkmenistan',
    'TC': 'Turks and Caicos Islands',
    'AE': 'United Arab Emirates',
    'GB': 'United Kingdom',
    'US': 'United States',
    'UM': 'United States Minor Outlying Islands',
    'VE': 'Venezuela, Bolivarian Republic of',
    'VG': 'Virgin Islands, British',
    'VI': 'Virgin Islands, U.S.',
    'WF': 'Wallis and Futuna',
    'EH': 'Western Sahara',

    def short2full(cls, code):
        """Convert an ISO 3166-2 country code to the corresponding full name"""
        return cls._country_map.get(code.upper())
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler that honours a per-request 'Ytdl-request-proxy' header."""

    def __init__(self, proxies=None):
        # Set default handlers
        for type in ('http', 'https'):
            # Loop variables are bound as lambda defaults so each scheme
            # keeps its own values (avoids the late-binding closure pitfall).
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            # NOTE(review): the `proxy = req_proxy` override appears elided
            # from this chunk — confirm against the full source.
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''
    # The data is interpreted little-endian (hence the [::-1] reversal).
    payload = int(binascii.hexlify(data[::-1]), 16)
    encrypted = pow(payload, exponent, modulus)
    return '%x' % encrypted
def encode_base_n(num, n, table=None):
    """Encode non-negative integer `num` in base `n` using `table` digits
    (default: 0-9a-zA-Z)."""
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    ret = ''
    while num:
        # Prepend the least-significant digit each iteration.
        ret = table[num % n] + ret
        num = num // n
    return ret
2719 def decode_packed_codes(code):
2721 r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)",
2723 obfucasted_code, base, count, symbols = mobj.groups()
2726 symbols = symbols.split('|')
2731 base_n_count = encode_base_n(count, base)
2732 symbol_table[base_n_count] = symbols[count] or base_n_count
2735 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],