2 # -*- coding: utf-8 -*-
4 from __future__ import unicode_literals
34 import xml.etree.ElementTree
40 compat_etree_fromstring,
45 compat_socket_create_connection,
49 compat_urllib_parse_urlparse,
50 compat_urllib_request,
56 # This is not clearly defined otherwise
57 compiled_regex_type = type(re.compile(''))
60 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/44.0 (Chrome)',
61 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
62 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
63 'Accept-Encoding': 'gzip, deflate',
64 'Accept-Language': 'en-us,en;q=0.5',
70 ENGLISH_MONTH_NAMES = [
71 'January', 'February', 'March', 'April', 'May', 'June',
72 'July', 'August', 'September', 'October', 'November', 'December']
75 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
76 'flv', 'f4v', 'f4a', 'f4b',
77 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
87 'f4f', 'f4m', 'm3u8', 'smil')
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        # Some platforms report an encoding name Python cannot actually
        # use; probe it once and fall back to UTF-8 if it is unusable.
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref
105 def write_json_file(obj, fn):
106 """ Encode obj as JSON and write it to fn, atomically if possible """
108 fn = encodeFilename(fn)
109 if sys.version_info < (3, 0) and sys.platform != 'win32':
110 encoding = get_filesystem_encoding()
111 # os.path.basename returns a bytes object, but NamedTemporaryFile
112 # will fail if the filename contains non ascii characters unless we
113 # use a unicode object
114 path_basename = lambda f: os.path.basename(fn).decode(encoding)
115 # the same for os.path.dirname
116 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
118 path_basename = os.path.basename
119 path_dirname = os.path.dirname
123 'prefix': path_basename(fn) + '.',
124 'dir': path_dirname(fn),
128 # In Python 2.x, json.dump expects a bytestream.
129 # In Python 3.x, it writes to a character stream
130 if sys.version_info < (3, 0):
138 tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
143 if sys.platform == 'win32':
144 # Need to remove existing file on Windows, else os.rename raises
145 # WindowsError or FileExistsError.
150 os.rename(tf.name, fn)
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """Find the first element matching xpath that has attribute key
        (optionally with the exact value val), i.e. xpath[@key=val]."""
        assert re.match(r'^[a-zA-Z_-]+$', key)
        if val is None:
            predicate = '[@%s]' % key
        else:
            predicate = "[@%s='%s']" % (key, val)
        return node.find(xpath + predicate)
166 def find_xpath_attr(node, xpath, key, val=None):
167 # Here comes the crazy part: In 2.6, if the xpath is a unicode,
168 # .//node does not match if a node is a direct child of . !
169 if isinstance(xpath, compat_str):
170 xpath = xpath.encode('ascii')
172 for f in node.findall(xpath):
173 if key not in f.attrib:
175 if val is None or f.attrib.get(key) == val:
179 # On python2.6 the xml.etree.ElementTree.Element methods don't support
180 # the namespace parameter
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps of an xpath into '{uri}tag' form.

    ns_map maps namespace prefixes to their full URIs; components without
    a prefix are passed through unchanged.
    """
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            # No namespace prefix on this component
            replaced.append(c[0])
        else:
            ns, tag = c
            # Substitute the prefix with the Clark-notation URI
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)
195 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
196 def _find_xpath(xpath):
197 if sys.version_info < (2, 7): # Crazy 2.6
198 xpath = xpath.encode('ascii')
199 return node.find(xpath)
201 if isinstance(xpath, (str, compat_str)):
202 n = _find_xpath(xpath)
210 if default is not NO_DEFAULT:
213 name = xpath if name is None else name
214 raise ExtractorError('Could not find XML element %s' % name)
220 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
221 n = xpath_element(node, xpath, name, fatal=fatal, default=default)
222 if n is None or n == default:
225 if default is not NO_DEFAULT:
228 name = xpath if name is None else name
229 raise ExtractorError('Could not find XML element\'s text %s' % name)
235 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
236 n = find_xpath_attr(node, xpath, key)
238 if default is not NO_DEFAULT:
241 name = '%s[@%s]' % (xpath, key) if name is None else name
242 raise ExtractorError('Could not find XML attribute %s' % name)
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # Thin wrapper: an id lookup is just an attribute match on 'id'.
    return get_element_by_attribute('id', id, html)
253 def get_element_by_attribute(attribute, value, html):
254 """Return the content of the tag with the specified attribute in the passed HTML document"""
256 m = re.search(r'''(?xs)
258 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
260 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
264 ''' % (re.escape(attribute), re.escape(value)), html)
268 res = m.group('content')
270 if res.startswith('"') or res.startswith("'"):
273 return unescapeHTML(res)
def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Newlines inside the markup carry no meaning; <br> and </p><p> do.
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip remaining html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
293 def sanitize_open(filename, open_mode):
294 """Try to open the given filename, and slightly tweak it if this fails.
296 Attempts to open the given filename. If this fails, it tries to change
297 the filename slightly, step by step, until it's either able to open it
298 or it fails and raises a final exception, like the standard open()
301 It returns the tuple (stream, definitive_file_name).
305 if sys.platform == 'win32':
307 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
308 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
309 stream = open(encodeFilename(filename), open_mode)
310 return (stream, filename)
311 except (IOError, OSError) as err:
312 if err.errno in (errno.EACCES,):
315 # In case of error, try to remove win32 forbidden chars
316 alt_filename = sanitize_path(filename)
317 if alt_filename == filename:
320 # An exception here should be caught in the caller
321 stream = open(encodeFilename(alt_filename), open_mode)
322 return (stream, alt_filename)
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp.

    Returns None when the string cannot be parsed as an RFC 2822 date.
    """
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp
334 def sanitize_filename(s, restricted=False, is_id=False):
335 """Sanitizes a string so it could be used as part of a filename.
336 If restricted is set, use a stricter subset of allowed characters.
337 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
339 def replace_insane(char):
340 if char == '?' or ord(char) < 32 or ord(char) == 127:
343 return '' if restricted else '\''
345 return '_-' if restricted else ' -'
346 elif char in '\\/|*<>':
348 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
350 if restricted and ord(char) > 127:
355 s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
356 result = ''.join(map(replace_insane, s))
358 while '__' in result:
359 result = result.replace('__', '_')
360 result = result.strip('_')
361 # Common case of "Foreign band name - English song title"
362 if restricted and result.startswith('-_'):
364 if result.startswith('-'):
365 result = '_' + result[len('-'):]
366 result = result.lstrip('.')
372 def sanitize_path(s):
373 """Sanitizes and normalizes path on Windows"""
374 if sys.platform != 'win32':
376 drive_or_unc, _ = os.path.splitdrive(s)
377 if sys.version_info < (2, 7) and not drive_or_unc:
378 drive_or_unc, _ = os.path.splitunc(s)
379 norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
383 path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part)
384 for path_part in norm_path]
386 sanitized_path.insert(0, drive_or_unc + os.path.sep)
387 return os.path.join(*sanitized_path)
390 # Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
391 # unwanted failures due to missing protocol
def sanitized_Request(url, *args, **kwargs):
    """Build a compat_urllib_request.Request, defaulting protocol-relative
    '//host/...' URLs to the http scheme."""
    sanitized_url = 'http:%s' % url if url.startswith('//') else url
    return compat_urllib_request.Request(sanitized_url, *args, **kwargs)
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    # Preserve first-seen order; a list membership test keeps this O(n^2)
    # but the inputs (format/URL lists) are small.
    res = []
    for el in iterable:
        if el not in res:
            res.append(el)
    return res
406 def _htmlentity_transform(entity):
407 """Transforms an HTML entity to a character."""
408 # Known non-numeric HTML entity
409 if entity in compat_html_entities.name2codepoint:
410 return compat_chr(compat_html_entities.name2codepoint[entity])
412 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
414 numstr = mobj.group(1)
415 if numstr.startswith('x'):
417 numstr = '0%s' % numstr
420 # See https://github.com/rg3/youtube-dl/issues/7518
422 return compat_chr(int(numstr, base))
426 # Unknown entity in name, return its literal representation
427 return '&%s;' % entity
433 assert type(s) == compat_str
436 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
def get_subprocess_encoding():
    """Return the encoding to use for subprocess command-line arguments."""
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding
451 def encodeFilename(s, for_subprocess=False):
453 @param s The name of the file
456 assert type(s) == compat_str
458 # Python 3 has a Unicode API
459 if sys.version_info >= (3, 0):
462 # Pass '' directly to use Unicode APIs on Windows 2000 and up
463 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
464 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
465 if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
468 return s.encode(get_subprocess_encoding(), 'ignore')
def decodeFilename(b, for_subprocess=False):
    """Inverse of encodeFilename: turn a possibly-bytes filename back into text."""
    if sys.version_info >= (3, 0):
        # Python 3 filenames are already text
        return b

    if not isinstance(b, bytes):
        return b

    return b.decode(get_subprocess_encoding(), 'ignore')
def encodeArgument(s):
    """Encode a command-line argument via encodeFilename's subprocess mode."""
    # Legacy code that uses byte strings is tolerated for now; eventually
    # this should assert the argument is already textual (see the disabled
    # assert in the upstream history).
    arg = s if isinstance(s, compat_str) else s.decode('ascii')
    return encodeFilename(arg, True)
def decodeArgument(b):
    # Mirror of encodeArgument: decode using the subprocess encoding.
    return decodeFilename(b, True)
495 def decodeOption(optval):
498 if isinstance(optval, bytes):
499 optval = optval.decode(preferredencoding())
501 assert isinstance(optval, compat_str)
def formatSeconds(secs):
    """Format a duration in seconds as H:MM:SS, M:SS, or plain seconds."""
    if secs > 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs > 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
514 def make_HTTPS_handler(params, **kwargs):
515 opts_no_check_certificate = params.get('nocheckcertificate', False)
516 if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9
517 context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
518 if opts_no_check_certificate:
519 context.check_hostname = False
520 context.verify_mode = ssl.CERT_NONE
522 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
525 # (create_default_context present but HTTPSHandler has no context=)
528 if sys.version_info < (3, 2):
529 return YoutubeDLHTTPSHandler(params, **kwargs)
531 context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
532 context.verify_mode = (ssl.CERT_NONE
533 if opts_no_check_certificate
534 else ssl.CERT_REQUIRED)
535 context.set_default_verify_paths()
536 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
def bug_reports_message():
    """Build the standard bug-report footer appended to error messages."""
    if ytdl_is_updateable():
        update_cmd = 'type youtube-dl -U to update'
    else:
        update_cmd = 'see https://yt-dl.org/update on how to update'
    msg = '; please report this issue on https://yt-dl.org/bug .'
    msg += ' Make sure you are using the latest version; %s.' % update_cmd
    msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
    return msg
550 class ExtractorError(Exception):
551 """Error during info extraction."""
553 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
554 """ tb, if given, is the original traceback (so that it can be printed out).
555 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
558 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
560 if video_id is not None:
561 msg = video_id + ': ' + msg
563 msg += ' (caused by %r)' % cause
565 msg += bug_reports_message()
566 super(ExtractorError, self).__init__(msg)
569 self.exc_info = sys.exc_info() # preserve original exception
571 self.video_id = video_id
573 def format_traceback(self):
574 if self.traceback is None:
576 return ''.join(traceback.format_tb(self.traceback))
class UnsupportedError(ExtractorError):
    """Raised when no extractor is able to handle the given URL."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        # expected=True: this is a normal condition, not a youtube-dl bug.
        super(UnsupportedError, self).__init__(message, expected=True)
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    # No extra state; the docstring doubles as the class body.
591 class DownloadError(Exception):
592 """Download Error exception.
594 This exception may be thrown by FileDownloader objects if they are not
595 configured to continue on errors. They will contain the appropriate
599 def __init__(self, msg, exc_info=None):
600 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
601 super(DownloadError, self).__init__(msg)
602 self.exc_info = exc_info
605 class SameFileError(Exception):
606 """Same File exception.
608 This exception will be thrown by FileDownloader objects if they detect
609 multiple files would have to be downloaded to the same file on disk.
614 class PostProcessingError(Exception):
615 """Post Processing exception.
617 This exception may be raised by PostProcessor's .run() method to
618 indicate an error in the postprocessing task.
621 def __init__(self, msg):
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    # Raised to unwind the download loop once the user-set cap is hit.
630 class UnavailableVideoError(Exception):
631 """Unavailable Format exception.
633 This exception will be thrown when a video is requested
634 in a format that is not available for that video.
639 class ContentTooShortError(Exception):
640 """Content Too Short exception.
642 This exception may be raised by FileDownloader objects when a file they
643 download is too small for what the server announced first, indicating
644 the connection was probably interrupted.
647 def __init__(self, downloaded, expected):
649 self.downloaded = downloaded
650 self.expected = expected
653 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
654 # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
655 # expected HTTP responses to meet HTTP/1.0 or later (see also
656 # https://github.com/rg3/youtube-dl/issues/6727)
657 if sys.version_info < (3, 0):
658 kwargs[b'strict'] = True
659 hc = http_class(*args, **kwargs)
660 source_address = ydl_handler._params.get('source_address')
661 if source_address is not None:
662 sa = (source_address, 0)
663 if hasattr(hc, 'source_address'): # Python 2.7+
664 hc.source_address = sa
666 def _hc_connect(self, *args, **kwargs):
667 sock = compat_socket_create_connection(
668 (self.host, self.port), self.timeout, sa)
670 self.sock = ssl.wrap_socket(
671 sock, self.key_file, self.cert_file,
672 ssl_version=ssl.PROTOCOL_TLSv1)
675 hc.connect = functools.partial(_hc_connect, hc)
def handle_youtubedl_headers(headers):
    """Translate internal Youtubedl-* pseudo-headers into real header changes.

    'Youtubedl-no-compression' is consumed here: it strips any
    Accept-Encoding header and is itself removed before the real request.
    Returns the original mapping untouched when no pseudo-header is present.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers

    filtered = dict(
        (k, v) for k, v in headers.items() if k.lower() != 'accept-encoding')
    filtered.pop('Youtubedl-no-compression')
    return filtered
690 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
691 """Handler for HTTP requests and responses.
693 This class, when installed with an OpenerDirector, automatically adds
694 the standard headers to every HTTP request and handles gzipped and
695 deflated responses from web servers. If compression is to be avoided in
696 a particular request, the original request in the program code only has
697 to include the HTTP header "Youtubedl-no-compression", which will be
698 removed before making the real request.
700 Part of this code was copied from:
702 http://techknack.net/python-urllib2-handlers/
704 Andrew Rowls, the author of that code, agreed to release it to the
708 def __init__(self, params, *args, **kwargs):
709 compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
710 self._params = params
712 def http_open(self, req):
713 return self.do_open(functools.partial(
714 _create_http_connection, self, compat_http_client.HTTPConnection, False),
720 return zlib.decompress(data, -zlib.MAX_WBITS)
722 return zlib.decompress(data)
725 def addinfourl_wrapper(stream, headers, url, code):
726 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
727 return compat_urllib_request.addinfourl(stream, headers, url, code)
728 ret = compat_urllib_request.addinfourl(stream, headers, url)
732 def http_request(self, req):
733 # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
734 # always respected by websites, some tend to give out URLs with non percent-encoded
735 # non-ASCII characters (see telemb.py, ard.py [#3412])
736 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
737 # To work around aforementioned issue we will replace request's original URL with
738 # percent-encoded one
739 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
740 # the code of this workaround has been moved here from YoutubeDL.urlopen()
741 url = req.get_full_url()
742 url_escaped = escape_url(url)
744 # Substitute URL if any change after escaping
745 if url != url_escaped:
746 req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
748 url_escaped, data=req.data, headers=req.headers,
749 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
750 new_req.timeout = req.timeout
753 for h, v in std_headers.items():
754 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
755 # The dict keys are capitalized because of this bug by urllib
756 if h.capitalize() not in req.headers:
759 req.headers = handle_youtubedl_headers(req.headers)
761 if sys.version_info < (2, 7) and '#' in req.get_full_url():
762 # Python 2.6 is brain-dead when it comes to fragments
763 req._Request__original = req._Request__original.partition('#')[0]
764 req._Request__r_type = req._Request__r_type.partition('#')[0]
768 def http_response(self, req, resp):
771 if resp.headers.get('Content-encoding', '') == 'gzip':
772 content = resp.read()
773 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
775 uncompressed = io.BytesIO(gz.read())
776 except IOError as original_ioerror:
777 # There may be junk add the end of the file
778 # See http://stackoverflow.com/q/4928560/35070 for details
779 for i in range(1, 1024):
781 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
782 uncompressed = io.BytesIO(gz.read())
787 raise original_ioerror
788 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
789 resp.msg = old_resp.msg
790 del resp.headers['Content-encoding']
792 if resp.headers.get('Content-encoding', '') == 'deflate':
793 gz = io.BytesIO(self.deflate(resp.read()))
794 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
795 resp.msg = old_resp.msg
796 del resp.headers['Content-encoding']
797 # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
798 # https://github.com/rg3/youtube-dl/issues/6457).
799 if 300 <= resp.code < 400:
800 location = resp.headers.get('Location')
802 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
803 if sys.version_info >= (3, 0):
804 location = location.encode('iso-8859-1').decode('utf-8')
805 location_escaped = escape_url(location)
806 if location != location_escaped:
807 del resp.headers['Location']
808 resp.headers['Location'] = location_escaped
811 https_request = http_request
812 https_response = http_response
815 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
816 def __init__(self, params, https_conn_class=None, *args, **kwargs):
817 compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
818 self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
819 self._params = params
821 def https_open(self, req):
823 if hasattr(self, '_context'): # python > 2.6
824 kwargs['context'] = self._context
825 if hasattr(self, '_check_hostname'): # python 3.x
826 kwargs['check_hostname'] = self._check_hostname
827 return self.do_open(functools.partial(
828 _create_http_connection, self, self._https_conn_class, True),
832 class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
833 def __init__(self, cookiejar=None):
834 compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)
836 def http_response(self, request, response):
837 # Python 2 will choke on next HTTP request in row if there are non-ASCII
838 # characters in Set-Cookie HTTP header of last response (see
839 # https://github.com/rg3/youtube-dl/issues/6769).
840 # In order to at least prevent crashing we will percent encode Set-Cookie
841 # header before HTTPCookieProcessor starts processing it.
842 # if sys.version_info < (3, 0) and response.headers:
843 # for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
844 # set_cookie = response.headers.get(set_cookie_header)
846 # set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
847 # if set_cookie != set_cookie_escaped:
848 # del response.headers[set_cookie_header]
849 # response.headers[set_cookie_header] = set_cookie_escaped
850 return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)
852 https_request = compat_urllib_request.HTTPCookieProcessor.http_request
853 https_response = http_response
856 def parse_iso8601(date_str, delimiter='T', timezone=None):
857 """ Return a UNIX timestamp from the given date """
862 date_str = re.sub(r'\.[0-9]+', '', date_str)
866 r'(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
869 timezone = datetime.timedelta()
871 date_str = date_str[:-len(m.group(0))]
872 if not m.group('sign'):
873 timezone = datetime.timedelta()
875 sign = 1 if m.group('sign') == '+' else -1
876 timezone = datetime.timedelta(
877 hours=sign * int(m.group('hours')),
878 minutes=sign * int(m.group('minutes')))
880 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
881 dt = datetime.datetime.strptime(date_str, date_format) - timezone
882 return calendar.timegm(dt.timetuple())
887 def unified_strdate(date_str, day_first=True):
888 """Return a string with the date in the format YYYYMMDD"""
894 date_str = date_str.replace(',', ' ')
895 # %z (UTC offset) is only supported in python>=3.2
896 if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
897 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
898 # Remove AM/PM + timezone
899 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
901 format_expressions = [
914 '%Y-%m-%d %H:%M:%S.%f',
917 '%Y-%m-%dT%H:%M:%SZ',
918 '%Y-%m-%dT%H:%M:%S.%fZ',
919 '%Y-%m-%dT%H:%M:%S.%f0Z',
921 '%Y-%m-%dT%H:%M:%S.%f',
925 format_expressions.extend([
933 format_expressions.extend([
940 for expression in format_expressions:
942 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
945 if upload_date is None:
946 timetuple = email.utils.parsedate_tz(date_str)
948 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
949 if upload_date is not None:
950 return compat_str(upload_date)
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from a URL, falling back to default_ext."""
    if url is None:
        return default_ext
    # Drop the query string, then take whatever follows the last dot
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    elif guess.rstrip('/') in KNOWN_EXTENSIONS:
        return guess.rstrip('/')
    else:
        return default_ext
def subtitles_filename(filename, sub_lang, sub_format):
    """Build the subtitle filename: media stem + language code + subtitle format."""
    stem = filename.rsplit('.', 1)[0]
    return stem + '.' + sub_lang + '.' + sub_format
970 def date_from_str(date_str):
972 Return a datetime object from a string in the format YYYYMMDD or
973 (now|today)[+-][0-9](day|week|month|year)(s)?"""
974 today = datetime.date.today()
975 if date_str in ('now', 'today'):
977 if date_str == 'yesterday':
978 return today - datetime.timedelta(days=1)
979 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
980 if match is not None:
981 sign = match.group('sign')
982 time = int(match.group('time'))
985 unit = match.group('unit')
986 # A bad approximation?
994 delta = datetime.timedelta(**{unit: time})
996 return datetime.datetime.strptime(date_str, '%Y%m%d').date()
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is not None:
        return '-'.join(match.groups())
    else:
        # Not in YYYYMMDD form - hand it back unchanged
        return date_str
1009 class DateRange(object):
1010 """Represents a time interval between two dates"""
1012 def __init__(self, start=None, end=None):
1013 """start and end must be strings in the format accepted by date"""
1014 if start is not None:
1015 self.start = date_from_str(start)
1017 self.start = datetime.datetime.min.date()
1019 self.end = date_from_str(end)
1021 self.end = datetime.datetime.max.date()
1022 if self.start > self.end:
1023 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
1027 """Returns a range that only contains the given day"""
1028 return cls(day, day)
1030 def __contains__(self, date):
1031 """Check if the date is in the range"""
1032 if not isinstance(date, datetime.date):
1033 date = date_from_str(date)
1034 return self.start <= date <= self.end
1037 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
def platform_name():
    """ Returns the platform name as a compat_str """
    res = platform.platform()
    if isinstance(res, bytes):
        # Python 2 may hand back bytes in the locale encoding
        res = res.decode(preferredencoding())

    assert isinstance(res, compat_str)
    return res
1050 def _windows_write_string(s, out):
1051 """ Returns True if the string was written using special methods,
1052 False if it has yet to be written out."""
1053 # Adapted from http://stackoverflow.com/a/3259271/35070
1056 import ctypes.wintypes
1064 fileno = out.fileno()
1065 except AttributeError:
1066 # If the output stream doesn't have a fileno, it's virtual
1068 except io.UnsupportedOperation:
1069 # Some strange Windows pseudo files?
1071 if fileno not in WIN_OUTPUT_IDS:
1074 GetStdHandle = ctypes.WINFUNCTYPE(
1075 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
1076 (b'GetStdHandle', ctypes.windll.kernel32))
1077 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
1079 WriteConsoleW = ctypes.WINFUNCTYPE(
1080 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
1081 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
1082 ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
1083 written = ctypes.wintypes.DWORD(0)
1085 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
1086 FILE_TYPE_CHAR = 0x0002
1087 FILE_TYPE_REMOTE = 0x8000
1088 GetConsoleMode = ctypes.WINFUNCTYPE(
1089 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
1090 ctypes.POINTER(ctypes.wintypes.DWORD))(
1091 (b'GetConsoleMode', ctypes.windll.kernel32))
1092 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
1094 def not_a_console(handle):
1095 if handle == INVALID_HANDLE_VALUE or handle is None:
1097 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
1098 GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
1100 if not_a_console(h):
1103 def next_nonbmp_pos(s):
1105 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
1106 except StopIteration:
1110 count = min(next_nonbmp_pos(s), 1024)
1112 ret = WriteConsoleW(
1113 h, s, count if count else 2, ctypes.byref(written), None)
1115 raise OSError('Failed to write string')
1116 if not count: # We just wrote a non-BMP character
1117 assert written.value == 2
1120 assert written.value > 0
1121 s = s[written.value:]
1125 def write_string(s, out=None, encoding=None):
1128 assert type(s) == compat_str
1130 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
1131 if _windows_write_string(s, out):
1134 if ('b' in getattr(out, 'mode', '') or
1135 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
1136 byt = s.encode(encoding or preferredencoding(), 'ignore')
1138 elif hasattr(out, 'buffer'):
1139 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1140 byt = s.encode(enc, 'ignore')
1141 out.buffer.write(byt)
def bytes_to_intlist(bs):
    """Convert a bytes (or Python 2 str) sequence to a list of integer byte values."""
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3
        return list(bs)
    else:
        # Python 2 str: indexing yields 1-char strings
        return [ord(c) for c in bs]
def intlist_to_bytes(xs):
    """Inverse of bytes_to_intlist: pack a list of byte values into bytes."""
    if not xs:
        return b''
    return struct_pack('%dB' % len(xs), *xs)
1162 # Cross-platform file locking
1163 if sys.platform == 'win32':
1164 import ctypes.wintypes
1167 class OVERLAPPED(ctypes.Structure):
1169 ('Internal', ctypes.wintypes.LPVOID),
1170 ('InternalHigh', ctypes.wintypes.LPVOID),
1171 ('Offset', ctypes.wintypes.DWORD),
1172 ('OffsetHigh', ctypes.wintypes.DWORD),
1173 ('hEvent', ctypes.wintypes.HANDLE),
1176 kernel32 = ctypes.windll.kernel32
1177 LockFileEx = kernel32.LockFileEx
1178 LockFileEx.argtypes = [
1179 ctypes.wintypes.HANDLE, # hFile
1180 ctypes.wintypes.DWORD, # dwFlags
1181 ctypes.wintypes.DWORD, # dwReserved
1182 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1183 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1184 ctypes.POINTER(OVERLAPPED) # Overlapped
1186 LockFileEx.restype = ctypes.wintypes.BOOL
1187 UnlockFileEx = kernel32.UnlockFileEx
1188 UnlockFileEx.argtypes = [
1189 ctypes.wintypes.HANDLE, # hFile
1190 ctypes.wintypes.DWORD, # dwReserved
1191 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1192 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1193 ctypes.POINTER(OVERLAPPED) # Overlapped
1195 UnlockFileEx.restype = ctypes.wintypes.BOOL
1196 whole_low = 0xffffffff
1197 whole_high = 0x7fffffff
1199 def _lock_file(f, exclusive):
1200 overlapped = OVERLAPPED()
1201 overlapped.Offset = 0
1202 overlapped.OffsetHigh = 0
1203 overlapped.hEvent = 0
1204 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1205 handle = msvcrt.get_osfhandle(f.fileno())
1206 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
1207 whole_low, whole_high, f._lock_file_overlapped_p):
1208 raise OSError('Locking file failed: %r' % ctypes.FormatError())
1210 def _unlock_file(f):
1211 assert f._lock_file_overlapped_p
1212 handle = msvcrt.get_osfhandle(f.fileno())
1213 if not UnlockFileEx(handle, 0,
1214 whole_low, whole_high, f._lock_file_overlapped_p):
1215 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1220 def _lock_file(f, exclusive):
1221 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1223 def _unlock_file(f):
1224 fcntl.flock(f, fcntl.LOCK_UN)
1227 class locked_file(object):
1228 def __init__(self, filename, mode, encoding=None):
1229 assert mode in ['r', 'a', 'w']
1230 self.f = io.open(filename, mode, encoding=encoding)
1233 def __enter__(self):
1234 exclusive = self.mode != 'r'
1236 _lock_file(self.f, exclusive)
1242 def __exit__(self, etype, value, traceback):
1244 _unlock_file(self.f)
1251 def write(self, *args):
1252 return self.f.write(*args)
1254 def read(self, *args):
1255 return self.f.read(*args)
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), defaulting to 'utf-8' when unset."""
    enc = sys.getfilesystemencoding()
    if enc is None:
        return 'utf-8'
    return enc
def shell_quote(args):
    """Return a single shell-escaped command line built from *args*.

    Bug fix: the accumulator initialisation and the loop header were
    missing, so quoted_args/a were referenced before assignment.
    """
    quoted_args = []
    encoding = get_filesystem_encoding()
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(pipes.quote(a))
    return ' '.join(quoted_args)
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    sdata = compat_urllib_parse.urlencode(payload)
    return '%s#%s' % (url, sdata)
def unsmuggle_url(smug_url, default=None):
    """Inverse of smuggle_url(): return (url, data), or (url, default) when
    no smuggled payload is present.

    Bug fix: the final ``return url, data`` was missing, so the function
    fell off the end and returned None for smuggled URLs.
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    data = json.loads(jsond)
    return url, data
def format_bytes(bytes):
    """Format a byte count as a human-readable string ('1.00KiB').

    Bug fix: the None and zero guards were missing — math.log(0) raises
    ValueError and None cannot be formatted at all.
    """
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
    suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
def parse_filesize(s):
    """Parse a human-readable file size ('5 MiB', '1,5GB') into an int of bytes."""
    # NOTE(review): the _UNIT_TABLE definition, the None guard and the
    # re.search call / match-failure handling appear to be missing from
    # this copy of the function — restore before use.
    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
    # European decimal comma is normalised to a dot before float().
    num_str = m.group('num').replace(',', '.')
    mult = _UNIT_TABLE[m.group('unit')]
    return int(float(num_str) * mult)
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name

    Returns None for unrecognised names (bug fix: the try/except that
    swallowed the ValueError from list.index was missing).
    """
    try:
        return ENGLISH_MONTH_NAMES.index(name) + 1
    except ValueError:
        return None
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviation ('Jan', 'Feb', ...)

    Returns None for unrecognised abbreviations (bug fix: the try/except
    around list.index was missing).
    """
    try:
        return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
    except ValueError:
        return None
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML.

    Ampersands that already start a recognised entity or a numeric
    character reference are left untouched (negative lookahead).
    Bug fix: the re.sub call wrapping the pattern was missing, and the
    docstring claimed '&' -> '&' instead of '&' -> '&amp;'.
    """
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
        '&amp;',
        xml_str)
def setproctitle(title):
    """Best-effort: set the process name via libc prctl (Linux only)."""
    assert isinstance(title, compat_str)
    # NOTE(review): the try: statement(s) pairing with the AttributeError
    # handler below (around LoadLibrary / prctl) appear to be missing from
    # this copy — as shown, the except clause is orphaned.
    libc = ctypes.cdll.LoadLibrary('libc.so.6')
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
    # 15 == PR_SET_NAME on Linux
    libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
def remove_start(s, start):
    """Return *s* with the prefix *start* removed, or *s* unchanged when the
    prefix is absent (bug fix: the fallthrough ``return s`` was missing, so
    non-matching inputs returned None).
    """
    if s.startswith(start):
        return s[len(start):]
    return s
def remove_end(s, end):
    """Return *s* with the suffix *end* removed, or *s* unchanged when the
    suffix is absent.

    Bug fix: the endswith guard was missing, so the last len(end)
    characters were always stripped; additionally an empty *end* made
    ``s[:-0]`` return '' — guard on a non-empty suffix.
    """
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
def remove_quotes(s):
    """Strip one pair of matching single or double quotes from *s*.

    Returns *s* unchanged (including None) when it is too short or not
    quoted (bug fix: all the return statements were missing).
    """
    if s is None or len(s) < 2:
        return s
    for quote in ('"', "'", ):
        if s[0] == quote and s[-1] == quote:
            return s[1:-1]
    return s
def url_basename(url):
    """Return the last path component of *url* ('' when the path is empty)."""
    parsed_path = compat_urlparse.urlparse(url).path
    segments = parsed_path.strip('/').split('/')
    return segments[-1]
class HEADRequest(compat_urllib_request.Request):
    """Request subclass that issues HTTP HEAD instead of GET.

    Bug fix: the body of get_method was missing.
    """

    def get_method(self):
        return 'HEAD'
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Coerce *v* to int (optionally via attribute *get_attr*, scaled by
    invscale/scale); return *default* for None or ''.

    Bug fix: the get_attr/None/'' guards were missing, so int(None)
    raised TypeError.
    """
    if get_attr:
        if v is not None:
            # Pull the value off the object; missing attribute -> None.
            v = getattr(v, get_attr, None)
    if v == '':
        v = None
    return default if v is None else int(v) * invscale // scale
def str_or_none(v, default=None):
    """Coerce *v* to compat_str, or return *default* when v is None."""
    if v is None:
        return default
    return compat_str(v)
def str_to_int(int_str):
    """ A more relaxed version of int_or_none: drops thousands separators
    (',', '.') and '+' before converting; returns None for None input.

    Bug fix: the None guard and the final int() conversion/return were
    missing.
    """
    if int_str is None:
        return None
    int_str = re.sub(r'[,\.\+]', '', int_str)
    return int(int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Coerce *v* to float scaled by invscale/scale; return *default* for
    None or unconvertible input.

    Bug fix: the None guard and error handling were missing, so
    float(None) raised TypeError.
    """
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        return default
def parse_duration(s):
    """Parse a duration spec ('3 min', '12:34', '1h20m5s', ...) into seconds."""
    # NOTE(review): the s.strip(), the re.match(...) call wrapping the
    # alternation below, the match-failure guard, the res initialisation
    # and several group guards appear to be missing from this copy —
    # the regex alternatives are orphaned. Restore before use.
    if not isinstance(s, compat_basestring):
            (?P<only_mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*|
            (?P<only_hours>[0-9.]+)\s*(?:hours?)|
            \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?\.?|minutes?)\s*|
                (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
                (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
            (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
        (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
    # Bare "N min" / "N hours" forms short-circuit with a scaled float.
    if m.group('only_mins'):
        return float_or_none(m.group('only_mins'), invscale=60)
    if m.group('only_hours'):
        return float_or_none(m.group('only_hours'), invscale=60 * 60)
        res += int(m.group('secs'))
    if m.group('mins_reversed'):
        res += int(m.group('mins_reversed')) * 60
        res += int(m.group('mins')) * 60
    if m.group('hours'):
        res += int(m.group('hours')) * 60 * 60
    if m.group('hours_reversed'):
        res += int(m.group('hours_reversed')) * 60 * 60
        res += int(m.group('days')) * 24 * 60 * 60
        res += float(m.group('ms'))
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert *ext* before the real extension ('a.mp4' -> 'a.temp.mp4');
    when *expected_real_ext* is given and does not match, append instead.

    Bug fix: the ``return (`` opening the conditional expression was
    missing, leaving orphaned expression lines.
    """
    name, real_ext = os.path.splitext(filename)
    return (
        '{0}.{1}{2}'.format(name, ext, real_ext)
        if not expected_real_ext or real_ext[1:] == expected_real_ext
        else '{0}.{1}'.format(filename, ext))
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the extension of *filename* with *ext*; when
    *expected_real_ext* is given and does not match, append instead.

    Bug fix: the closing ``ext)`` argument of the format call was missing.
    """
    name, real_ext = os.path.splitext(filename)
    return '{0}.{1}'.format(
        name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
        ext)
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version)

    Returns False when the binary cannot be launched (bug fix: the
    try/except and both returns were missing).
    (args has a mutable default, but it is never mutated, only read.)
    """
    try:
        subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        return False
    return exe
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present

    Bug fix: the try/except around Popen (returning False when the
    binary cannot be launched) was missing.
    """
    try:
        out, _ = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
    except OSError:
        return False
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from an executable's --version output,
    returning *unrecognized* when no match is found.

    Bug fix: the branch returning the matched group / the fallback was
    missing, so the function always returned None.
    """
    assert isinstance(output, compat_str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
    if m:
        return m.group(1)
    return unrecognized
class PagedList(object):
    """Abstract base for lazily-paged result lists; subclasses must
    implement getslice(start, end).

    Bug fix: the ``def __len__(self):`` header was missing, leaving the
    return statement orphaned.
    """

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())
class OnDemandPagedList(PagedList):
    """PagedList that fetches pages on demand via pagefunc(pagenum),
    optionally memoising fetched pages."""

    def __init__(self, pagefunc, pagesize, use_cache=False):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache
        # NOTE(review): the conditional initialisation of self._cache
        # (used by getslice below) appears to be missing in this copy.

    def getslice(self, start=0, end=None):
        # NOTE(review): the res accumulator initialisation, the cache
        # guard, the startv/endv assignments and the final return appear
        # to be missing in this copy — several expressions below are
        # orphaned. Restore before use.
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
            page_results = self._cache.get(pagenum)
            if page_results is None:
                page_results = list(self._pagefunc(pagenum))
                self._cache[pagenum] = page_results
                start % self._pagesize
                if firstid <= start < nextfirstid
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)
            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
class InAdvancePagedList(PagedList):
    """PagedList where the total page count is known up front."""

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        # NOTE(review): the res accumulator, the end_page = min( header,
        # the skip_elems guard/reset and the final res.extend/return
        # appear to be missing in this copy. Restore before use.
        start_page = start // self._pagesize
            self._pagecount if end is None else (end // self._pagesize + 1))
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            page = page[skip_elems:]
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                    page = page[:only_more]
def uppercase_escape(s):
    """Decode JavaScript-style \\UXXXXXXXX escapes in *s*.

    Bug fix: the re.sub call wrapping the pattern/replacer was missing.
    """
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: unicode_escape(m.group(0))[0],
        s)
def lowercase_escape(s):
    """Decode JavaScript-style \\uXXXX escapes in *s*.

    Bug fix: the re.sub call wrapping the pattern/replacer was missing.
    """
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\u[0-9a-fA-F]{4}',
        lambda m: unicode_escape(m.group(0))[0],
        s)
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # Python 2 quote() chokes on unicode input; pre-encode to UTF-8 there.
    if sys.version_info < (3, 0) and isinstance(s, compat_str):
        s = s.encode('utf-8')
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe_chars)
def escape_url(url):
    """Escape URL as suggested by RFC 3986

    Bug fix: the closing ``).geturl()`` that re-serialises the replaced
    parse result was missing.
    """
    url_parsed = compat_urllib_parse_urlparse(url)
    return url_parsed._replace(
        path=escape_rfc3986(url_parsed.path),
        params=escape_rfc3986(url_parsed.params),
        query=escape_rfc3986(url_parsed.query),
        fragment=escape_rfc3986(url_parsed.fragment)
    ).geturl()
# Feature-probe: some Python 2.6/2.7 builds reject unicode struct format
# strings with TypeError, so wrap pack/unpack to encode the spec first.
# Bug fix: the try/except/else scaffolding around this probe was missing,
# leaving the definitions and assignments orphaned.
try:
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.pack(spec, *args)

    def struct_unpack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.unpack(spec, *args)
else:
    struct_pack = struct.pack
    struct_unpack = struct.unpack
def read_batch_urls(batch_fd):
    """Read a batch-file handle and return its non-comment URLs as a list.

    Bug fix: the ``def fixup(url):`` header, the strip, and the returns
    of the helper were missing, leaving its body orphaned.
    """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = '\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        url = url.strip()
        # Lines starting with comment markers are skipped entirely.
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
def encode_dict(d, encoding='utf-8'):
    """Encode every string key and value of *d* to bytes with *encoding*.

    Bug fix: the ``def encode(v):`` header of the inner helper was
    missing, leaving its return statement orphaned.
    """
    def encode(v):
        return v.encode(encoding) if isinstance(v, compat_basestring) else v
    return dict((encode(k), encode(v)) for k, v in d.items())
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Look up one key, or the first usable key of a list/tuple, in *d*.

    None values (and falsy ones, unless skip_false_values=False) are
    skipped.  Bug fix: the loop's continue/return and the default
    fallthrough were missing.
    """
    if isinstance(key_or_keys, (list, tuple)):
        for key in key_or_keys:
            if key not in d or d[key] is None or skip_false_values and not d[key]:
                continue
            return d[key]
        return default
    return d.get(key_or_keys, default)
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return *string* unchanged when already compat_str, else decode it.

    Note: the encoding default is evaluated once at definition time.
    """
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
def parse_age_limit(s):
    """Parse an age limit like '18' / '18+' or a US rating string into an
    int, returning None for None input.

    Bug fix: the None guard was missing, so re.match(None) raised
    TypeError.
    """
    if s is None:
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    return int(m.group('age')) if m else US_RATINGS.get(s)
def strip_jsonp(code):
    """Strip a JSONP callback wrapper and return the raw JSON payload.

    Bug fix: the ``return re.sub(`` wrapping the pattern was missing.
    """
    return re.sub(
        r'(?s)^[a-zA-Z0-9_.]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
def js_to_json(code):
    """Convert a JavaScript object literal into strict JSON."""
    # NOTE(review): the ``def fix_kv(m):`` helper header, the escape
    # mapping dict body, the quoting of bare identifiers and the final
    # return appear to be missing from this copy. Restore before use.
        if v in ('true', 'false', 'null'):
        if v.startswith('"'):
            v = re.sub(r"\\'", "'", v[1:-1])
        elif v.startswith("'"):
            v = re.sub(r"\\\\|\\'|\"", lambda m: {
    # Matches double-quoted strings, single-quoted strings and bare
    # identifiers so each token class can be normalised independently.
    res = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
        [a-zA-Z_][.a-zA-Z_0-9]*
    # Strip trailing commas before closing ] or } (illegal in JSON).
    res = re.sub(r',(\s*[\]}])', lambda m: m.group(1), res)
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values

    Returns a closure mapping a quality id to its index in *quality_ids*
    (-1 for unknown ids).  Bug fix: the inner function scaffolding and the
    ``return q`` were missing.
    """
    def q(qid):
        try:
            return quality_ids.index(qid)
        except ValueError:
            return -1
    return q
# Default output filename template: "Title-id.ext".
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
def limit_length(s, length):
    """ Add ellipses to overly long strings

    Bug fix: the None guard, the ELLIPSES constant and the length check
    were missing, leaving the truncation expression orphaned.
    """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) > length:
        return s[:length - len(ELLIPSES)] + ELLIPSES
    return s
def version_tuple(v):
    """Split a dotted/dashed version string into a tuple of ints."""
    parts = re.split(r'[-.]', v)
    return tuple(map(int, parts))
def is_outdated_version(version, limit, assume_new=True):
    """Compare dotted version strings; unparsable or empty versions are
    treated per *assume_new*.

    Bug fix: the empty-version guard and the try/except around the
    comparison were missing, leaving three orphaned statements.
    """
    if not version:
        return not assume_new
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return not assume_new
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter
    running_from_zip = isinstance(globals().get('__loader__'), zipimporter)
    running_frozen = hasattr(sys, 'frozen')
    return running_from_zip or running_frozen
def args_to_str(args):
    # Get a short string representation for a subprocess command
    quoted = [shlex_quote(a) for a in args]
    return ' '.join(quoted)
def error_to_compat_str(err):
    """Stringify an exception, decoding byte messages on Python 2.

    Bug fix: the initial str() conversion and the final return were
    missing, so the function returned None.
    """
    err_str = str(err)
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    if sys.version_info[0] < 3:
        err_str = err_str.decode(preferredencoding())
    return err_str
def mimetype2ext(mt):
    """Map a MIME type to a conventional file extension."""
    # NOTE(review): the None guard, several special-cased full MIME types
    # and most of the subtype lookup table (plus its closing
    # ``}.get(res, res)``) appear to be missing from this copy.
    # Only the subtype (after the '/') is used for the table lookup.
    _, _, res = mt.rpartition('/')
        'smptett+xml': 'tt',
        'x-mp4-fragmented': 'mp4',
def urlhandle_detect_ext(url_handle):
    """Guess a file extension from a urllib response handle, preferring the
    Content-Disposition filename, then falling back to the Content-Type."""
    # NOTE(review): the ``try:`` pairing with the AttributeError handler,
    # the cd/m truthiness guards and the early ``return e`` appear to be
    # missing from this copy. Restore before use.
        getheader = lambda h: url_handle.headers[h]
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader
    cd = getheader('Content-Disposition')
    m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
    e = determine_ext(m.group('filename'), default_ext=None)
    return mimetype2ext(getheader('Content-Type'))
def encode_data_uri(data, mime_type):
    """Build a base64 ``data:`` URI for *data* with the given MIME type."""
    b64_payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, b64_payload)
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked

    Bug fix: the ``return False`` for the no-limit branch was missing,
    so an unset age limit fell through and returned a wrong result.
    """
    if age_limit is None:  # No limit set
        return False
    if content_limit is None:
        return False  # Content available for everyone
    return age_limit < content_limit
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes.

    Bug fix: the BOMS list scaffolding, the loop break and the non-BOM
    fallback ``else`` were missing.
    """
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            s = first_bytes[len(bom):].decode(enc, 'replace')
            break
    else:
        s = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', s)
def determine_protocol(info_dict):
    """Determine the download protocol for an extracted info dict."""
    # NOTE(review): the return statements for the explicit-protocol,
    # rtmp/mms/rtsp and m3u8/f4m-extension branches appear to be missing
    # from this copy. Restore before use.
    protocol = info_dict.get('protocol')
    if protocol is not None:
    url = info_dict['url']
    if url.startswith('rtmp'):
    elif url.startswith('mms'):
    elif url.startswith('rtsp'):
    ext = determine_ext(url)
    # Fallback: just use the URL scheme (http, https, ftp, ...).
    return compat_urllib_parse_urlparse(url).scheme
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    table = [header_row] + data
    widths = [max(len(compat_str(cell)) for cell in column) for column in zip(*table)]
    # Left-pad every column but the last to its widest value plus a space.
    pieces = ['%-' + compat_str(w + 1) + 's' for w in widths[:-1]]
    format_str = ' '.join(pieces) + '%s'
    lines = [format_str % tuple(row) for row in table]
    return '\n'.join(lines)
def _match_one(filter_part, dct):
    """Evaluate a single '--match-filter' clause against dict *dct*."""
    # NOTE(review): the COMPARISON_OPERATORS entries, parts of both
    # operator regexes, the ``if m:`` guards, the raise ValueError(
    # headers, the UNARY_OPERATORS dict opening and several assignments
    # appear to be missing from this copy. Restore before use.
    COMPARISON_OPERATORS = {
    operator_rex = re.compile(r'''(?x)\s*
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
        (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    op = COMPARISON_OPERATORS[m.group('op')]
    if m.group('strval') is not None:
        # Only equality operators make sense for string comparisons.
        if m.group('op') not in ('=', '!='):
            'Operator %s does not support string values!' % m.group('op'))
        comparison_value = m.group('strval')
        comparison_value = int(m.group('intval'))
        # Numeric values may carry SI/IEC suffixes ('500k', '2MiB').
        comparison_value = parse_filesize(m.group('intval'))
        if comparison_value is None:
            comparison_value = parse_filesize(m.group('intval') + 'B')
        if comparison_value is None:
            'Invalid integer value %r in filter part %r' % (
                m.group('intval'), filter_part))
    actual_value = dct.get(m.group('key'))
    if actual_value is None:
        # A trailing '?' on the operator makes missing keys match.
        return m.group('none_inclusive')
    return op(actual_value, comparison_value)
    '': lambda v: v is not None,
    '!': lambda v: v is None,
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    op = UNARY_OPERATORS[m.group('op')]
    actual_value = dct.get(m.group('key'))
    return op(actual_value)
    raise ValueError('Invalid filter part %r' % filter_part)
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false

    All '&'-separated clauses must match.  Bug fix: the ``return all(``
    wrapping the generator was missing.
    """
    return all(
        _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
def match_filter_func(filter_str):
    """Build a match-filter callable: returns None when the info dict
    passes, else a human-readable skip message.

    Bug fix: the pass-branch ``return None``, the else, and the final
    ``return _match_func`` were missing.
    """
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        else:
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
def parse_dfxp_time_expr(time_expr):
    """Parse a TTML/DFXP time expression ('1.5s' or 'HH:MM:SS.mmm') into
    seconds; returns None for empty or unrecognised input.

    Bug fix: the empty-input guard and the ``if mobj:`` guards around
    both matches were missing.
    """
    if not time_expr:
        return

    mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if mobj:
        return float(mobj.group('time_offset'))

    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if mobj:
        return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
def srt_subtitles_timecode(seconds):
    """Format a duration in seconds as an SRT timecode (HH:MM:SS,mmm)."""
    hours, remainder = divmod(seconds, 3600)
    minutes, secs = divmod(remainder, 60)
    millis = (seconds % 1) * 1000
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
def dfxp2srt(dfxp_data):
    """Convert DFXP/TTML subtitle XML into SRT text."""
    # Namespace-aware element lookup for both TTML namespace variants.
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
    # NOTE(review): the ns_map closing brace, the TTMLPElementParser
    # internals (out accumulator, end handler), the out list, the
    # begin_time/dur fallbacks and the final join/return appear to be
    # missing from this copy. Restore before use.

    class TTMLPElementParser(object):
        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):

        def data(self, data):
            return self.out.strip()

    def parse_node(node):
        # Re-feed the node through a parser driven by TTMLPElementParser to
        # flatten it into plain text.
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p')
        raise ValueError('Invalid dfxp/TTML subtitle')
    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
def cli_option(params, command_option, param):
    """Return [command_option, value] when *param* is set in *params*, else []."""
    value = params.get(param)
    if value is None:
        return []
    return [command_option, value]
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Render a boolean param as CLI args, either joined by *separator*
    ('--opt=true') or as two tokens (['--opt', 'true']).

    Bug fix: the ``if separator:`` guard was missing, leaving the joined
    form unreachable/orphaned.
    """
    param = params.get(param)
    assert isinstance(param, bool)
    if separator:
        return [command_option + separator + (true_value if param else false_value)]
    return [command_option, true_value if param else false_value]
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return [command_option] when params[param] equals *expected_value*, else []."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
def cli_configuration_args(params, param, default=[]):
    """Return the list of extra CLI args stored under *param*, or *default*.

    Bug fix: the None branch and the final return were missing.
    (default has a mutable default value, but it is only returned, never
    mutated.)
    """
    ex_args = params.get(param)
    if ex_args is None:
        return default
    assert isinstance(ex_args, list)
    return ex_args
class ISO639Utils(object):
    """Translate between ISO 639-1 (2-letter) and ISO 639-2/T (3-letter)
    language codes."""
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    # NOTE(review): the _lang_map table, the @classmethod decorators and
    # long2short's ``return short_name`` appear to be missing in this copy.
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        return cls._lang_map.get(code[:2])

    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        for short_name, long_name in cls._lang_map.items():
            if long_name == code:
class ISO3166Utils(object):
    """Map ISO 3166-1 alpha-2 country codes to full English country names."""
    # From http://data.okfn.org/data/core/country-list
    # NOTE(review): the ``_country_map = {`` opening, its closing brace,
    # many table entries and the @classmethod decorator of short2full
    # appear to be missing in this copy. Also note the mojibake in the
    # 'AX' entry ('Ã…land' should read 'Åland') — fix the encoding.
        'AF': 'Afghanistan',
        'AX': 'Ã…land Islands',
        'AS': 'American Samoa',
        'AG': 'Antigua and Barbuda',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BV': 'Bouvet Island',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BF': 'Burkina Faso',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CI': 'Côte d\'Ivoire',
        'CZ': 'Czech Republic',
        'DO': 'Dominican Republic',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GW': 'Guinea-Bissau',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'IR': 'Iran, Islamic Republic of',
        'IM': 'Isle of Man',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'LA': 'Lao People\'s Democratic Republic',
        'LI': 'Liechtenstein',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MH': 'Marshall Islands',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'PS': 'Palestine, State of',
        'PG': 'Papua New Guinea',
        'PH': 'Philippines',
        'PR': 'Puerto Rico',
        'RU': 'Russian Federation',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SL': 'Sierra Leone',
        'SX': 'Sint Maarten (Dutch part)',
        'SB': 'Solomon Islands',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'SJ': 'Svalbard and Jan Mayen',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TZ': 'Tanzania, United Republic of',
        'TL': 'Timor-Leste',
        'TT': 'Trinidad and Tobago',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
    def short2full(cls, code):
        """Convert an ISO 3166-2 country code to the corresponding full name"""
        return cls._country_map.get(code.upper())
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler that lets each request override the proxy via an
    internal 'Ytdl-request-proxy' header ('__noproxy__' disables it)."""

    def __init__(self, proxies=None):
        # Set default handlers
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            # Use the per-request proxy and strip the internal header
            # before the request goes out.  (Bug fix: the assignment was
            # missing, so the header was deleted but never applied.)
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''
    # The payload is interpreted little-endian, hence the [::-1] reversal.
    payload = int(binascii.hexlify(data[::-1]), 16)
    encrypted = pow(payload, exponent, modulus)
    return '%x' % encrypted
def encode_base_n(num, n, table=None):
    """Encode non-negative int *num* in base *n* using *table* (defaults to
    0-9a-zA-Z).

    Bug fix: the table-default guard, zero shortcut and digit loop
    scaffolding were missing.
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    ret = ''
    while num:
        ret = table[num % n] + ret
        num = num // n
    return ret
def decode_packed_codes(code):
    """Decode JavaScript obfuscated with Dean Edwards' p.a.c.k.e.r."""
    # NOTE(review): the re.search call wrapping the pattern, the int()
    # conversions of base/count and the while-loop header that fills
    # symbol_table appear to be missing in this copy. Restore before use.
        r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)",
    obfucasted_code, base, count, symbols = mobj.groups()
    symbols = symbols.split('|')
    # Each base-n token in the code maps back to a symbol in the list.
    base_n_count = encode_base_n(count, base)
    symbol_table[base_n_count] = symbols[count] or base_n_count
        r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],