From 8f5023a006ed0bdc08432c0c283e28d77ab3bc72 Mon Sep 17 00:00:00 2001
From: Matt Corallo <git@bluematt.me>
Date: Sun, 26 Sep 2021 00:09:17 +0000
Subject: [PATCH] Avoid disconnecting all peers if user code is slow

In the sample client (and likely other downstream users), event
processing may block on slow operations (e.g. Bitcoin Core RPCs)
and ChannelManager persistence may take some time. This should be
fine, except that we consider this a case of possible backgrounding
and disconnect all of our peers when it happens.

Instead, we here avoid considering event processing time in the
time between PeerManager events.
---
 lightning-background-processor/src/lib.rs | 23 +++++++++++++++++------
 1 file changed, 17 insertions(+), 6 deletions(-)

diff --git a/lightning-background-processor/src/lib.rs b/lightning-background-processor/src/lib.rs
index 2dbc8053b..363dfba93 100644
--- a/lightning-background-processor/src/lib.rs
+++ b/lightning-background-processor/src/lib.rs
@@ -219,11 +219,17 @@ impl BackgroundProcessor {
 			let mut have_pruned = false;
 
 			loop {
-				peer_manager.process_events();
+				peer_manager.process_events(); // Note that this may block on ChannelManager's locking
 				channel_manager.process_pending_events(&event_handler);
 				chain_monitor.process_pending_events(&event_handler);
+
+				// We wait up to 100ms, but track how long it takes to detect being put to sleep,
+				// see `await_start`'s use below.
+				let await_start = Instant::now();
 				let updates_available =
 					channel_manager.await_persistable_update_timeout(Duration::from_millis(100));
+				let await_time = await_start.elapsed();
+
 				if updates_available {
 					log_trace!(logger, "Persisting ChannelManager...");
 					persister.persist_manager(&*channel_manager)?;
@@ -239,15 +245,20 @@ impl BackgroundProcessor {
 					channel_manager.timer_tick_occurred();
 					last_freshness_call = Instant::now();
 				}
-				if last_ping_call.elapsed().as_secs() > PING_TIMER * 2 {
+				if await_time > Duration::from_secs(1) {
 					// On various platforms, we may be starved of CPU cycles for several reasons.
 					// E.g. on iOS, if we've been in the background, we will be entirely paused.
 					// Similarly, if we're on a desktop platform and the device has been asleep, we
 					// may not get any cycles.
-					// In any case, if we've been entirely paused for more than double our ping
-					// timer, we should have disconnected all sockets by now (and they're probably
-					// dead anyway), so disconnect them by calling `timer_tick_occurred()` twice.
-					log_trace!(logger, "Awoke after more than double our ping timer, disconnecting peers.");
+					// We detect this by checking if our max-100ms-sleep, above, ran longer than a
+					// full second, at which point we assume sockets may have been killed (they
+					// appear to be at least on some platforms, even if it has only been a second).
+					// Note that we have to take care to not get here just because user event
+					// processing was slow at the top of the loop. For example, the sample client
+					// may call Bitcoin Core RPCs during event handling, which very often takes
+					// more than a handful of seconds to complete, and shouldn't disconnect all our
+					// peers.
+					log_trace!(logger, "100ms sleep took more than a second, disconnecting peers.");
 					peer_manager.disconnect_all_peers();
 					last_ping_call = Instant::now();
 				} else if last_ping_call.elapsed().as_secs() > PING_TIMER {
-- 
2.39.5