From 8f5023a006ed0bdc08432c0c283e28d77ab3bc72 Mon Sep 17 00:00:00 2001 From: Matt Corallo Date: Sun, 26 Sep 2021 00:09:17 +0000 Subject: [PATCH] Avoid disconnecting all peers if user code is slow In the sample client (and likely other downstream users), event processing may block on slow operations (e.g. Bitcoin Core RPCs) and ChannelManager persistence may take some time. This should be fine, except that we consider this a case of possible backgrounding and disconnect all of our peers when it happens. Instead, we here avoid considering event processing time in the time between PeerManager events. --- lightning-background-processor/src/lib.rs | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/lightning-background-processor/src/lib.rs b/lightning-background-processor/src/lib.rs index 2dbc8053b..363dfba93 100644 --- a/lightning-background-processor/src/lib.rs +++ b/lightning-background-processor/src/lib.rs @@ -219,11 +219,17 @@ impl BackgroundProcessor { let mut have_pruned = false; loop { - peer_manager.process_events(); + peer_manager.process_events(); // Note that this may block on ChannelManager's locking channel_manager.process_pending_events(&event_handler); chain_monitor.process_pending_events(&event_handler); + + // We wait up to 100ms, but track how long it takes to detect being put to sleep, + // see `await_start`'s use below. + let await_start = Instant::now(); let updates_available = channel_manager.await_persistable_update_timeout(Duration::from_millis(100)); + let await_time = await_start.elapsed(); + if updates_available { log_trace!(logger, "Persisting ChannelManager..."); persister.persist_manager(&*channel_manager)?; @@ -239,15 +245,20 @@ impl BackgroundProcessor { channel_manager.timer_tick_occurred(); last_freshness_call = Instant::now(); } - if last_ping_call.elapsed().as_secs() > PING_TIMER * 2 { + if await_time > Duration::from_secs(1) { // On various platforms, we may be starved of CPU cycles for several reasons. // E.g. on iOS, if we've been in the background, we will be entirely paused. // Similarly, if we're on a desktop platform and the device has been asleep, we // may not get any cycles. - // In any case, if we've been entirely paused for more than double our ping - // timer, we should have disconnected all sockets by now (and they're probably - // dead anyway), so disconnect them by calling `timer_tick_occurred()` twice. - log_trace!(logger, "Awoke after more than double our ping timer, disconnecting peers."); + // We detect this by checking if our max-100ms-sleep, above, ran longer than a + // full second, at which point we assume sockets may have been killed (they + // appear to be at least on some platforms, even if it has only been a second). + // Note that we have to take care to not get here just because user event + // processing was slow at the top of the loop. For example, the sample client + // may call Bitcoin Core RPCs during event handling, which very often takes + // more than a handful of seconds to complete, and shouldn't disconnect all our + // peers. + log_trace!(logger, "100ms sleep took more than a second, disconnecting peers."); peer_manager.disconnect_all_peers(); last_ping_call = Instant::now(); } else if last_ping_call.elapsed().as_secs() > PING_TIMER { -- 2.39.5