From bf28957f6d488d1258a1e6f21af29344b18e2b75 Mon Sep 17 00:00:00 2001
From: G8XSU <3442979+G8XSU@users.noreply.github.com>
Date: Fri, 14 Jun 2024 16:56:36 -0700
Subject: [PATCH] Optimize ChannelMonitor persistence on block connections.

Currently, every block connection triggers the persistence of all
ChannelMonitors with an updated best_block. This approach poses
challenges for large node operators managing thousands of channels.
Furthermore, it leads to a thundering herd problem
(https://en.wikipedia.org/wiki/Thundering_herd_problem), overwhelming
the storage with simultaneous write requests.

To address this, we now persist ChannelMonitors at a regular cadence,
spreading their persistence across blocks to mitigate spikes in write
operations.

Outcome: With this change, LDK's IO footprint should be reduced by
roughly 50x. The time required to sync each block should also drop
significantly, particularly for nodes with thousands of channels, since
write latency plays a large role in block processing. As a result, the
node/ChainMonitor is blocked for a shorter duration, yielding further
efficiency gains.
---
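Note (illustration only, not part of the patch): the sketch below is a
minimal, self-contained restatement of the partitioning logic added to
update_monitor_with_chain_data further down. The helper names
(partition_key, should_persist_this_block) are invented for this example,
and the funding txid is modeled as plain bytes rather than the bitcoin
crate's Txid type.

	// Derives a per-channel key from the funding txid and offsets it by the
	// current best block height, so each monitor lands in a different
	// "bucket" on every block.
	fn partition_key(funding_txid: &[u8; 32], best_height: u32) -> u32 {
		let txid_prefix = u32::from_be_bytes([
			funding_txid[0], funding_txid[1], funding_txid[2], funding_txid[3],
		]);
		txid_prefix.wrapping_add(best_height)
	}

	// Decides whether a monitor should be persisted on this block connection:
	// monitors with pending claims are always persisted, everything else is
	// persisted roughly once every `partition_factor` blocks.
	fn should_persist_this_block(
		funding_txid: &[u8; 32], best_height: u32, channel_count: usize,
		has_pending_claims: bool,
	) -> bool {
		let partition_factor = if channel_count < 15 { 5 } else { 50 };
		has_pending_claims || partition_key(funding_txid, best_height) % partition_factor == 0
	}

	fn main() {
		// With 1,000 channels and a partition factor of 50, roughly
		// 1,000 / 50 = 20 monitors are persisted per block instead of all
		// 1,000, and any given monitor is persisted about every 50 blocks
		// (~8 hours at ~10 minutes per block).
		let txid = [0xab; 32];
		for height in 840_000u32..840_005 {
			println!(
				"height {height}: persist = {}",
				should_persist_this_block(&txid, height, 1_000, false)
			);
		}
	}

With the usual ~10-minute block interval, a partition factor of 50 means a
monitor's best_block is persisted roughly every eight hours unless it has
pending claims, in which case it is persisted on every block connection.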
 lightning/src/chain/chainmonitor.rs   | 120 +++++++++++++++++++++++---
 lightning/src/chain/channelmonitor.rs |   6 ++
 lightning/src/chain/onchaintx.rs      |   7 ++
 3 files changed, 121 insertions(+), 12 deletions(-)

diff --git a/lightning/src/chain/chainmonitor.rs b/lightning/src/chain/chainmonitor.rs
index 58bf5f80..e6bb9d90 100644
--- a/lightning/src/chain/chainmonitor.rs
+++ b/lightning/src/chain/chainmonitor.rs
@@ -44,6 +44,7 @@ use crate::prelude::*;
 use crate::sync::{RwLock, RwLockReadGuard, Mutex, MutexGuard};
 use core::ops::Deref;
 use core::sync::atomic::{AtomicUsize, Ordering};
+use bitcoin::hashes::Hash;
 use bitcoin::secp256k1::PublicKey;
 
 /// `Persist` defines behavior for persisting channel monitors: this could mean
@@ -260,10 +261,11 @@ where C::Target: chain::Filter,
 	{
 		let err_str = "ChannelMonitor[Update] persistence failed unrecoverably. This indicates we cannot continue normal operation and must shut down.";
 		let funding_outpoints = hash_set_from_iter(self.monitors.read().unwrap().keys().cloned());
+		let channel_count = funding_outpoints.len();
 		for funding_outpoint in funding_outpoints.iter() {
 			let monitor_lock = self.monitors.read().unwrap();
 			if let Some(monitor_state) = monitor_lock.get(funding_outpoint) {
-				if self.update_monitor_with_chain_data(header, txdata, &process, funding_outpoint, &monitor_state).is_err() {
+				if self.update_monitor_with_chain_data(header, best_height, txdata, &process, funding_outpoint, &monitor_state, channel_count).is_err() {
 					// Take the monitors lock for writing so that we poison it and any future
 					// operations going forward fail immediately.
 					core::mem::drop(monitor_lock);
@@ -278,7 +280,7 @@ where C::Target: chain::Filter,
 		let monitor_states = self.monitors.write().unwrap();
 		for (funding_outpoint, monitor_state) in monitor_states.iter() {
 			if !funding_outpoints.contains(funding_outpoint) {
-				if self.update_monitor_with_chain_data(header, txdata, &process, funding_outpoint, &monitor_state).is_err() {
+				if self.update_monitor_with_chain_data(header, best_height, txdata, &process, funding_outpoint, &monitor_state, channel_count).is_err() {
 					log_error!(self.logger, "{}", err_str);
 					panic!("{}", err_str);
 				}
@@ -297,14 +299,29 @@ where C::Target: chain::Filter,
 	}
 
 	fn update_monitor_with_chain_data<FN>(
-		&self, header: &Header, txdata: &TransactionData, process: FN, funding_outpoint: &OutPoint,
-		monitor_state: &MonitorHolder<ChannelSigner>
+		&self, header: &Header, best_height: Option<u32>, txdata: &TransactionData, process: FN, funding_outpoint: &OutPoint,
+		monitor_state: &MonitorHolder<ChannelSigner>, channel_count: usize,
 	) -> Result<(), ()> where FN: Fn(&ChannelMonitor<ChannelSigner>, &TransactionData) -> Vec<TransactionOutputs> {
 		let monitor = &monitor_state.monitor;
 		let logger = WithChannelMonitor::from(&self.logger, &monitor, None);
-		let mut txn_outputs;
-		{
-			txn_outputs = process(monitor, txdata);
+
+		let mut txn_outputs = process(monitor, txdata);
+
+		let get_partition_key = |funding_outpoint: &OutPoint| {
+			let funding_txid_hash = funding_outpoint.txid.to_raw_hash();
+			let funding_txid_hash_bytes = funding_txid_hash.as_byte_array();
+			let funding_txid_u32 = u32::from_be_bytes([funding_txid_hash_bytes[0], funding_txid_hash_bytes[1], funding_txid_hash_bytes[2], funding_txid_hash_bytes[3]]);
+			funding_txid_u32.wrapping_add(best_height.unwrap_or_default())
+		};
+
+		let partition_factor = if channel_count < 15 {
+			5
+		} else {
+			50 // ~8 hours
+		};
+
+		let has_pending_claims = monitor_state.monitor.has_pending_claims();
+		if has_pending_claims || get_partition_key(funding_outpoint) % partition_factor == 0 {
 			log_trace!(logger, "Syncing Channel Monitor for channel {}", log_funding_info!(monitor));
 			match self.persister.update_persisted_channel(*funding_outpoint, None, monitor) {
 				ChannelMonitorUpdateStatus::Completed =>
@@ -313,10 +330,10 @@ where C::Target: chain::Filter,
 					),
 				ChannelMonitorUpdateStatus::InProgress => {
 					log_trace!(logger, "Channel Monitor sync for channel {} in progress.", log_funding_info!(monitor));
-				},
+				}
 				ChannelMonitorUpdateStatus::UnrecoverableError => {
 					return Err(());
-				},
+				}
 			}
 		}
 
diff --git a/lightning/src/chain/channelmonitor.rs b/lightning/src/chain/channelmonitor.rs
--- a/lightning/src/chain/channelmonitor.rs
+++ b/lightning/src/chain/channelmonitor.rs
@@ -870,14 +887,17 @@ impl ChannelMonitor {
 		);
 	}
 
+	/// Returns true if the monitor has pending claim requests that are not fully confirmed yet.
+	pub fn has_pending_claims(&self) -> bool
+	{
+		self.inner.lock().unwrap().onchain_tx_handler.has_pending_claims()
+	}
+
 	/// Triggers rebroadcasts of pending claims from a force-closed channel after a transaction
 	/// signature generation failure.
 	pub fn signer_unblocked(
diff --git a/lightning/src/chain/onchaintx.rs b/lightning/src/chain/onchaintx.rs
index e5757cfd..07a75361 100644
--- a/lightning/src/chain/onchaintx.rs
+++ b/lightning/src/chain/onchaintx.rs
@@ -535,6 +535,13 @@ impl OnchainTxHandler {
 		}
 	}
 
+	/// Returns true if we are currently tracking any pending claim requests that are not fully
+	/// confirmed yet.
+	pub(super) fn has_pending_claims(&self) -> bool
+	{
+		self.pending_claim_requests.len() != 0
+	}
+
 	/// Lightning security model (i.e being able to redeem/timeout HTLC or penalize counterparty
 	/// onchain) lays on the assumption of claim transactions getting confirmed before timelock
 	/// expiration (CSV or CLTV following cases). In case of high-fee spikes, claim tx may get stuck
-- 
2.30.2