Add an `UnrecoverableError` variant to `ChannelMonitorUpdateStatus`

author Matt Corallo <git@bluematt.me>

Thu, 14 Sep 2023 20:02:46 +0000 (20:02 +0000)

committer Matt Corallo <git@bluematt.me>

Thu, 21 Sep 2023 19:12:31 +0000 (19:12 +0000)
author Matt Corallo <git@bluematt.me>
Thu, 14 Sep 2023 20:02:46 +0000 (20:02 +0000)
committer Matt Corallo <git@bluematt.me>
Thu, 21 Sep 2023 19:12:31 +0000 (19:12 +0000)
diff --git a/lightning-persister/src/fs_store.rs b/lightning-persister/src/fs_store.rs

index 6725e16974b8f3dd1b781af12e819e95db2931cc..42b28018fe9202ce4435a41dc87907433b156e7d 100644 (file)
--- a/lightning-persister/src/fs_store.rs
+++ b/lightning-persister/src/fs_store.rs
@@ -470,7 +470,7 @@ mod tests {
                         index: 0
                 };
                 match store.persist_new_channel(test_txo, &added_monitors[0].1, update_id.2) {
-                       ChannelMonitorUpdateStatus::InProgress => {},
+                       ChannelMonitorUpdateStatus::UnrecoverableError => {},
                         _ => panic!("unexpected result from persisting new channel")
                 }
  
@@ -507,7 +507,7 @@ mod tests {
                         index: 0
                 };
                 match store.persist_new_channel(test_txo, &added_monitors[0].1, update_id.2) {
-                       ChannelMonitorUpdateStatus::InProgress => {},
+                       ChannelMonitorUpdateStatus::UnrecoverableError => {},
                         _ => panic!("unexpected result from persisting new channel")
                 }
  
diff --git a/lightning/src/chain/chainmonitor.rs b/lightning/src/chain/chainmonitor.rs

index 7dbf308a568c11c4abf45bc07ef1653aff6c4b51..b6909cb3e416890895367532455edb127fdc038c 100644 (file)
--- a/lightning/src/chain/chainmonitor.rs
+++ b/lightning/src/chain/chainmonitor.rs
@@ -78,26 +78,48 @@ impl MonitorUpdateId {
  /// `Persist` defines behavior for persisting channel monitors: this could mean
  /// writing once to disk, and/or uploading to one or more backup services.
  ///
-/// Each method can return two possible values:
-///  * If persistence (including any relevant `fsync()` calls) happens immediately, the
-///    implementation should return [`ChannelMonitorUpdateStatus::Completed`], indicating normal
-///    channel operation should continue.
+/// Persistence can happen in one of two ways - synchronously completing before the trait method
+/// calls return or asynchronously in the background.
  ///
-///  * If persistence happens asynchronously, implementations can return
-///    [`ChannelMonitorUpdateStatus::InProgress`] while the update continues in the background.
-///    Once the update completes, [`ChainMonitor::channel_monitor_updated`] should be called with
-///    the corresponding [`MonitorUpdateId`].
+/// # For those implementing synchronous persistence
  ///
-///    Note that unlike the direct [`chain::Watch`] interface,
-///    [`ChainMonitor::channel_monitor_updated`] must be called once for *each* update which occurs.
+///  * If persistence completes fully (including any relevant `fsync()` calls), the implementation
+///    should return [`ChannelMonitorUpdateStatus::Completed`], indicating normal channel operation
+///    should continue.
  ///
-///    If persistence fails for some reason, implementations should still return
-///    [`ChannelMonitorUpdateStatus::InProgress`] and attempt to shut down or otherwise resolve the
-///    situation ASAP.
+///  * If persistence fails for some reason, implementations should consider returning
+///    [`ChannelMonitorUpdateStatus::InProgress`] and retry all pending persistence operations in
+///    the background with [`ChainMonitor::list_pending_monitor_updates`] and
+///    [`ChainMonitor::get_monitor`].
  ///
-/// Third-party watchtowers may be built as a part of an implementation of this trait, with the
-/// advantage that you can control whether to resume channel operation depending on if an update
-/// has been persisted to a watchtower. For this, you may find the following methods useful:
+///    Once a full [`ChannelMonitor`] has been persisted, all pending updates for that channel can
+///    be marked as complete via [`ChainMonitor::channel_monitor_updated`].
+///
+///    If at some point no further progress can be made towards persisting the pending updates, the
+///    node should simply shut down.
+///
+///  * If the persistence has failed and cannot be retried further (e.g. because of some timeout),
+///    [`ChannelMonitorUpdateStatus::UnrecoverableError`] can be used, though this will result in
+///    an immediate panic and future operations in LDK generally failing.
+///
+/// # For those implementing asynchronous persistence
+///
+///  All calls should generally spawn a background task and immediately return
+///  [`ChannelMonitorUpdateStatus::InProgress`]. Once the update completes,
+///  [`ChainMonitor::channel_monitor_updated`] should be called with the corresponding
+///  [`MonitorUpdateId`].
+///
+///  Note that unlike the direct [`chain::Watch`] interface,
+///  [`ChainMonitor::channel_monitor_updated`] must be called once for *each* update which occurs.
+///
+///  If at some point no further progress can be made towards persisting a pending update, the node
+///  should simply shut down.
+///
+/// # Using remote watchtowers
+///
+/// Watchtowers may be updated as a part of an implementation of this trait, utilizing the async
+/// update process described above while the watchtower is being updated. The following methods are
+/// provided for building transactions for a watchtower:
  /// [`ChannelMonitor::initial_counterparty_commitment_tx`],
  /// [`ChannelMonitor::counterparty_commitment_txs_from_update`],
  /// [`ChannelMonitor::sign_to_local_justice_tx`], [`TrustedCommitmentTransaction::revokeable_output_index`],
@@ -279,11 +301,20 @@ where C::Target: chain::Filter,
         where
                 FN: Fn(&ChannelMonitor<ChannelSigner>, &TransactionData) -> Vec<TransactionOutputs>
         {
+               let err_str = "ChannelMonitor[Update] persistence failed unrecoverably. This indicates we cannot continue normal operation and must shut down.";
                 let funding_outpoints: HashSet<OutPoint> = HashSet::from_iter(self.monitors.read().unwrap().keys().cloned());
                 for funding_outpoint in funding_outpoints.iter() {
                         let monitor_lock = self.monitors.read().unwrap();
                         if let Some(monitor_state) = monitor_lock.get(funding_outpoint) {
-                               self.update_monitor_with_chain_data(header, best_height, txdata, &process, funding_outpoint, &monitor_state);
+                               if self.update_monitor_with_chain_data(header, best_height, txdata, &process, funding_outpoint, &monitor_state).is_err() {
+                                       // Take the monitors lock for writing so that we poison it and any future
+                                       // operations going forward fail immediately.
+                                       core::mem::drop(monitor_state);
+                                       core::mem::drop(monitor_lock);
+                                       let _poison = self.monitors.write().unwrap();
+                                       log_error!(self.logger, "{}", err_str);
+                                       panic!("{}", err_str);
+                               }
                         }
                 }
  
@@ -291,7 +322,10 @@ where C::Target: chain::Filter,
                 let monitor_states = self.monitors.write().unwrap();
                 for (funding_outpoint, monitor_state) in monitor_states.iter() {
                         if !funding_outpoints.contains(funding_outpoint) {
-                               self.update_monitor_with_chain_data(header, best_height, txdata, &process, funding_outpoint, &monitor_state);
+                               if self.update_monitor_with_chain_data(header, best_height, txdata, &process, funding_outpoint, &monitor_state).is_err() {
+                                       log_error!(self.logger, "{}", err_str);
+                                       panic!("{}", err_str);
+                               }
                         }
                 }
  
@@ -306,7 +340,10 @@ where C::Target: chain::Filter,
                 }
         }
  
-       fn update_monitor_with_chain_data<FN>(&self, header: &BlockHeader, best_height: Option<u32>, txdata: &TransactionData, process: FN, funding_outpoint: &OutPoint, monitor_state: &MonitorHolder<ChannelSigner>) where FN: Fn(&ChannelMonitor<ChannelSigner>, &TransactionData) -> Vec<TransactionOutputs> {
+       fn update_monitor_with_chain_data<FN>(
+               &self, header: &BlockHeader, best_height: Option<u32>, txdata: &TransactionData,
+               process: FN, funding_outpoint: &OutPoint, monitor_state: &MonitorHolder<ChannelSigner>
+       ) -> Result<(), ()> where FN: Fn(&ChannelMonitor<ChannelSigner>, &TransactionData) -> Vec<TransactionOutputs> {
                 let monitor = &monitor_state.monitor;
                 let mut txn_outputs;
                 {
@@ -331,7 +368,10 @@ where C::Target: chain::Filter,
                                 ChannelMonitorUpdateStatus::InProgress => {
                                         log_debug!(self.logger, "Channel Monitor sync for channel {} in progress, holding events until completion!", log_funding_info!(monitor));
                                         pending_monitor_updates.push(update_id);
-                               }
+                               },
+                               ChannelMonitorUpdateStatus::UnrecoverableError => {
+                                       return Err(());
+                               },
                         }
                 }
  
@@ -351,6 +391,7 @@ where C::Target: chain::Filter,
                                 }
                         }
                 }
+               Ok(())
         }
  
         /// Creates a new `ChainMonitor` used to watch on-chain activity pertaining to channels.
@@ -674,7 +715,12 @@ where C::Target: chain::Filter,
                         },
                         ChannelMonitorUpdateStatus::Completed => {
                                 log_info!(self.logger, "Persistence of new ChannelMonitor for channel {} completed", log_funding_info!(monitor));
-                       }
+                       },
+                       ChannelMonitorUpdateStatus::UnrecoverableError => {
+                               let err_str = "ChannelMonitor[Update] persistence failed unrecoverably. This indicates we cannot continue normal operation and must shut down.";
+                               log_error!(self.logger, "{}", err_str);
+                               panic!("{}", err_str);
+                       },
                 }
                 if let Some(ref chain_source) = self.chain_source {
                         monitor.load_outputs_to_watch(chain_source);
@@ -690,7 +736,7 @@ where C::Target: chain::Filter,
         fn update_channel(&self, funding_txo: OutPoint, update: &ChannelMonitorUpdate) -> ChannelMonitorUpdateStatus {
                 // Update the monitor that watches the channel referred to by the given outpoint.
                 let monitors = self.monitors.read().unwrap();
-               match monitors.get(&funding_txo) {
+               let ret = match monitors.get(&funding_txo) {
                         None => {
                                 log_error!(self.logger, "Failed to update channel monitor: no such monitor registered");
  
@@ -722,6 +768,7 @@ where C::Target: chain::Filter,
                                         ChannelMonitorUpdateStatus::Completed => {
                                                 log_debug!(self.logger, "Persistence of ChannelMonitorUpdate for channel {} completed", log_funding_info!(monitor));
                                         },
+                                       ChannelMonitorUpdateStatus::UnrecoverableError => { /* we'll panic in a moment */ },
                                 }
                                 if update_res.is_err() {
                                         ChannelMonitorUpdateStatus::InProgress
@@ -729,7 +776,17 @@ where C::Target: chain::Filter,
                                         persist_res
                                 }
                         }
+               };
+               if let ChannelMonitorUpdateStatus::UnrecoverableError = ret {
+                       // Take the monitors lock for writing so that we poison it and any future
+                       // operations going forward fail immediately.
+                       core::mem::drop(monitors);
+                       let _poison = self.monitors.write().unwrap();
+                       let err_str = "ChannelMonitor[Update] persistence failed unrecoverably. This indicates we cannot continue normal operation and must shut down.";
+                       log_error!(self.logger, "{}", err_str);
+                       panic!("{}", err_str);
                 }
+               ret
         }
  
         fn release_pending_monitor_events(&self) -> Vec<(OutPoint, Vec<MonitorEvent>, Option<PublicKey>)> {
@@ -973,4 +1030,26 @@ mod tests {
                 do_chainsync_pauses_events(false);
                 do_chainsync_pauses_events(true);
         }
+
+       #[test]
+       #[cfg(feature = "std")]
+       fn update_during_chainsync_poisons_channel() {
+               let chanmon_cfgs = create_chanmon_cfgs(2);
+               let node_cfgs = create_node_cfgs(2, &chanmon_cfgs);
+               let node_chanmgrs = create_node_chanmgrs(2, &node_cfgs, &[None, None]);
+               let nodes = create_network(2, &node_cfgs, &node_chanmgrs);
+               create_announced_chan_between_nodes(&nodes, 0, 1);
+
+               chanmon_cfgs[0].persister.chain_sync_monitor_persistences.lock().unwrap().clear();
+               chanmon_cfgs[0].persister.set_update_ret(ChannelMonitorUpdateStatus::UnrecoverableError);
+
+               assert!(std::panic::catch_unwind(|| {
+                       // Returning an UnrecoverableError should always panic immediately
+                       connect_blocks(&nodes[0], 1);
+               }).is_err());
+               assert!(std::panic::catch_unwind(|| {
+                       // ...and also poison our locks causing later use to panic as well
+                       core::mem::drop(nodes);
+               }).is_err());
+       }
  }
diff --git a/lightning/src/chain/mod.rs b/lightning/src/chain/mod.rs

index 1abaa77f3e4a88013a1b5550b05db4dbf6eed80a..89e0b155cf68b2db3f24c1a0b23f1e01aacb67b1 100644 (file)
--- a/lightning/src/chain/mod.rs
+++ b/lightning/src/chain/mod.rs
@@ -177,9 +177,14 @@ pub trait Confirm {
  
  /// An enum representing the status of a channel monitor update persistence.
  ///
-/// Note that there is no error variant - any failure to persist a [`ChannelMonitor`] should be
-/// retried indefinitely, the node shut down (as if we cannot update stored data we can't do much
-/// of anything useful).
+/// These are generally used as the return value for an implementation of [`Persist`] which is used
+/// as the storage layer for a [`ChainMonitor`]. See the docs on [`Persist`] for a high-level
+/// explanation of how to handle different cases.
+///
+/// While `UnrecoverableError` is provided as a failure variant, it is not truly "handled" on the
+/// calling side, and generally results in an immediate panic. For those who prefer to avoid
+/// panics, `InProgress` can be used and you can retry the update operation in the background or
+/// shut down cleanly.
  ///
  /// Note that channels should generally *not* be force-closed after a persistence failure.
  /// Force-closing with the latest [`ChannelMonitorUpdate`] applied may result in a transaction
@@ -187,6 +192,8 @@ pub trait Confirm {
  /// latest [`ChannelMonitor`] is not durably persisted anywhere and exists only in memory, naively
  /// calling [`ChannelManager::force_close_broadcasting_latest_txn`] *may result in loss of funds*!
  ///
+/// [`Persist`]: chainmonitor::Persist
+/// [`ChainMonitor`]: chainmonitor::ChainMonitor
  /// [`ChannelManager::force_close_broadcasting_latest_txn`]: crate::ln::channelmanager::ChannelManager::force_close_broadcasting_latest_txn
  #[derive(Clone, Copy, Debug, PartialEq, Eq)]
  pub enum ChannelMonitorUpdateStatus {
@@ -212,8 +219,8 @@ pub enum ChannelMonitorUpdateStatus {
         /// until a [`MonitorEvent::Completed`] is provided, even if you return no error on a later
         /// monitor update for the same channel.
         ///
-       /// For deployments where a copy of ChannelMonitors and other local state are backed up in a
-       /// remote location (with local copies persisted immediately), it is anticipated that all
+       /// For deployments where a copy of [`ChannelMonitor`]s and other local state are backed up in
+       /// a remote location (with local copies persisted immediately), it is anticipated that all
         /// updates will return [`InProgress`] until the remote copies could be updated.
         ///
         /// Note that while fully asynchronous persistence of [`ChannelMonitor`] data is generally
@@ -222,6 +229,18 @@ pub enum ChannelMonitorUpdateStatus {
         ///
         /// [`InProgress`]: ChannelMonitorUpdateStatus::InProgress
         InProgress,
+       /// Indicates that an update has failed and will not complete at any point in the future.
+       ///
+       /// Currently returning this variant will cause LDK to immediately panic to encourage immediate
+       /// shutdown. In the future this may be updated to disconnect peers and refuse to continue
+       /// normal operation without a panic.
+       ///
+       /// Applications which wish to perform an orderly shutdown after failure should consider
+       /// returning [`InProgress`] instead and simply shut down without ever marking the update
+       /// complete.
+       ///
+       /// [`InProgress`]: ChannelMonitorUpdateStatus::InProgress
+       UnrecoverableError,
  }
  
  /// The `Watch` trait defines behavior for watching on-chain activity pertaining to channels as
@@ -261,8 +280,10 @@ pub trait Watch<ChannelSigner: WriteableEcdsaChannelSigner> {
         /// on-chain or the [`ChannelMonitor`] having decided to do so and broadcasted a transaction),
         /// and the [`ChannelManager`] state will be updated once it sees the funding spend on-chain.
         ///
-       /// If persistence fails, this should return [`ChannelMonitorUpdateStatus::InProgress`] and
-       /// the node should shut down immediately.
+       /// In general, persistence failures should be retried after returning
+       /// [`ChannelMonitorUpdateStatus::InProgress`] and eventually complete. If a failure truly
+       /// cannot be retried, the node should shut down immediately after returning
+       /// [`ChannelMonitorUpdateStatus::UnrecoverableError`], see its documentation for more info.
         ///
         /// [`ChannelManager`]: crate::ln::channelmanager::ChannelManager
         fn update_channel(&self, funding_txo: OutPoint, update: &ChannelMonitorUpdate) -> ChannelMonitorUpdateStatus;
diff --git a/lightning/src/ln/channelmanager.rs b/lightning/src/ln/channelmanager.rs

index 747a78e8c918309bb1580c5fee146e2ad884911e..8b524d8f3f4d81647f57853b618124960fd8ff84 100644 (file)
--- a/lightning/src/ln/channelmanager.rs
+++ b/lightning/src/ln/channelmanager.rs
@@ -2045,6 +2045,11 @@ macro_rules! handle_new_monitor_update {
         ($self: ident, $update_res: expr, $chan: expr, _internal, $completed: expr) => { {
                 debug_assert!($self.background_events_processed_since_startup.load(Ordering::Acquire));
                 match $update_res {
+                       ChannelMonitorUpdateStatus::UnrecoverableError => {
+                               let err_str = "ChannelMonitor[Update] persistence failed unrecoverably. This indicates we cannot continue normal operation and must shut down.";
+                               log_error!($self.logger, "{}", err_str);
+                               panic!("{}", err_str);
+                       },
                         ChannelMonitorUpdateStatus::InProgress => {
                                 log_debug!($self.logger, "ChannelMonitor update for {} in flight, holding messages until the update completes.",
                                         &$chan.context.channel_id());
diff --git a/lightning/src/ln/functional_test_utils.rs b/lightning/src/ln/functional_test_utils.rs

index 2c4c3c0c2676f05ca3e03961b8db5b8e1df40a0b..5a1a21e2999863b9f6af7b75da6c467b4228e7f3 100644 (file)
--- a/lightning/src/ln/functional_test_utils.rs
+++ b/lightning/src/ln/functional_test_utils.rs
@@ -422,6 +422,10 @@ pub struct Node<'chan_man, 'node_cfg: 'chan_man, 'chan_mon_cfg: 'node_cfg> {
                 &'chan_mon_cfg test_utils::TestLogger,
         >,
  }
+#[cfg(feature = "std")]
+impl<'a, 'b, 'c> std::panic::UnwindSafe for Node<'a, 'b, 'c> {}
+#[cfg(feature = "std")]
+impl<'a, 'b, 'c> std::panic::RefUnwindSafe for Node<'a, 'b, 'c> {}
  impl<'a, 'b, 'c> Node<'a, 'b, 'c> {
         pub fn best_block_hash(&self) -> BlockHash {
                 self.blocks.lock().unwrap().last().unwrap().0.block_hash()
diff --git a/lightning/src/util/persist.rs b/lightning/src/util/persist.rs

index af66e1233aba2965efafac46eff0e0a04bfd9fa3..431c62c9fb83de88830319e6b87b2f7370ac7f90 100644 (file)
--- a/lightning/src/util/persist.rs
+++ b/lightning/src/util/persist.rs
@@ -174,8 +174,8 @@ impl<'a, A: KVStore, M: Deref, T: Deref, ES: Deref, NS: Deref, SP: Deref, F: Der
  impl<ChannelSigner: WriteableEcdsaChannelSigner, K: KVStore> Persist<ChannelSigner> for K {
         // TODO: We really need a way for the persister to inform the user that its time to crash/shut
         // down once these start returning failure.
-       // An InProgress result implies we should probably just shut down the node since we're not
-       // retrying persistence!
+       // Once such a mechanism exists, we should return InProgress rather than UnrecoverableError,
+       // which implies we should probably just shut down the node since we're not retrying persistence!
  
         fn persist_new_channel(&self, funding_txo: OutPoint, monitor: &ChannelMonitor<ChannelSigner>, _update_id: MonitorUpdateId) -> chain::ChannelMonitorUpdateStatus {
                 let key = format!("{}_{}", funding_txo.txid.to_hex(), funding_txo.index);
@@ -185,7 +185,7 @@ impl<ChannelSigner: WriteableEcdsaChannelSigner, K: KVStore> Persist<ChannelSign
                         &key, &monitor.encode())
                 {
                         Ok(()) => chain::ChannelMonitorUpdateStatus::Completed,
-                       Err(_) => chain::ChannelMonitorUpdateStatus::InProgress
+                       Err(_) => chain::ChannelMonitorUpdateStatus::UnrecoverableError
                 }
         }
  
@@ -197,7 +197,7 @@ impl<ChannelSigner: WriteableEcdsaChannelSigner, K: KVStore> Persist<ChannelSign
                         &key, &monitor.encode())
                 {
                         Ok(()) => chain::ChannelMonitorUpdateStatus::Completed,
-                       Err(_) => chain::ChannelMonitorUpdateStatus::InProgress
+                       Err(_) => chain::ChannelMonitorUpdateStatus::UnrecoverableError
                 }
         }
  }
author	Matt Corallo <git@bluematt.me>
	Thu, 14 Sep 2023 20:02:46 +0000 (20:02 +0000)
committer	Matt Corallo <git@bluematt.me>
	Thu, 21 Sep 2023 19:12:31 +0000 (19:12 +0000)
lightning-persister/src/fs_store.rs		patch \| blob \| history
lightning/src/chain/chainmonitor.rs		patch \| blob \| history
lightning/src/chain/mod.rs		patch \| blob \| history
lightning/src/ln/channelmanager.rs		patch \| blob \| history
lightning/src/ln/functional_test_utils.rs		patch \| blob \| history
lightning/src/util/persist.rs		patch \| blob \| history