From 23c5308bcbea61d89b1e5d3df6b6da721bc74c2a Mon Sep 17 00:00:00 2001 From: Matt Corallo Date: Sun, 10 Sep 2023 17:14:32 +0000 Subject: [PATCH] Drop the `ChannelMonitorUpdateStatus::PermanentFailure` variant When a `ChannelMonitorUpdate` fails to apply, it generally means we cannot reach our storage backend. This, in general, is a critical issue, but is often only a transient issue. Sadly, users see the failure variant and return it on any I/O error, resulting in channel force-closures due to transient issues. Users don't generally expect force-closes in most cases, and luckily with async `ChannelMonitorUpdate`s supported we don't take any risk by "delaying" the `ChannelMonitorUpdate` indefinitely. Thus, here we drop the `PermanentFailure` variant entirely, making all failures instead be "the update is in progress, but won't ever complete", which is equivalent if we do not close the channel automatically. --- fuzz/src/chanmon_consistency.rs | 4 +- lightning-persister/src/fs_store.rs | 6 +- lightning/src/chain/chainmonitor.rs | 74 +++-------- lightning/src/chain/channelmonitor.rs | 32 ++--- lightning/src/chain/mod.rs | 72 +++------- lightning/src/ln/chanmon_update_fail_tests.rs | 123 ++---------------- lightning/src/ln/channelmanager.rs | 106 +++++++-------- lightning/src/ln/functional_test_utils.rs | 4 +- lightning/src/ln/functional_tests.rs | 12 +- lightning/src/ln/reload_tests.rs | 2 +- lightning/src/util/persist.rs | 8 +- lightning/src/util/test_utils.rs | 2 +- 12 files changed, 129 insertions(+), 316 deletions(-) diff --git a/fuzz/src/chanmon_consistency.rs b/fuzz/src/chanmon_consistency.rs index b6d41fb99..6a2007165 100644 --- a/fuzz/src/chanmon_consistency.rs +++ b/fuzz/src/chanmon_consistency.rs @@ -138,7 +138,7 @@ impl TestChainMonitor { } } impl chain::Watch for TestChainMonitor { - fn watch_channel(&self, funding_txo: OutPoint, monitor: channelmonitor::ChannelMonitor) -> chain::ChannelMonitorUpdateStatus { + fn watch_channel(&self, funding_txo: OutPoint, monitor: channelmonitor::ChannelMonitor) -> Result { let mut ser = VecWriter(Vec::new()); monitor.write(&mut ser).unwrap(); if let Some(_) = self.latest_monitors.lock().unwrap().insert(funding_txo, (monitor.get_latest_update_id(), ser.0)) { @@ -500,7 +500,7 @@ pub fn do_test(data: &[u8], underlying_out: Out) { let res = (<(BlockHash, ChanMan)>::read(&mut Cursor::new(&$ser.0), read_args).expect("Failed to read manager").1, chain_monitor.clone()); for (funding_txo, mon) in monitors.drain() { assert_eq!(chain_monitor.chain_monitor.watch_channel(funding_txo, mon), - ChannelMonitorUpdateStatus::Completed); + Ok(ChannelMonitorUpdateStatus::Completed)); } res } } diff --git a/lightning-persister/src/fs_store.rs b/lightning-persister/src/fs_store.rs index 81f9709c4..6725e1697 100644 --- a/lightning-persister/src/fs_store.rs +++ b/lightning-persister/src/fs_store.rs @@ -436,7 +436,7 @@ mod tests { } // Test that if the store's path to channel data is read-only, writing a - // monitor to it results in the store returning a PermanentFailure. + // monitor to it results in the store returning an InProgress. // Windows ignores the read-only flag for folders, so this test is Unix-only. 
#[cfg(not(target_os = "windows"))] #[test] @@ -470,7 +470,7 @@ mod tests { index: 0 }; match store.persist_new_channel(test_txo, &added_monitors[0].1, update_id.2) { - ChannelMonitorUpdateStatus::PermanentFailure => {}, + ChannelMonitorUpdateStatus::InProgress => {}, _ => panic!("unexpected result from persisting new channel") } @@ -507,7 +507,7 @@ mod tests { index: 0 }; match store.persist_new_channel(test_txo, &added_monitors[0].1, update_id.2) { - ChannelMonitorUpdateStatus::PermanentFailure => {}, + ChannelMonitorUpdateStatus::InProgress => {}, _ => panic!("unexpected result from persisting new channel") } diff --git a/lightning/src/chain/chainmonitor.rs b/lightning/src/chain/chainmonitor.rs index b3b62a2e8..e17e2dbd3 100644 --- a/lightning/src/chain/chainmonitor.rs +++ b/lightning/src/chain/chainmonitor.rs @@ -78,7 +78,7 @@ impl MonitorUpdateId { /// `Persist` defines behavior for persisting channel monitors: this could mean /// writing once to disk, and/or uploading to one or more backup services. /// -/// Each method can return three possible values: +/// Each method can return two possible values: /// * If persistence (including any relevant `fsync()` calls) happens immediately, the /// implementation should return [`ChannelMonitorUpdateStatus::Completed`], indicating normal /// channel operation should continue. @@ -91,10 +91,9 @@ impl MonitorUpdateId { /// Note that unlike the direct [`chain::Watch`] interface, /// [`ChainMonitor::channel_monitor_updated`] must be called once for *each* update which occurs. /// -/// * If persistence fails for some reason, implementations should return -/// [`ChannelMonitorUpdateStatus::PermanentFailure`], in which case the channel will likely be -/// closed without broadcasting the latest state. See -/// [`ChannelMonitorUpdateStatus::PermanentFailure`] for more details. +/// If persistence fails for some reason, implementations should still return +/// [`ChannelMonitorUpdateStatus::InProgress`] and attempt to shut down or otherwise resolve the +/// situation ASAP. /// /// Third-party watchtowers may be built as a part of an implementation of this trait, with the /// advantage that you can control whether to resume channel operation depending on if an update @@ -335,11 +334,6 @@ where C::Target: chain::Filter, match self.persister.update_persisted_channel(*funding_outpoint, None, monitor, update_id) { ChannelMonitorUpdateStatus::Completed => log_trace!(self.logger, "Finished syncing Channel Monitor for channel {}", log_funding_info!(monitor)), - ChannelMonitorUpdateStatus::PermanentFailure => { - monitor_state.channel_perm_failed.store(true, Ordering::Release); - self.pending_monitor_events.lock().unwrap().push((*funding_outpoint, vec![MonitorEvent::UpdateFailed(*funding_outpoint)], monitor.get_counterparty_node_id())); - self.event_notifier.notify(); - } ChannelMonitorUpdateStatus::InProgress => { log_debug!(self.logger, "Channel Monitor sync for channel {} in progress, holding events until completion!", log_funding_info!(monitor)); pending_monitor_updates.push(update_id); @@ -673,12 +667,12 @@ where C::Target: chain::Filter, /// /// Note that we persist the given `ChannelMonitor` while holding the `ChainMonitor` /// monitors lock. 
- fn watch_channel(&self, funding_outpoint: OutPoint, monitor: ChannelMonitor) -> ChannelMonitorUpdateStatus { + fn watch_channel(&self, funding_outpoint: OutPoint, monitor: ChannelMonitor) -> Result { let mut monitors = self.monitors.write().unwrap(); let entry = match monitors.entry(funding_outpoint) { hash_map::Entry::Occupied(_) => { log_error!(self.logger, "Failed to add new channel data: channel monitor for given outpoint is already present"); - return ChannelMonitorUpdateStatus::PermanentFailure + return Err(()); }, hash_map::Entry::Vacant(e) => e, }; @@ -691,10 +685,6 @@ where C::Target: chain::Filter, log_info!(self.logger, "Persistence of new ChannelMonitor for channel {} in progress", log_funding_info!(monitor)); pending_monitor_updates.push(update_id); }, - ChannelMonitorUpdateStatus::PermanentFailure => { - log_error!(self.logger, "Persistence of new ChannelMonitor for channel {} failed", log_funding_info!(monitor)); - return persist_res; - }, ChannelMonitorUpdateStatus::Completed => { log_info!(self.logger, "Persistence of new ChannelMonitor for channel {} completed", log_funding_info!(monitor)); } @@ -708,7 +698,7 @@ where C::Target: chain::Filter, channel_perm_failed: AtomicBool::new(false), last_chain_persist_height: AtomicUsize::new(self.highest_chain_height.load(Ordering::Acquire)), }); - persist_res + Ok(persist_res) } /// Note that we persist the given `ChannelMonitor` update while holding the @@ -723,10 +713,10 @@ where C::Target: chain::Filter, // We should never ever trigger this from within ChannelManager. Technically a // user could use this object with some proxying in between which makes this // possible, but in tests and fuzzing, this should be a panic. - #[cfg(any(test, fuzzing))] + #[cfg(debug_assertions)] panic!("ChannelManager generated a channel update for a channel that was not yet registered!"); - #[cfg(not(any(test, fuzzing)))] - ChannelMonitorUpdateStatus::PermanentFailure + #[cfg(not(debug_assertions))] + ChannelMonitorUpdateStatus::InProgress }, Some(monitor_state) => { let monitor = &monitor_state.monitor; @@ -745,18 +735,14 @@ where C::Target: chain::Filter, pending_monitor_updates.push(update_id); log_debug!(self.logger, "Persistence of ChannelMonitorUpdate for channel {} in progress", log_funding_info!(monitor)); }, - ChannelMonitorUpdateStatus::PermanentFailure => { - monitor_state.channel_perm_failed.store(true, Ordering::Release); - log_error!(self.logger, "Persistence of ChannelMonitorUpdate for channel {} failed", log_funding_info!(monitor)); - }, ChannelMonitorUpdateStatus::Completed => { log_debug!(self.logger, "Persistence of ChannelMonitorUpdate for channel {} completed", log_funding_info!(monitor)); }, } if update_res.is_err() { - ChannelMonitorUpdateStatus::PermanentFailure + ChannelMonitorUpdateStatus::InProgress } else if monitor_state.channel_perm_failed.load(Ordering::Acquire) { - ChannelMonitorUpdateStatus::PermanentFailure + ChannelMonitorUpdateStatus::InProgress } else { persist_res } @@ -831,12 +817,12 @@ impl ChannelMonitor { self.inner.lock().unwrap().counterparty_node_id } - /// Used by ChannelManager deserialization to broadcast the latest holder state if its copy of - /// the Channel was out-of-date. + /// Used by [`ChannelManager`] deserialization to broadcast the latest holder state if its copy + /// of the channel state was out-of-date. 
/// /// You may also use this to broadcast the latest local commitment transaction, either because - /// a monitor update failed with [`ChannelMonitorUpdateStatus::PermanentFailure`] or because we've - /// fallen behind (i.e. we've received proof that our counterparty side knows a revocation - /// secret we gave them that they shouldn't know). + /// a monitor update failed or because we've fallen behind (i.e. we've received proof that our + /// counterparty side knows a revocation secret we gave them that they shouldn't know). /// /// Broadcasting these transactions in the second case is UNSAFE, as they allow counterparty /// side to punish you. Nevertheless you may want to broadcast them if counterparty doesn't /// close channel with their commitment transaction after a substantial amount of time. Best /// may be to contact the other node operator out-of-band to coordinate other options available - /// to you. In any-case, the choice is up to you. + /// to you. /// - /// [`ChannelMonitorUpdateStatus::PermanentFailure`]: super::ChannelMonitorUpdateStatus::PermanentFailure + /// [`ChannelManager`]: crate::ln::channelmanager::ChannelManager pub fn get_latest_holder_commitment_txn(&self, logger: &L) -> Vec where L::Target: Logger { self.inner.lock().unwrap().get_latest_holder_commitment_txn(logger) @@ -2599,6 +2595,7 @@ impl ChannelMonitorImpl { ChannelMonitorUpdateStep::CommitmentSecret { idx, secret } => { log_trace!(logger, "Updating ChannelMonitor with commitment secret"); if let Err(e) = self.provide_secret(*idx, *secret) { + debug_assert!(false, "Latest counterparty commitment secret was invalid"); log_error!(logger, "Providing latest counterparty commitment secret failed/was refused:"); log_error!(logger, " {}", e); ret = Err(()); @@ -4413,13 +4410,12 @@ mod tests { use crate::chain::chaininterface::LowerBoundedFeeEstimator; use super::ChannelMonitorUpdateStep; - use crate::{check_added_monitors, check_closed_broadcast, check_closed_event, check_spends, get_local_commitment_txn, get_monitor, get_route_and_payment_hash, unwrap_send_err}; + use crate::{check_added_monitors, check_spends, get_local_commitment_txn, get_monitor, get_route_and_payment_hash, unwrap_send_err}; use crate::chain::{BestBlock, Confirm}; use crate::chain::channelmonitor::ChannelMonitor; use crate::chain::package::{weight_offered_htlc, weight_received_htlc, weight_revoked_offered_htlc, weight_revoked_received_htlc, WEIGHT_REVOKED_OUTPUT}; use crate::chain::transaction::OutPoint; use crate::sign::InMemorySigner; - use crate::events::ClosureReason; use crate::ln::{PaymentPreimage, PaymentHash}; use crate::ln::chan_utils; use crate::ln::chan_utils::{HTLCOutputInCommitment, ChannelPublicKeys, ChannelTransactionParameters, HolderCommitmentTransaction, CounterpartyChannelTransactionParameters}; @@ -4485,18 +4481,14 @@ mod tests { let (route, payment_hash, _, payment_secret) = get_route_and_payment_hash!(nodes[1], nodes[0], 100_000); unwrap_send_err!(nodes[1].node.send_payment_with_route(&route, payment_hash, RecipientOnionFields::secret_only(payment_secret), PaymentId(payment_hash.0) - ), true, APIError::ChannelUnavailable { ref err }, - assert!(err.contains("ChannelMonitor storage failure"))); - check_added_monitors!(nodes[1], 2); // After the failure we generate a close-channel monitor update - check_closed_broadcast!(nodes[1], true); - check_closed_event!(nodes[1], 1, ClosureReason::ProcessingError { err: "ChannelMonitor storage failure".to_string() }, - [nodes[0].node.get_our_node_id()], 100000); + ), false, 
APIError::MonitorUpdateInProgress, {}); + check_added_monitors!(nodes[1], 1); // Build a new ChannelMonitorUpdate which contains both the failing commitment tx update // and provides the claim preimages for the two pending HTLCs. The first update generates // an error, but the point of this test is to ensure the later updates are still applied. let monitor_updates = nodes[1].chain_monitor.monitor_updates.lock().unwrap(); - let mut replay_update = monitor_updates.get(&channel.2).unwrap().iter().rev().skip(1).next().unwrap().clone(); + let mut replay_update = monitor_updates.get(&channel.2).unwrap().iter().rev().next().unwrap().clone(); assert_eq!(replay_update.updates.len(), 1); if let ChannelMonitorUpdateStep::LatestCounterpartyCommitmentTXInfo { .. } = replay_update.updates[0] { } else { panic!(); } diff --git a/lightning/src/chain/mod.rs b/lightning/src/chain/mod.rs index 236b10a7b..736da139a 100644 --- a/lightning/src/chain/mod.rs +++ b/lightning/src/chain/mod.rs @@ -192,10 +192,6 @@ pub enum ChannelMonitorUpdateStatus { /// have been successfully applied, a [`MonitorEvent::Completed`] can be used to restore the /// channel to an operational state. /// - /// Note that a given [`ChannelManager`] will *never* re-generate a [`ChannelMonitorUpdate`]. - /// If you return this error you must ensure that it is written to disk safely before writing - /// the latest [`ChannelManager`] state, or you should return [`PermanentFailure`] instead. - /// /// Even when a channel has been "frozen", updates to the [`ChannelMonitor`] can continue to /// occur (e.g. if an inbound HTLC which we forwarded was claimed upstream, resulting in us /// attempting to claim it on this channel) and those updates must still be persisted. @@ -208,49 +204,8 @@ pub enum ChannelMonitorUpdateStatus { /// remote location (with local copies persisted immediately), it is anticipated that all /// updates will return [`InProgress`] until the remote copies could be updated. /// - /// [`PermanentFailure`]: ChannelMonitorUpdateStatus::PermanentFailure /// [`InProgress`]: ChannelMonitorUpdateStatus::InProgress - /// [`ChannelManager`]: crate::ln::channelmanager::ChannelManager InProgress, - /// Used to indicate no further channel monitor updates will be allowed (likely a disk failure - /// or a remote copy of this [`ChannelMonitor`] is no longer reachable and thus not updatable). - /// - /// When this is returned, [`ChannelManager`] will force-close the channel but *not* broadcast - /// our current commitment transaction. This avoids a dangerous case where a local disk failure - /// (e.g. the Linux-default remounting of the disk as read-only) causes [`PermanentFailure`]s - /// for all monitor updates. If we were to broadcast our latest commitment transaction and then - /// restart, we could end up reading a previous [`ChannelMonitor`] and [`ChannelManager`], - /// revoking our now-broadcasted state before seeing it confirm and losing all our funds. - /// - /// Note that this is somewhat of a tradeoff - if the disk is really gone and we may have lost - /// the data permanently, we really should broadcast immediately. If the data can be recovered - /// with manual intervention, we'd rather close the channel, rejecting future updates to it, - /// and broadcast the latest state only if we have HTLCs to claim which are timing out (which - /// we do as long as blocks are connected). 
- /// - /// In order to broadcast the latest local commitment transaction, you'll need to call - /// [`ChannelMonitor::get_latest_holder_commitment_txn`] and broadcast the resulting - /// transactions once you've safely ensured no further channel updates can be generated by your - /// [`ChannelManager`]. - /// - /// Note that at least one final [`ChannelMonitorUpdate`] may still be provided, which must - /// still be processed by a running [`ChannelMonitor`]. This final update will mark the - /// [`ChannelMonitor`] as finalized, ensuring no further updates (e.g. revocation of the latest - /// commitment transaction) are allowed. - /// - /// Note that even if you return a [`PermanentFailure`] due to unavailability of secondary - /// [`ChannelMonitor`] copies, you should still make an attempt to store the update where - /// possible to ensure you can claim HTLC outputs on the latest commitment transaction - /// broadcasted later. - /// - /// In case of distributed watchtowers deployment, the new version must be written to disk, as - /// state may have been stored but rejected due to a block forcing a commitment broadcast. This - /// storage is used to claim outputs of rejected state confirmed onchain by another watchtower, - /// lagging behind on block processing. - /// - /// [`PermanentFailure`]: ChannelMonitorUpdateStatus::PermanentFailure - /// [`ChannelManager`]: crate::ln::channelmanager::ChannelManager - PermanentFailure, } /// The `Watch` trait defines behavior for watching on-chain activity pertaining to channels as @@ -262,16 +217,13 @@ pub enum ChannelMonitorUpdateStatus { /// requirements. /// /// Implementations **must** ensure that updates are successfully applied and persisted upon method -/// completion. If an update fails with a [`PermanentFailure`], then it must immediately shut down -/// without taking any further action such as persisting the current state. +/// completion. If an update will not succeed, then it must immediately shut down. /// /// If an implementation maintains multiple instances of a channel's monitor (e.g., by storing /// backup copies), then it must ensure that updates are applied across all instances. Otherwise, it /// could result in a revoked transaction being broadcast, allowing the counterparty to claim all /// funds in the channel. See [`ChannelMonitorUpdateStatus`] for more details about how to handle /// multiple instances. -/// -/// [`PermanentFailure`]: ChannelMonitorUpdateStatus::PermanentFailure pub trait Watch { /// Watches a channel identified by `funding_txo` using `monitor`. /// @@ -279,20 +231,30 @@ pub trait Watch { /// with any spends of outputs returned by [`get_outputs_to_watch`]. In practice, this means /// calling [`block_connected`] and [`block_disconnected`] on the monitor. /// - /// Note: this interface MUST error with [`ChannelMonitorUpdateStatus::PermanentFailure`] if - /// the given `funding_txo` has previously been registered via `watch_channel`. + /// A return of `Err(())` indicates that the channel should immediately be force-closed without + /// broadcasting the funding transaction. + /// + /// If the given `funding_txo` has previously been registered via `watch_channel`, `Err(())` + /// must be returned. 
/// /// [`get_outputs_to_watch`]: channelmonitor::ChannelMonitor::get_outputs_to_watch /// [`block_connected`]: channelmonitor::ChannelMonitor::block_connected /// [`block_disconnected`]: channelmonitor::ChannelMonitor::block_disconnected - fn watch_channel(&self, funding_txo: OutPoint, monitor: ChannelMonitor) -> ChannelMonitorUpdateStatus; + fn watch_channel(&self, funding_txo: OutPoint, monitor: ChannelMonitor) -> Result; /// Updates a channel identified by `funding_txo` by applying `update` to its monitor. /// - /// Implementations must call [`update_monitor`] with the given update. See - /// [`ChannelMonitorUpdateStatus`] for invariants around returning an error. + /// Implementations must call [`ChannelMonitor::update_monitor`] with the given update. This + /// may fail (returning an `Err(())`), in which case this should return + /// [`ChannelMonitorUpdateStatus::InProgress`] (and the update should never complete). This + /// generally implies the channel has been closed (either by the funding outpoint being spent + /// on-chain or the [`ChannelMonitor`] having decided to do so and broadcasted a transaction), + /// and the [`ChannelManager`] state will be updated once it sees the funding spend on-chain. + /// + /// If persistence fails, this should return [`ChannelMonitorUpdateStatus::InProgress`] and + /// the node should shut down immediately. /// - /// [`update_monitor`]: channelmonitor::ChannelMonitor::update_monitor + /// [`ChannelManager`]: crate::ln::channelmanager::ChannelManager fn update_channel(&self, funding_txo: OutPoint, update: &ChannelMonitorUpdate) -> ChannelMonitorUpdateStatus; /// Returns any monitor events since the last call. Subsequent calls must only return new diff --git a/lightning/src/ln/chanmon_update_fail_tests.rs b/lightning/src/ln/chanmon_update_fail_tests.rs index 33f4bbc8c..dc9bbd3e4 100644 --- a/lightning/src/ln/chanmon_update_fail_tests.rs +++ b/lightning/src/ln/chanmon_update_fail_tests.rs @@ -37,43 +37,6 @@ use bitcoin::hashes::Hash; use crate::prelude::*; use crate::sync::{Arc, Mutex}; -#[test] -fn test_simple_monitor_permanent_update_fail() { - // Test that we handle a simple permanent monitor update failure - let chanmon_cfgs = create_chanmon_cfgs(2); - let node_cfgs = create_node_cfgs(2, &chanmon_cfgs); - let node_chanmgrs = create_node_chanmgrs(2, &node_cfgs, &[None, None]); - let mut nodes = create_network(2, &node_cfgs, &node_chanmgrs); - create_announced_chan_between_nodes(&nodes, 0, 1); - - let (route, payment_hash_1, _, payment_secret_1) = get_route_and_payment_hash!(&nodes[0], nodes[1], 1000000); - chanmon_cfgs[0].persister.set_update_ret(ChannelMonitorUpdateStatus::PermanentFailure); - unwrap_send_err!(nodes[0].node.send_payment_with_route(&route, payment_hash_1, - RecipientOnionFields::secret_only(payment_secret_1), PaymentId(payment_hash_1.0) - ), true, APIError::ChannelUnavailable {..}, {}); - check_added_monitors!(nodes[0], 2); - - let events_1 = nodes[0].node.get_and_clear_pending_msg_events(); - assert_eq!(events_1.len(), 2); - match events_1[0] { - MessageSendEvent::BroadcastChannelUpdate { .. } => {}, - _ => panic!("Unexpected event"), - }; - match events_1[1] { - MessageSendEvent::HandleError { node_id, .. 
} => assert_eq!(node_id, nodes[1].node.get_our_node_id()), - _ => panic!("Unexpected event"), - }; - - assert!(nodes[0].tx_broadcaster.txn_broadcasted.lock().unwrap().is_empty()); - - // TODO: Once we hit the chain with the failure transaction we should check that we get a - // PaymentPathFailed event - - assert_eq!(nodes[0].node.list_channels().len(), 0); - check_closed_event!(nodes[0], 1, ClosureReason::ProcessingError { err: "ChannelMonitor storage failure".to_string() }, - [nodes[1].node.get_our_node_id()], 100000); -} - #[test] fn test_monitor_and_persister_update_fail() { // Test that if both updating the `ChannelMonitor` and persisting the updated @@ -117,14 +80,11 @@ fn test_monitor_and_persister_update_fail() { new_monitor }; let chain_mon = test_utils::TestChainMonitor::new(Some(&chain_source), &tx_broadcaster, &logger, &chanmon_cfgs[0].fee_estimator, &persister, &node_cfgs[0].keys_manager); - assert_eq!(chain_mon.watch_channel(outpoint, new_monitor), ChannelMonitorUpdateStatus::Completed); + assert_eq!(chain_mon.watch_channel(outpoint, new_monitor), Ok(ChannelMonitorUpdateStatus::Completed)); chain_mon }; chain_mon.chain_monitor.block_connected(&create_dummy_block(BlockHash::all_zeros(), 42, Vec::new()), 200); - // Set the persister's return value to be a InProgress. - persister.set_update_ret(ChannelMonitorUpdateStatus::InProgress); - // Try to update ChannelMonitor nodes[1].node.claim_funds(preimage); expect_payment_claimed!(nodes[1], payment_hash, 9_000_000); @@ -133,17 +93,21 @@ fn test_monitor_and_persister_update_fail() { let updates = get_htlc_update_msgs!(nodes[1], nodes[0].node.get_our_node_id()); assert_eq!(updates.update_fulfill_htlcs.len(), 1); nodes[0].node.handle_update_fulfill_htlc(&nodes[1].node.get_our_node_id(), &updates.update_fulfill_htlcs[0]); + { let mut node_0_per_peer_lock; let mut node_0_peer_state_lock; if let ChannelPhase::Funded(ref mut channel) = get_channel_ref!(nodes[0], nodes[1], node_0_per_peer_lock, node_0_peer_state_lock, chan.2) { if let Ok(Some(update)) = channel.commitment_signed(&updates.commitment_signed, &node_cfgs[0].logger) { - // Check that even though the persister is returning a InProgress, - // because the update is bogus, ultimately the error that's returned - // should be a PermanentFailure. - if let ChannelMonitorUpdateStatus::PermanentFailure = chain_mon.chain_monitor.update_channel(outpoint, &update) {} else { panic!("Expected monitor error to be permanent"); } - logger.assert_log_regex("lightning::chain::chainmonitor", regex::Regex::new("Persistence of ChannelMonitorUpdate for channel [0-9a-f]* in progress").unwrap(), 1); - assert_eq!(nodes[0].chain_monitor.update_channel(outpoint, &update), ChannelMonitorUpdateStatus::Completed); + // Check that the persister returns InProgress (and will never actually complete) + // as the monitor update errors. + if let ChannelMonitorUpdateStatus::InProgress = chain_mon.chain_monitor.update_channel(outpoint, &update) {} else { panic!("Expected monitor paused"); } + logger.assert_log_regex("lightning::chain::chainmonitor", regex::Regex::new("Failed to update ChannelMonitor for channel [0-9a-f]*.").unwrap(), 1); + + // Apply the monitor update to the original ChainMonitor, ensuring the + // ChannelManager and ChannelMonitor aren't out of sync. 
+ assert_eq!(nodes[0].chain_monitor.update_channel(outpoint, &update), + ChannelMonitorUpdateStatus::Completed); } else { assert!(false); } } else { assert!(false); @@ -151,8 +115,7 @@ fn test_monitor_and_persister_update_fail() { } check_added_monitors!(nodes[0], 1); - let events = nodes[0].node.get_and_clear_pending_events(); - assert_eq!(events.len(), 1); + expect_payment_sent(&nodes[0], preimage, None, false, false); } fn do_test_simple_monitor_temporary_update_fail(disconnect: bool) { @@ -2675,68 +2638,6 @@ fn test_temporary_error_during_shutdown() { check_closed_event!(nodes[0], 1, ClosureReason::CooperativeClosure, [nodes[1].node.get_our_node_id()], 100000); } -#[test] -fn test_permanent_error_during_sending_shutdown() { - // Test that permanent failures when updating the monitor's shutdown script result in a force - // close when initiating a cooperative close. - let mut config = test_default_channel_config(); - config.channel_handshake_config.commit_upfront_shutdown_pubkey = false; - - let chanmon_cfgs = create_chanmon_cfgs(2); - let node_cfgs = create_node_cfgs(2, &chanmon_cfgs); - let node_chanmgrs = create_node_chanmgrs(2, &node_cfgs, &[Some(config), None]); - let mut nodes = create_network(2, &node_cfgs, &node_chanmgrs); - - let channel_id = create_announced_chan_between_nodes(&nodes, 0, 1).2; - chanmon_cfgs[0].persister.set_update_ret(ChannelMonitorUpdateStatus::PermanentFailure); - - assert!(nodes[0].node.close_channel(&channel_id, &nodes[1].node.get_our_node_id()).is_ok()); - - // We always send the `shutdown` response when initiating a shutdown, even if we immediately - // close the channel thereafter. - let msg_events = nodes[0].node.get_and_clear_pending_msg_events(); - assert_eq!(msg_events.len(), 3); - if let MessageSendEvent::SendShutdown { .. } = msg_events[0] {} else { panic!(); } - if let MessageSendEvent::BroadcastChannelUpdate { .. } = msg_events[1] {} else { panic!(); } - if let MessageSendEvent::HandleError { .. } = msg_events[2] {} else { panic!(); } - - check_added_monitors!(nodes[0], 2); - check_closed_event!(nodes[0], 1, ClosureReason::ProcessingError { err: "ChannelMonitor storage failure".to_string() }, - [nodes[1].node.get_our_node_id()], 100000); -} - -#[test] -fn test_permanent_error_during_handling_shutdown() { - // Test that permanent failures when updating the monitor's shutdown script result in a force - // close when handling a cooperative close. - let mut config = test_default_channel_config(); - config.channel_handshake_config.commit_upfront_shutdown_pubkey = false; - - let chanmon_cfgs = create_chanmon_cfgs(2); - let node_cfgs = create_node_cfgs(2, &chanmon_cfgs); - let node_chanmgrs = create_node_chanmgrs(2, &node_cfgs, &[None, Some(config)]); - let mut nodes = create_network(2, &node_cfgs, &node_chanmgrs); - - let channel_id = create_announced_chan_between_nodes(&nodes, 0, 1).2; - chanmon_cfgs[1].persister.set_update_ret(ChannelMonitorUpdateStatus::PermanentFailure); - - assert!(nodes[0].node.close_channel(&channel_id, &nodes[1].node.get_our_node_id()).is_ok()); - let shutdown = get_event_msg!(nodes[0], MessageSendEvent::SendShutdown, nodes[1].node.get_our_node_id()); - nodes[1].node.handle_shutdown(&nodes[0].node.get_our_node_id(), &shutdown); - - // We always send the `shutdown` response when receiving a shutdown, even if we immediately - // close the channel thereafter. - let msg_events = nodes[1].node.get_and_clear_pending_msg_events(); - assert_eq!(msg_events.len(), 3); - if let MessageSendEvent::SendShutdown { .. 
} = msg_events[0] {} else { panic!(); } - if let MessageSendEvent::BroadcastChannelUpdate { .. } = msg_events[1] {} else { panic!(); } - if let MessageSendEvent::HandleError { .. } = msg_events[2] {} else { panic!(); } - - check_added_monitors!(nodes[1], 2); - check_closed_event!(nodes[1], 1, ClosureReason::ProcessingError { err: "ChannelMonitor storage failure".to_string() }, - [nodes[0].node.get_our_node_id()], 100000); -} - #[test] fn double_temp_error() { // Test that it's OK to have multiple `ChainMonitor::update_channel` calls fail in a row. diff --git a/lightning/src/ln/channelmanager.rs b/lightning/src/ln/channelmanager.rs index 0cadbd41a..35735c389 100644 --- a/lightning/src/ln/channelmanager.rs +++ b/lightning/src/ln/channelmanager.rs @@ -2051,17 +2051,6 @@ macro_rules! handle_new_monitor_update { &$chan.context.channel_id()); Ok(false) }, - ChannelMonitorUpdateStatus::PermanentFailure => { - log_error!($self.logger, "Closing channel {} due to monitor update ChannelMonitorUpdateStatus::PermanentFailure", - &$chan.context.channel_id()); - update_maps_on_chan_removal!($self, &$chan.context); - let res = Err(MsgHandleErrInternal::from_finish_shutdown( - "ChannelMonitor storage failure".to_owned(), $chan.context.channel_id(), - $chan.context.get_user_id(), $chan.context.force_shutdown(false), - $self.get_channel_update_for_broadcast(&$chan).ok(), $chan.context.get_value_satoshis())); - $remove; - res - }, ChannelMonitorUpdateStatus::Completed => { $completed; Ok(true) @@ -5919,47 +5908,55 @@ where Err(MsgHandleErrInternal::send_err_msg_no_close("Already had channel with the new channel_id".to_owned(), funding_msg.channel_id)) }, hash_map::Entry::Vacant(e) => { - match self.id_to_peer.lock().unwrap().entry(chan.context.channel_id()) { + let mut id_to_peer_lock = self.id_to_peer.lock().unwrap(); + match id_to_peer_lock.entry(chan.context.channel_id()) { hash_map::Entry::Occupied(_) => { return Err(MsgHandleErrInternal::send_err_msg_no_close( "The funding_created message had the same funding_txid as an existing channel - funding is not possible".to_owned(), funding_msg.channel_id)) }, hash_map::Entry::Vacant(i_e) => { - i_e.insert(chan.context.get_counterparty_node_id()); - } - } - - // There's no problem signing a counterparty's funding transaction if our monitor - // hasn't persisted to disk yet - we can't lose money on a transaction that we haven't - // accepted payment from yet. We do, however, need to wait to send our channel_ready - // until we have persisted our monitor. - let new_channel_id = funding_msg.channel_id; - peer_state.pending_msg_events.push(events::MessageSendEvent::SendFundingSigned { - node_id: counterparty_node_id.clone(), - msg: funding_msg, - }); + let monitor_res = self.chain_monitor.watch_channel(monitor.get_funding_txo().0, monitor); + if let Ok(persist_state) = monitor_res { + i_e.insert(chan.context.get_counterparty_node_id()); + mem::drop(id_to_peer_lock); + + // There's no problem signing a counterparty's funding transaction if our monitor + // hasn't persisted to disk yet - we can't lose money on a transaction that we haven't + // accepted payment from yet. We do, however, need to wait to send our channel_ready + // until we have persisted our monitor. 
+ let new_channel_id = funding_msg.channel_id; + peer_state.pending_msg_events.push(events::MessageSendEvent::SendFundingSigned { + node_id: counterparty_node_id.clone(), + msg: funding_msg, + }); - let monitor_res = self.chain_monitor.watch_channel(monitor.get_funding_txo().0, monitor); - - if let ChannelPhase::Funded(chan) = e.insert(ChannelPhase::Funded(chan)) { - let mut res = handle_new_monitor_update!(self, monitor_res, peer_state_lock, peer_state, - per_peer_state, chan, MANUALLY_REMOVING_INITIAL_MONITOR, - { peer_state.channel_by_id.remove(&new_channel_id) }); - - // Note that we reply with the new channel_id in error messages if we gave up on the - // channel, not the temporary_channel_id. This is compatible with ourselves, but the - // spec is somewhat ambiguous here. Not a huge deal since we'll send error messages for - // any messages referencing a previously-closed channel anyway. - // We do not propagate the monitor update to the user as it would be for a monitor - // that we didn't manage to store (and that we don't care about - we don't respond - // with the funding_signed so the channel can never go on chain). - if let Err(MsgHandleErrInternal { shutdown_finish: Some((res, _)), .. }) = &mut res { - res.0 = None; + if let ChannelPhase::Funded(chan) = e.insert(ChannelPhase::Funded(chan)) { + let mut res = handle_new_monitor_update!(self, persist_state, peer_state_lock, peer_state, + per_peer_state, chan, MANUALLY_REMOVING_INITIAL_MONITOR, + { peer_state.channel_by_id.remove(&new_channel_id) }); + + // Note that we reply with the new channel_id in error messages if we gave up on the + // channel, not the temporary_channel_id. This is compatible with ourselves, but the + // spec is somewhat ambiguous here. Not a huge deal since we'll send error messages for + // any messages referencing a previously-closed channel anyway. + // We do not propagate the monitor update to the user as it would be for a monitor + // that we didn't manage to store (and that we don't care about - we don't respond + // with the funding_signed so the channel can never go on chain). + if let Err(MsgHandleErrInternal { shutdown_finish: Some((res, _)), .. }) = &mut res { + res.0 = None; + } + res.map(|_| ()) + } else { + unreachable!("This must be a funded channel as we just inserted it."); + } + } else { + log_error!(self.logger, "Persisting initial ChannelMonitor failed, implying the funding outpoint was duplicated"); + return Err(MsgHandleErrInternal::send_err_msg_no_close( + "The funding_created message had the same funding_txid as an existing channel - funding is not possible".to_owned(), + funding_msg.channel_id)); + } } - res.map(|_| ()) - } else { - unreachable!("This must be a funded channel as we just inserted it."); } } } @@ -5982,17 +5979,20 @@ where ChannelPhase::Funded(ref mut chan) => { let monitor = try_chan_phase_entry!(self, chan.funding_signed(&msg, best_block, &self.signer_provider, &self.logger), chan_phase_entry); - let update_res = self.chain_monitor.watch_channel(chan.context.get_funding_txo().unwrap(), monitor); - let mut res = handle_new_monitor_update!(self, update_res, peer_state_lock, peer_state, per_peer_state, chan_phase_entry, INITIAL_MONITOR); - if let Err(MsgHandleErrInternal { ref mut shutdown_finish, .. }) = res { - // We weren't able to watch the channel to begin with, so no updates should be made on - // it. Previously, full_stack_target found an (unreachable) panic when the - // monitor update contained within `shutdown_finish` was applied. 
- if let Some((ref mut shutdown_finish, _)) = shutdown_finish { - shutdown_finish.0.take(); + if let Ok(persist_status) = self.chain_monitor.watch_channel(chan.context.get_funding_txo().unwrap(), monitor) { + let mut res = handle_new_monitor_update!(self, persist_status, peer_state_lock, peer_state, per_peer_state, chan_phase_entry, INITIAL_MONITOR); + if let Err(MsgHandleErrInternal { ref mut shutdown_finish, .. }) = res { + // We weren't able to watch the channel to begin with, so no updates should be made on + // it. Previously, full_stack_target found an (unreachable) panic when the + // monitor update contained within `shutdown_finish` was applied. + if let Some((ref mut shutdown_finish, _)) = shutdown_finish { + shutdown_finish.0.take(); + } } + res.map(|_| ()) + } else { + try_chan_phase_entry!(self, Err(ChannelError::Close("Channel funding outpoint was a duplicate".to_owned())), chan_phase_entry) } - res.map(|_| ()) }, _ => { return Err(MsgHandleErrInternal::send_err_msg_no_close("Failed to find corresponding channel".to_owned(), msg.channel_id)); diff --git a/lightning/src/ln/functional_test_utils.rs b/lightning/src/ln/functional_test_utils.rs index f6684485d..2c4c3c0c2 100644 --- a/lightning/src/ln/functional_test_utils.rs +++ b/lightning/src/ln/functional_test_utils.rs @@ -578,7 +578,7 @@ impl<'a, 'b, 'c> Drop for Node<'a, 'b, 'c> { let chain_source = test_utils::TestChainSource::new(Network::Testnet); let chain_monitor = test_utils::TestChainMonitor::new(Some(&chain_source), &broadcaster, &self.logger, &feeest, &persister, &self.keys_manager); for deserialized_monitor in deserialized_monitors.drain(..) { - if chain_monitor.watch_channel(deserialized_monitor.get_funding_txo().0, deserialized_monitor) != ChannelMonitorUpdateStatus::Completed { + if chain_monitor.watch_channel(deserialized_monitor.get_funding_txo().0, deserialized_monitor) != Ok(ChannelMonitorUpdateStatus::Completed) { panic!(); } } @@ -977,7 +977,7 @@ pub fn _reload_node<'a, 'b, 'c>(node: &'a Node<'a, 'b, 'c>, default_config: User for monitor in monitors_read.drain(..) 
{ assert_eq!(node.chain_monitor.watch_channel(monitor.get_funding_txo().0, monitor), - ChannelMonitorUpdateStatus::Completed); + Ok(ChannelMonitorUpdateStatus::Completed)); check_added_monitors!(node, 1); } diff --git a/lightning/src/ln/functional_tests.rs b/lightning/src/ln/functional_tests.rs index b252a352c..cf27d6787 100644 --- a/lightning/src/ln/functional_tests.rs +++ b/lightning/src/ln/functional_tests.rs @@ -2419,7 +2419,7 @@ fn channel_monitor_network_test() { assert_eq!(nodes[4].node.list_channels().len(), 0); assert_eq!(nodes[3].chain_monitor.chain_monitor.watch_channel(OutPoint { txid: chan_3.3.txid(), index: 0 }, chan_3_mon), - ChannelMonitorUpdateStatus::Completed); + Ok(ChannelMonitorUpdateStatus::Completed)); check_closed_event!(nodes[3], 1, ClosureReason::CommitmentTxConfirmed, [nodes[4].node.get_our_node_id()], 100000); check_closed_event!(nodes[4], 1, ClosureReason::CommitmentTxConfirmed, [nodes[3].node.get_our_node_id()], 100000); } @@ -8453,7 +8453,7 @@ fn test_update_err_monitor_lockdown() { new_monitor }; let watchtower = test_utils::TestChainMonitor::new(Some(&chain_source), &chanmon_cfgs[0].tx_broadcaster, &logger, &chanmon_cfgs[0].fee_estimator, &persister, &node_cfgs[0].keys_manager); - assert_eq!(watchtower.watch_channel(outpoint, new_monitor), ChannelMonitorUpdateStatus::Completed); + assert_eq!(watchtower.watch_channel(outpoint, new_monitor), Ok(ChannelMonitorUpdateStatus::Completed)); watchtower }; let block = create_dummy_block(BlockHash::all_zeros(), 42, Vec::new()); @@ -8475,7 +8475,7 @@ fn test_update_err_monitor_lockdown() { let mut node_0_peer_state_lock; if let ChannelPhase::Funded(ref mut channel) = get_channel_ref!(nodes[0], nodes[1], node_0_per_peer_lock, node_0_peer_state_lock, chan_1.2) { if let Ok(Some(update)) = channel.commitment_signed(&updates.commitment_signed, &node_cfgs[0].logger) { - assert_eq!(watchtower.chain_monitor.update_channel(outpoint, &update), ChannelMonitorUpdateStatus::PermanentFailure); + assert_eq!(watchtower.chain_monitor.update_channel(outpoint, &update), ChannelMonitorUpdateStatus::InProgress); assert_eq!(nodes[0].chain_monitor.update_channel(outpoint, &update), ChannelMonitorUpdateStatus::Completed); } else { assert!(false); } } else { @@ -8526,7 +8526,7 @@ fn test_concurrent_monitor_claim() { new_monitor }; let watchtower = test_utils::TestChainMonitor::new(Some(&chain_source), &alice_broadcaster, &logger, &chanmon_cfgs[0].fee_estimator, &persister, &node_cfgs[0].keys_manager); - assert_eq!(watchtower.watch_channel(outpoint, new_monitor), ChannelMonitorUpdateStatus::Completed); + assert_eq!(watchtower.watch_channel(outpoint, new_monitor), Ok(ChannelMonitorUpdateStatus::Completed)); watchtower }; let block = create_dummy_block(BlockHash::all_zeros(), 42, Vec::new()); @@ -8557,7 +8557,7 @@ fn test_concurrent_monitor_claim() { new_monitor }; let watchtower = test_utils::TestChainMonitor::new(Some(&chain_source), &bob_broadcaster, &logger, &chanmon_cfgs[0].fee_estimator, &persister, &node_cfgs[0].keys_manager); - assert_eq!(watchtower.watch_channel(outpoint, new_monitor), ChannelMonitorUpdateStatus::Completed); + assert_eq!(watchtower.watch_channel(outpoint, new_monitor), Ok(ChannelMonitorUpdateStatus::Completed)); watchtower }; watchtower_bob.chain_monitor.block_connected(&create_dummy_block(BlockHash::all_zeros(), 42, Vec::new()), HTLC_TIMEOUT_BROADCAST - 1); @@ -8577,7 +8577,7 @@ fn test_concurrent_monitor_claim() { if let ChannelPhase::Funded(ref mut channel) = get_channel_ref!(nodes[0], nodes[1], node_0_per_peer_lock, 
node_0_peer_state_lock, chan_1.2) { if let Ok(Some(update)) = channel.commitment_signed(&updates.commitment_signed, &node_cfgs[0].logger) { // Watchtower Alice should already have seen the block and reject the update - assert_eq!(watchtower_alice.chain_monitor.update_channel(outpoint, &update), ChannelMonitorUpdateStatus::PermanentFailure); + assert_eq!(watchtower_alice.chain_monitor.update_channel(outpoint, &update), ChannelMonitorUpdateStatus::InProgress); assert_eq!(watchtower_bob.chain_monitor.update_channel(outpoint, &update), ChannelMonitorUpdateStatus::Completed); assert_eq!(nodes[0].chain_monitor.update_channel(outpoint, &update), ChannelMonitorUpdateStatus::Completed); } else { assert!(false); } diff --git a/lightning/src/ln/reload_tests.rs b/lightning/src/ln/reload_tests.rs index c3a428ae0..fa08fba99 100644 --- a/lightning/src/ln/reload_tests.rs +++ b/lightning/src/ln/reload_tests.rs @@ -448,7 +448,7 @@ fn test_manager_serialize_deserialize_inconsistent_monitor() { for monitor in node_0_monitors.drain(..) { assert_eq!(nodes[0].chain_monitor.watch_channel(monitor.get_funding_txo().0, monitor), - ChannelMonitorUpdateStatus::Completed); + Ok(ChannelMonitorUpdateStatus::Completed)); check_added_monitors!(nodes[0], 1); } nodes[0].node = &nodes_0_deserialized; diff --git a/lightning/src/util/persist.rs b/lightning/src/util/persist.rs index 372a094a9..af66e1233 100644 --- a/lightning/src/util/persist.rs +++ b/lightning/src/util/persist.rs @@ -174,8 +174,8 @@ impl<'a, A: KVStore, M: Deref, T: Deref, ES: Deref, NS: Deref, SP: Deref, F: Der impl Persist for K { // TODO: We really need a way for the persister to inform the user that its time to crash/shut // down once these start returning failure. - // A PermanentFailure implies we should probably just shut down the node since we're - // force-closing channels without even broadcasting! + // An InProgress result implies we should probably just shut down the node since we're not + // retrying persistence! fn persist_new_channel(&self, funding_txo: OutPoint, monitor: &ChannelMonitor, _update_id: MonitorUpdateId) -> chain::ChannelMonitorUpdateStatus { let key = format!("{}_{}", funding_txo.txid.to_hex(), funding_txo.index); @@ -185,7 +185,7 @@ impl Persist chain::ChannelMonitorUpdateStatus::Completed, - Err(_) => chain::ChannelMonitorUpdateStatus::PermanentFailure, + Err(_) => chain::ChannelMonitorUpdateStatus::InProgress } } @@ -197,7 +197,7 @@ impl Persist chain::ChannelMonitorUpdateStatus::Completed, - Err(_) => chain::ChannelMonitorUpdateStatus::PermanentFailure, + Err(_) => chain::ChannelMonitorUpdateStatus::InProgress } } } diff --git a/lightning/src/util/test_utils.rs b/lightning/src/util/test_utils.rs index 785b40bef..d53cd39b1 100644 --- a/lightning/src/util/test_utils.rs +++ b/lightning/src/util/test_utils.rs @@ -225,7 +225,7 @@ impl<'a> TestChainMonitor<'a> { } } impl<'a> chain::Watch for TestChainMonitor<'a> { - fn watch_channel(&self, funding_txo: OutPoint, monitor: channelmonitor::ChannelMonitor) -> chain::ChannelMonitorUpdateStatus { + fn watch_channel(&self, funding_txo: OutPoint, monitor: channelmonitor::ChannelMonitor) -> Result { // At every point where we get a monitor update, we should be able to send a useful monitor // to a watchtower and disk... let mut w = TestVecWriter(Vec::new()); -- 2.39.5
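Under the semantics introduced by this patch, a `Persist` implementation reports a failed monitor write as `ChannelMonitorUpdateStatus::InProgress` rather than `PermanentFailure`: the channel is frozen, not force-closed, until `ChainMonitor::channel_monitor_updated` is called for the completed update. A minimal sketch of that mapping, assuming the post-patch `lightning` API; `StorageError` and the helper name are illustrative placeholders, not part of this change:

use lightning::chain::ChannelMonitorUpdateStatus;

/// Maps the result of writing a serialized `ChannelMonitor` or
/// `ChannelMonitorUpdate` to the status a `Persist` implementation should
/// return. `StorageError` stands in for whatever error type the backing
/// store produces.
fn monitor_write_status<StorageError>(
	write_result: Result<(), StorageError>,
) -> ChannelMonitorUpdateStatus {
	match write_result {
		// The write (including any relevant fsync) completed before returning.
		Ok(()) => ChannelMonitorUpdateStatus::Completed,
		// With `PermanentFailure` removed, an I/O error is surfaced as an
		// update that has not yet completed. The node should shut down or
		// retry the write, then call `ChainMonitor::channel_monitor_updated`
		// once the monitor has actually been persisted.
		Err(_) => ChannelMonitorUpdateStatus::InProgress,
	}
}

This mirrors the change to the `KVStore`-backed `Persist` implementation in `lightning/src/util/persist.rs`, where `Err(_)` now maps to `InProgress` instead of `PermanentFailure`.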
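`Watch::watch_channel` now returns a `Result`, with `Err(())` covering the one registration-time use of `PermanentFailure`: a funding outpoint that is already being watched, in which case the channel is closed without broadcasting the funding transaction. A hedged sketch of how a caller might interpret that result, assuming the return type is `Result<ChannelMonitorUpdateStatus, ()>` (the `Err(())` documentation and the `Ok(ChannelMonitorUpdateStatus::Completed)` assertions in the updated tests point to this); the function name and comments are illustrative only:

use lightning::chain::ChannelMonitorUpdateStatus;

/// Interprets the value returned by the post-patch `Watch::watch_channel`.
fn on_watch_channel_result(res: Result<ChannelMonitorUpdateStatus, ()>) {
	match res {
		// The initial monitor was persisted synchronously; channel setup can
		// proceed (e.g. `funding_signed` can be sent).
		Ok(ChannelMonitorUpdateStatus::Completed) => { /* continue setup */ },
		// Persistence is still pending; the channel stays paused until
		// `ChainMonitor::channel_monitor_updated` reports completion.
		Ok(ChannelMonitorUpdateStatus::InProgress) => { /* hold outbound messages */ },
		// Registration was rejected, e.g. a duplicate funding outpoint. The
		// channel should be failed without broadcasting the funding
		// transaction, replacing the old `PermanentFailure` return.
		Err(()) => { /* close the channel without broadcasting */ },
	}
}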