Ensure all HTLCs for a claimed payment are claimed on startup
author Matt Corallo <git@bluematt.me>
Mon, 18 Apr 2022 15:42:11 +0000 (15:42 +0000)
committer Matt Corallo <git@bluematt.me>
Thu, 26 May 2022 00:53:11 +0000 (00:53 +0000)
While the HTLC-claim process happens across all MPP parts under one
lock, this doesn't imply that they are claimed fully atomically on
disk. Ultimately, an application can crash after persisting one
`ChannelMonitorUpdate` out of multiple monitor updates needed for
the full claim.

Previously, this would leave us in a very bad state - because of
the all-channels-available check in `claim_funds` we'd refuse to
claim the payment again on restart (even though the
`PaymentReceived` event will be passed to the user again), and we'd
end up having partially claimed the payment!

The fix for the consistency part of this issue is pretty
straightforward - just check for this condition on startup and
complete the claim across all channels/`ChannelMonitor`s if we
detect it.

This still leaves us in a confused state from the perspective of
the user, however - we've actually claimed a payment but when they
call `claim_funds` we return `false` indicating it could not be
claimed.

lightning/src/chain/channelmonitor.rs
lightning/src/ln/channel.rs
lightning/src/ln/channelmanager.rs
lightning/src/ln/functional_test_utils.rs
lightning/src/ln/functional_tests.rs

index 738fff3837ca045f5c8c9e3e93e588bb88974191..fd66e585208b6b580e6f981d633fa8c0d94c028e 100644 (file)
@@ -1085,7 +1085,8 @@ impl<Signer: Sign> ChannelMonitor<Signer> {
                self.inner.lock().unwrap().provide_latest_holder_commitment_tx(holder_commitment_tx, htlc_outputs).map_err(|_| ())
        }
 
-       #[cfg(test)]
+       /// This is used to provide payment preimage(s) out-of-band during startup without updating the
+       /// off-chain state with a new commitment transaction.
        pub(crate) fn provide_payment_preimage<B: Deref, F: Deref, L: Deref>(
                &self,
                payment_hash: &PaymentHash,
@@ -1631,6 +1632,10 @@ impl<Signer: Sign> ChannelMonitor<Signer> {
 
                res
        }
+
+       pub(crate) fn get_stored_preimages(&self) -> HashMap<PaymentHash, PaymentPreimage> {
+               self.inner.lock().unwrap().payment_preimages.clone()
+       }
 }
 
 /// Compares a broadcasted commitment transaction's HTLCs with those in the latest state,
index 43032c51a3c09cdc67988e96050835fd7a17d9d7..1d204d18a17ed620b24a228d66856175137ceb5a 100644 (file)
@@ -1703,6 +1703,28 @@ impl<Signer: Sign> Channel<Signer> {
                make_funding_redeemscript(&self.get_holder_pubkeys().funding_pubkey, self.counterparty_funding_pubkey())
        }
 
+       /// Claims an HTLC while we're disconnected from a peer, dropping the ChannelMonitorUpdate
+       /// entirely.
+       ///
+       /// The ChannelMonitor for this channel MUST be updated out-of-band with the preimage provided
+       /// (i.e. without calling [`crate::chain::Watch::update_channel`]).
+       ///
+       /// The HTLC claim will end up in the holding cell (because the caller must ensure the peer is
+       /// disconnected).
+       pub fn claim_htlc_while_disconnected_dropping_mon_update<L: Deref>
+               (&mut self, htlc_id_arg: u64, payment_preimage_arg: PaymentPreimage, logger: &L)
+       where L::Target: Logger {
+               // Assert that we'll add the HTLC claim to the holding cell in `get_update_fulfill_htlc`
+               // (see equivalent if condition there).
+               assert!(self.channel_state & (ChannelState::AwaitingRemoteRevoke as u32 | ChannelState::PeerDisconnected as u32 | ChannelState::MonitorUpdateFailed as u32) != 0);
+               let mon_update_id = self.latest_monitor_update_id; // Forget the ChannelMonitor update
+               let fulfill_resp = self.get_update_fulfill_htlc(htlc_id_arg, payment_preimage_arg, logger);
+               self.latest_monitor_update_id = mon_update_id;
+               if let UpdateFulfillFetch::NewClaim { msg, .. } = fulfill_resp {
+                       assert!(msg.is_none()); // The HTLC must have ended up in the holding cell.
+               }
+       }
+
        fn get_update_fulfill_htlc<L: Deref>(&mut self, htlc_id_arg: u64, payment_preimage_arg: PaymentPreimage, logger: &L) -> UpdateFulfillFetch where L::Target: Logger {
                // Either ChannelFunded got set (which means it won't be unset) or there is no way any
                // caller thought we could have something claimed (cause we wouldn't have accepted in an
@@ -1765,6 +1787,10 @@ impl<Signer: Sign> Channel<Signer> {
                };
 
                if (self.channel_state & (ChannelState::AwaitingRemoteRevoke as u32 | ChannelState::PeerDisconnected as u32 | ChannelState::MonitorUpdateFailed as u32)) != 0 {
+                       // Note that this condition is the same as the assertion in
+                       // `claim_htlc_while_disconnected_dropping_mon_update` and must match exactly -
+                       // `claim_htlc_while_disconnected_dropping_mon_update` would not work correctly if we
+                       // do not get into this branch.
                        for pending_update in self.holding_cell_htlc_updates.iter() {
                                match pending_update {
                                        &HTLCUpdateAwaitingACK::ClaimHTLC { htlc_id, .. } => {
index 71ae6170aa614cd6c5fb672c444d0891b7e03863..a62f44806d45ad617f369b47018538b1c31747c7 100644 (file)
@@ -6698,7 +6698,7 @@ impl<'a, Signer: Sign, M: Deref, T: Deref, K: Deref, F: Deref, L: Deref>
                        // payments which are still in-flight via their on-chain state.
                        // We only rebuild the pending payments map if we were most recently serialized by
                        // 0.0.102+
-                       for (_, monitor) in args.channel_monitors {
+                       for (_, monitor) in args.channel_monitors.iter() {
                                if by_id.get(&monitor.get_funding_txo().0.to_channel_id()).is_none() {
                                        for (htlc_source, htlc) in monitor.get_pending_outbound_htlcs() {
                                                if let HTLCSource::OutboundRoute { payment_id, session_priv, path, payment_secret, .. } = htlc_source {
@@ -6824,6 +6824,38 @@ impl<'a, Signer: Sign, M: Deref, T: Deref, K: Deref, F: Deref, L: Deref>
                        }
                }
 
+               for (_, monitor) in args.channel_monitors.iter() {
+                       for (payment_hash, payment_preimage) in monitor.get_stored_preimages() {
+                               if let Some(claimable_htlcs) = claimable_htlcs.remove(&payment_hash) {
+                                       log_info!(args.logger, "Re-claimaing HTLCs with payment hash {} due to partial-claim.", log_bytes!(payment_hash.0));
+                                       for claimable_htlc in claimable_htlcs.1 {
+                                               // Add a holding-cell claim of the payment to the Channel, which should be
+                                               // applied ~immediately on peer reconnection. Because it won't generate a
+                                               // new commitment transaction we can just provide the payment preimage to
+                                               // the corresponding ChannelMonitor and nothing else.
+                                               //
+                                               // We do so directly instead of via the normal ChannelMonitor update
+                                               // procedure as the ChainMonitor hasn't yet been initialized, implying
+                                               // we're not allowed to call it directly yet. Further, we do the update
+                                               // without incrementing the ChannelMonitor update ID as there isn't any
+                                               // reason to.
+                                               // If we were to generate a new ChannelMonitor update ID here and then
+                                               // crash before the user finishes block connect we'd end up force-closing
+                                               // this channel as well. On the flip side, there's no harm in restarting
+                                               // without the new monitor persisted - we'll end up right back here on
+                                               // restart.
+                                               let previous_channel_id = claimable_htlc.prev_hop.outpoint.to_channel_id();
+                                               if let Some(channel) = by_id.get_mut(&previous_channel_id) {
+                                                       channel.claim_htlc_while_disconnected_dropping_mon_update(claimable_htlc.prev_hop.htlc_id, payment_preimage, &args.logger);
+                                               }
+                                               if let Some(previous_hop_monitor) = args.channel_monitors.get(&claimable_htlc.prev_hop.outpoint) {
+                                                       previous_hop_monitor.provide_payment_preimage(&payment_hash, &payment_preimage, &args.tx_broadcaster, &args.fee_estimator, &args.logger);
+                                               }
+                                       }
+                               }
+                       }
+               }
+
                let channel_manager = ChannelManager {
                        genesis_hash,
                        fee_estimator: args.fee_estimator,
index c0e33e5b93a51a4d620f6c19eec016a15468f70e..15e75db8979c86bc896557fd114c6e2d8aff837a 100644 (file)
@@ -1476,7 +1476,7 @@ pub fn send_along_route_with_secret<'a, 'b, 'c>(origin_node: &Node<'a, 'b, 'c>,
        payment_id
 }
 
-pub fn pass_along_path<'a, 'b, 'c>(origin_node: &Node<'a, 'b, 'c>, expected_path: &[&Node<'a, 'b, 'c>], recv_value: u64, our_payment_hash: PaymentHash, our_payment_secret: Option<PaymentSecret>, ev: MessageSendEvent, payment_received_expected: bool, expected_preimage: Option<PaymentPreimage>) {
+pub fn do_pass_along_path<'a, 'b, 'c>(origin_node: &Node<'a, 'b, 'c>, expected_path: &[&Node<'a, 'b, 'c>], recv_value: u64, our_payment_hash: PaymentHash, our_payment_secret: Option<PaymentSecret>, ev: MessageSendEvent, payment_received_expected: bool, clear_recipient_events: bool, expected_preimage: Option<PaymentPreimage>) {
        let mut payment_event = SendEvent::from_event(ev);
        let mut prev_node = origin_node;
 
@@ -1489,7 +1489,7 @@ pub fn pass_along_path<'a, 'b, 'c>(origin_node: &Node<'a, 'b, 'c>, expected_path
 
                expect_pending_htlcs_forwardable!(node);
 
-               if idx == expected_path.len() - 1 {
+               if idx == expected_path.len() - 1 && clear_recipient_events {
                        let events_2 = node.node.get_and_clear_pending_events();
                        if payment_received_expected {
                                assert_eq!(events_2.len(), 1);
@@ -1513,7 +1513,7 @@ pub fn pass_along_path<'a, 'b, 'c>(origin_node: &Node<'a, 'b, 'c>, expected_path
                        } else {
                                assert!(events_2.is_empty());
                        }
-               } else {
+               } else if idx != expected_path.len() - 1 {
                        let mut events_2 = node.node.get_and_clear_pending_msg_events();
                        assert_eq!(events_2.len(), 1);
                        check_added_monitors!(node, 1);
@@ -1525,6 +1525,10 @@ pub fn pass_along_path<'a, 'b, 'c>(origin_node: &Node<'a, 'b, 'c>, expected_path
        }
 }
 
+pub fn pass_along_path<'a, 'b, 'c>(origin_node: &Node<'a, 'b, 'c>, expected_path: &[&Node<'a, 'b, 'c>], recv_value: u64, our_payment_hash: PaymentHash, our_payment_secret: Option<PaymentSecret>, ev: MessageSendEvent, payment_received_expected: bool, expected_preimage: Option<PaymentPreimage>) {
+       do_pass_along_path(origin_node, expected_path, recv_value, our_payment_hash, our_payment_secret, ev, payment_received_expected, true, expected_preimage);
+}
+
 pub fn pass_along_route<'a, 'b, 'c>(origin_node: &Node<'a, 'b, 'c>, expected_route: &[&[&Node<'a, 'b, 'c>]], recv_value: u64, our_payment_hash: PaymentHash, our_payment_secret: PaymentSecret) {
        let mut events = origin_node.node.get_and_clear_pending_msg_events();
        assert_eq!(events.len(), expected_route.len());
index 48b4b07c7d76d1f73238b27eb851ac32e720adbb..e840bef4aac65e0b08edead14a8ac2a5d6447e11 100644 (file)
@@ -9843,6 +9843,186 @@ fn test_keysend_payments_to_private_node() {
        claim_payment(&nodes[0], &path, test_preimage);
 }
 
+fn do_test_partial_claim_before_restart(persist_both_monitors: bool) {
+       // Test what happens if a node receives an MPP payment, claims it, but crashes before
+       // persisting the ChannelManager. If `persist_both_monitors` is false, also crash after only
+       // updating one of the two channels' ChannelMonitors. As a result, on startup, we'll (a) still
+       // have the PaymentReceived event, (b) have one (or two) channel(s) that goes on chain with the
+       // HTLC preimage in them, and (c) optionally have one channel that is live off-chain but does
+       // not have the preimage tied to the still-pending HTLC.
+       //
+       // To get to the correct state, on startup we should propagate the preimage to the
+       // still-off-chain channel, claiming the HTLC as soon as the peer connects, with the monitor
+       // receiving the preimage without a state update.
+       let chanmon_cfgs = create_chanmon_cfgs(4);
+       let node_cfgs = create_node_cfgs(4, &chanmon_cfgs);
+       let node_chanmgrs = create_node_chanmgrs(4, &node_cfgs, &[None, None, None, None]);
+
+       let persister: test_utils::TestPersister;
+       let new_chain_monitor: test_utils::TestChainMonitor;
+       let nodes_3_deserialized: ChannelManager<EnforcingSigner, &test_utils::TestChainMonitor, &test_utils::TestBroadcaster, &test_utils::TestKeysInterface, &test_utils::TestFeeEstimator, &test_utils::TestLogger>;
+
+       let mut nodes = create_network(4, &node_cfgs, &node_chanmgrs);
+
+       create_announced_chan_between_nodes_with_value(&nodes, 0, 1, 100_000, 0, InitFeatures::known(), InitFeatures::known());
+       create_announced_chan_between_nodes_with_value(&nodes, 0, 2, 100_000, 0, InitFeatures::known(), InitFeatures::known());
+       let chan_id_persisted = create_announced_chan_between_nodes_with_value(&nodes, 1, 3, 100_000, 0, InitFeatures::known(), InitFeatures::known()).2;
+       let chan_id_not_persisted = create_announced_chan_between_nodes_with_value(&nodes, 2, 3, 100_000, 0, InitFeatures::known(), InitFeatures::known()).2;
+
+       // Create an MPP route for 15k sats, more than the default htlc-max of 10%
+       let (mut route, payment_hash, payment_preimage, payment_secret) = get_route_and_payment_hash!(nodes[0], nodes[3], 15_000_000);
+       assert_eq!(route.paths.len(), 2);
+       route.paths.sort_by(|path_a, _| {
+               // Sort the path so that the path through nodes[1] comes first
+               if path_a[0].pubkey == nodes[1].node.get_our_node_id() {
+                       core::cmp::Ordering::Less } else { core::cmp::Ordering::Greater }
+       });
+
+       nodes[0].node.send_payment(&route, payment_hash, &Some(payment_secret)).unwrap();
+       check_added_monitors!(nodes[0], 2);
+
+       // Send the payment through to nodes[3] *without* clearing the PaymentReceived event
+       let mut send_events = nodes[0].node.get_and_clear_pending_msg_events();
+       assert_eq!(send_events.len(), 2);
+       do_pass_along_path(&nodes[0], &[&nodes[1], &nodes[3]], 15_000_000, payment_hash, Some(payment_secret), send_events[0].clone(), true, false, None);
+       do_pass_along_path(&nodes[0], &[&nodes[2], &nodes[3]], 15_000_000, payment_hash, Some(payment_secret), send_events[1].clone(), true, false, None);
+
+       // Now that we have an MPP payment pending, get the latest encoded copies of nodes[3]'s
+       // monitors and ChannelManager, for use later, if we don't want to persist both monitors.
+       let mut original_monitor = test_utils::TestVecWriter(Vec::new());
+       if !persist_both_monitors {
+               for outpoint in nodes[3].chain_monitor.chain_monitor.list_monitors() {
+                       if outpoint.to_channel_id() == chan_id_not_persisted {
+                               assert!(original_monitor.0.is_empty());
+                               nodes[3].chain_monitor.chain_monitor.get_monitor(outpoint).unwrap().write(&mut original_monitor).unwrap();
+                       }
+               }
+       }
+
+       let mut original_manager = test_utils::TestVecWriter(Vec::new());
+       nodes[3].node.write(&mut original_manager).unwrap();
+
+       expect_payment_received!(nodes[3], payment_hash, payment_secret, 15_000_000);
+
+       nodes[3].node.claim_funds(payment_preimage);
+       check_added_monitors!(nodes[3], 2);
+
+       // Now fetch one of the two updated ChannelMonitors from nodes[3], and restart pretending we
+       // crashed in between the two persistence calls - using one old ChannelMonitor and one new one,
+       // with the old ChannelManager.
+       let mut updated_monitor = test_utils::TestVecWriter(Vec::new());
+       for outpoint in nodes[3].chain_monitor.chain_monitor.list_monitors() {
+               if outpoint.to_channel_id() == chan_id_persisted {
+                       assert!(updated_monitor.0.is_empty());
+                       nodes[3].chain_monitor.chain_monitor.get_monitor(outpoint).unwrap().write(&mut updated_monitor).unwrap();
+               }
+       }
+       // If `persist_both_monitors` is set, get the second monitor here as well
+       if persist_both_monitors {
+               for outpoint in nodes[3].chain_monitor.chain_monitor.list_monitors() {
+                       if outpoint.to_channel_id() == chan_id_not_persisted {
+                               assert!(original_monitor.0.is_empty());
+                               nodes[3].chain_monitor.chain_monitor.get_monitor(outpoint).unwrap().write(&mut original_monitor).unwrap();
+                       }
+               }
+       }
+
+       // Now restart nodes[3].
+       persister = test_utils::TestPersister::new();
+       let keys_manager = &chanmon_cfgs[3].keys_manager;
+       new_chain_monitor = test_utils::TestChainMonitor::new(Some(nodes[3].chain_source), nodes[3].tx_broadcaster.clone(), nodes[3].logger, node_cfgs[3].fee_estimator, &persister, keys_manager);
+       nodes[3].chain_monitor = &new_chain_monitor;
+       let mut monitors = Vec::new();
+       for mut monitor_data in [original_monitor, updated_monitor].iter() {
+               let (_, mut deserialized_monitor) = <(BlockHash, ChannelMonitor<EnforcingSigner>)>::read(&mut &monitor_data.0[..], keys_manager).unwrap();
+               monitors.push(deserialized_monitor);
+       }
+
+       let config = UserConfig::default();
+       nodes_3_deserialized = {
+               let mut channel_monitors = HashMap::new();
+               for monitor in monitors.iter_mut() {
+                       channel_monitors.insert(monitor.get_funding_txo().0, monitor);
+               }
+               <(BlockHash, ChannelManager<EnforcingSigner, &test_utils::TestChainMonitor, &test_utils::TestBroadcaster, &test_utils::TestKeysInterface, &test_utils::TestFeeEstimator, &test_utils::TestLogger>)>::read(&mut &original_manager.0[..], ChannelManagerReadArgs {
+                       default_config: config,
+                       keys_manager,
+                       fee_estimator: node_cfgs[3].fee_estimator,
+                       chain_monitor: nodes[3].chain_monitor,
+                       tx_broadcaster: nodes[3].tx_broadcaster.clone(),
+                       logger: nodes[3].logger,
+                       channel_monitors,
+               }).unwrap().1
+       };
+       nodes[3].node = &nodes_3_deserialized;
+
+       for monitor in monitors {
+               // On startup the preimage should have been copied into the non-persisted monitor:
+               assert!(monitor.get_stored_preimages().contains_key(&payment_hash));
+               nodes[3].chain_monitor.watch_channel(monitor.get_funding_txo().0.clone(), monitor).unwrap();
+       }
+       check_added_monitors!(nodes[3], 2);
+
+       nodes[1].node.peer_disconnected(&nodes[3].node.get_our_node_id(), false);
+       nodes[2].node.peer_disconnected(&nodes[3].node.get_our_node_id(), false);
+
+       // During deserialization, we should have closed one channel (both, if
+       // `persist_both_monitors` is set), broadcasting the latest commitment transaction for each.
+       // We should also still have the original PaymentReceived event we never finished processing.
+       let events = nodes[3].node.get_and_clear_pending_events();
+       assert_eq!(events.len(), if persist_both_monitors { 3 } else { 2 });
+       if let Event::PaymentReceived { amt: 15_000_000, .. } = events[0] { } else { panic!(); }
+       if let Event::ChannelClosed { reason: ClosureReason::OutdatedChannelManager, .. } = events[1] { } else { panic!(); }
+       if persist_both_monitors {
+               if let Event::ChannelClosed { reason: ClosureReason::OutdatedChannelManager, .. } = events[2] { } else { panic!(); }
+       }
+
+       assert_eq!(nodes[3].node.list_channels().len(), if persist_both_monitors { 0 } else { 1 });
+       if !persist_both_monitors {
+               // If one of the two channels is still live, reveal the payment preimage over it.
+
+               nodes[3].node.peer_connected(&nodes[2].node.get_our_node_id(), &msgs::Init { features: InitFeatures::empty(), remote_network_address: None });
+               let reestablish_1 = get_chan_reestablish_msgs!(nodes[3], nodes[2]);
+               nodes[2].node.peer_connected(&nodes[3].node.get_our_node_id(), &msgs::Init { features: InitFeatures::empty(), remote_network_address: None });
+               let reestablish_2 = get_chan_reestablish_msgs!(nodes[2], nodes[3]);
+
+               nodes[2].node.handle_channel_reestablish(&nodes[3].node.get_our_node_id(), &reestablish_1[0]);
+               get_event_msg!(nodes[2], MessageSendEvent::SendChannelUpdate, nodes[3].node.get_our_node_id());
+               assert!(nodes[2].node.get_and_clear_pending_msg_events().is_empty());
+
+               nodes[3].node.handle_channel_reestablish(&nodes[2].node.get_our_node_id(), &reestablish_2[0]);
+
+               // Once we call `get_and_clear_pending_msg_events` the holding cell is cleared and the HTLC
+               // claim should fly.
+               let ds_msgs = nodes[3].node.get_and_clear_pending_msg_events();
+               check_added_monitors!(nodes[3], 1);
+               assert_eq!(ds_msgs.len(), 2);
+               if let MessageSendEvent::SendChannelUpdate { .. } = ds_msgs[1] {} else { panic!(); }
+
+               let cs_updates = match ds_msgs[0] {
+                       MessageSendEvent::UpdateHTLCs { ref updates, .. } => {
+                               nodes[2].node.handle_update_fulfill_htlc(&nodes[3].node.get_our_node_id(), &updates.update_fulfill_htlcs[0]);
+                               check_added_monitors!(nodes[2], 1);
+                               let cs_updates = get_htlc_update_msgs!(nodes[2], nodes[0].node.get_our_node_id());
+                               expect_payment_forwarded!(nodes[2], nodes[0], nodes[3], Some(1000), false, false);
+                               commitment_signed_dance!(nodes[2], nodes[3], updates.commitment_signed, false, true);
+                               cs_updates
+                       }
+                       _ => panic!(),
+               };
+
+               nodes[0].node.handle_update_fulfill_htlc(&nodes[2].node.get_our_node_id(), &cs_updates.update_fulfill_htlcs[0]);
+               commitment_signed_dance!(nodes[0], nodes[2], cs_updates.commitment_signed, false, true);
+               expect_payment_sent!(nodes[0], payment_preimage);
+       }
+}
+
+#[test]
+fn test_partial_claim_before_restart() {
+       do_test_partial_claim_before_restart(false);
+       do_test_partial_claim_before_restart(true);
+}
+
 /// The possible events which may trigger a `max_dust_htlc_exposure` breach
 #[derive(Clone, Copy, PartialEq)]
 enum ExposureEvent {