Disconnect peers on timer ticks to unblock channel state machine
authorWilmer Paulino <wilmer@wilmerpaulino.com>
Sat, 13 May 2023 01:39:18 +0000 (18:39 -0700)
committerWilmer Paulino <wilmer@wilmerpaulino.com>
Fri, 26 May 2023 21:40:19 +0000 (14:40 -0700)
At times, we've noticed that channels with `lnd` counterparties do not
receive messages we expect to in a timely manner (or at all) after
sending them a `ChannelReestablish` upon reconnection, or a
`CommitmentSigned` message. This can block the channel state machine
from making progress, eventually leading to force closes, if any pending
HTLCs are committed and their expiration is met.

It seems common wisdom for `lnd` node operators to periodically restart
their node/reconnect to their peers, allowing them to start from a fresh
state such that the message we expect to receive hopefully gets sent. We
can achieve the same end result by disconnecting peers ourselves
(regardless of whether they're a `lnd` node), which we opt to implement
here by awaiting their response within two timer ticks.

lightning/src/ln/channel.rs
lightning/src/ln/channelmanager.rs
lightning/src/ln/functional_tests.rs

index 11f0261d677a3df37b2fa36f94ea142e5ce8f16c..d1412b2aeb1c04aaf0cbb60e2da960bc8352cedb 100644 (file)
@@ -479,6 +479,13 @@ pub(crate) const MIN_AFFORDABLE_HTLC_COUNT: usize = 4;
 ///   * `EXPIRE_PREV_CONFIG_TICKS` = convergence_delay / tick_interval
 pub(crate) const EXPIRE_PREV_CONFIG_TICKS: usize = 5;
 
+/// The number of ticks that may elapse while we're waiting for a response to a
+/// [`msgs::RevokeAndACK`] or [`msgs::ChannelReestablish`] message before we attempt to disconnect
+/// them.
+///
+/// See [`Channel::sent_message_awaiting_response`] for more information.
+pub(crate) const DISCONNECT_PEER_AWAITING_RESPONSE_TICKS: usize = 2;
+
 struct PendingChannelMonitorUpdate {
        update: ChannelMonitorUpdate,
        /// In some cases we need to delay letting the [`ChannelMonitorUpdate`] go until after an
@@ -715,6 +722,19 @@ pub(super) struct Channel<Signer: ChannelSigner> {
        /// See-also <https://github.com/lightningnetwork/lnd/issues/4006>
        pub workaround_lnd_bug_4006: Option<msgs::ChannelReady>,
 
+       /// An option set when we wish to track how many ticks have elapsed while waiting for a response
+       /// from our counterparty after sending a message. If the peer has yet to respond after reaching
+       /// `DISCONNECT_PEER_AWAITING_RESPONSE_TICKS`, a reconnection should be attempted to try to
+       /// unblock the state machine.
+       ///
+       /// This behavior is mostly motivated by a lnd bug in which we don't receive a message we expect
+       /// to in a timely manner, which may lead to channels becoming unusable and/or force-closed. An
+       /// example of such can be found at <https://github.com/lightningnetwork/lnd/issues/7682>.
+       ///
+       /// This is currently only used when waiting for a [`msgs::ChannelReestablish`] or
+       /// [`msgs::RevokeAndACK`] message from the counterparty.
+       sent_message_awaiting_response: Option<usize>,
+
        #[cfg(any(test, fuzzing))]
        // When we receive an HTLC fulfill on an outbound path, we may immediately fulfill the
        // corresponding HTLC on the inbound path. If, then, the outbound path channel is
@@ -1130,6 +1150,7 @@ impl<Signer: WriteableEcdsaChannelSigner> Channel<Signer> {
                        next_remote_commitment_tx_fee_info_cached: Mutex::new(None),
 
                        workaround_lnd_bug_4006: None,
+                       sent_message_awaiting_response: None,
 
                        latest_inbound_scid_alias: None,
                        outbound_scid_alias,
@@ -1489,6 +1510,7 @@ impl<Signer: WriteableEcdsaChannelSigner> Channel<Signer> {
                        next_remote_commitment_tx_fee_info_cached: Mutex::new(None),
 
                        workaround_lnd_bug_4006: None,
+                       sent_message_awaiting_response: None,
 
                        latest_inbound_scid_alias: None,
                        outbound_scid_alias,
@@ -3526,6 +3548,7 @@ impl<Signer: WriteableEcdsaChannelSigner> Channel<Signer> {
                // OK, we step the channel here and *then* if the new generation fails we can fail the
                // channel based on that, but stepping stuff here should be safe either way.
                self.channel_state &= !(ChannelState::AwaitingRemoteRevoke as u32);
+               self.sent_message_awaiting_response = None;
                self.counterparty_prev_commitment_point = self.counterparty_cur_commitment_point;
                self.counterparty_cur_commitment_point = Some(msg.next_per_commitment_point);
                self.cur_counterparty_commitment_transaction_number -= 1;
@@ -3841,6 +3864,8 @@ impl<Signer: WriteableEcdsaChannelSigner> Channel<Signer> {
                        }
                }
 
+               self.sent_message_awaiting_response = None;
+
                self.channel_state |= ChannelState::PeerDisconnected as u32;
                log_trace!(logger, "Peer disconnection resulted in {} remote-announced HTLC drops on channel {}", inbound_drop_count, log_bytes!(self.channel_id()));
        }
@@ -3943,6 +3968,7 @@ impl<Signer: WriteableEcdsaChannelSigner> Channel<Signer> {
                        Some(self.get_last_revoke_and_ack())
                } else { None };
                let commitment_update = if self.monitor_pending_commitment_signed {
+                       self.mark_awaiting_response();
                        Some(self.get_last_commitment_update(logger))
                } else { None };
 
@@ -4132,6 +4158,7 @@ impl<Signer: WriteableEcdsaChannelSigner> Channel<Signer> {
                // Go ahead and unmark PeerDisconnected as various calls we may make check for it (and all
                // remaining cases either succeed or ErrorMessage-fail).
                self.channel_state &= !(ChannelState::PeerDisconnected as u32);
+               self.sent_message_awaiting_response = None;
 
                let shutdown_msg = if self.channel_state & (ChannelState::LocalShutdownSent as u32) != 0 {
                        assert!(self.shutdown_scriptpubkey.is_some());
@@ -4192,7 +4219,11 @@ impl<Signer: WriteableEcdsaChannelSigner> Channel<Signer> {
                // revoke_and_ack, not on sending commitment_signed, so we add one if have
                // AwaitingRemoteRevoke set, which indicates we sent a commitment_signed but haven't gotten
                // the corresponding revoke_and_ack back yet.
-               let next_counterparty_commitment_number = INITIAL_COMMITMENT_NUMBER - self.cur_counterparty_commitment_transaction_number + if (self.channel_state & ChannelState::AwaitingRemoteRevoke as u32) != 0 { 1 } else { 0 };
+               let is_awaiting_remote_revoke = self.channel_state & ChannelState::AwaitingRemoteRevoke as u32 != 0;
+               if is_awaiting_remote_revoke && !self.is_awaiting_monitor_update() {
+                       self.mark_awaiting_response();
+               }
+               let next_counterparty_commitment_number = INITIAL_COMMITMENT_NUMBER - self.cur_counterparty_commitment_transaction_number + if is_awaiting_remote_revoke { 1 } else { 0 };
 
                let channel_ready = if msg.next_local_commitment_number == 1 && INITIAL_COMMITMENT_NUMBER - self.cur_holder_commitment_transaction_number == 1 {
                        // We should never have to worry about MonitorUpdateInProgress resending ChannelReady
@@ -4361,6 +4392,28 @@ impl<Signer: WriteableEcdsaChannelSigner> Channel<Signer> {
                }), None))
        }
 
+       // Marks a channel as waiting for a response from the counterparty. If it's not received
+       // [`DISCONNECT_PEER_AWAITING_RESPONSE_TICKS`] after sending our own to them, then we'll attempt
+       // a reconnection.
+       fn mark_awaiting_response(&mut self) {
+               self.sent_message_awaiting_response = Some(0);
+       }
+
+       /// Determines whether we should disconnect the counterparty due to not receiving a response
+       /// within our expected timeframe.
+       ///
+       /// This should be called on every [`super::channelmanager::ChannelManager::timer_tick_occurred`].
+       pub fn should_disconnect_peer_awaiting_response(&mut self) -> bool {
+               let ticks_elapsed = if let Some(ticks_elapsed) = self.sent_message_awaiting_response.as_mut() {
+                       ticks_elapsed
+               } else {
+                       // Don't disconnect when we're not waiting on a response.
+                       return false;
+               };
+               *ticks_elapsed += 1;
+               *ticks_elapsed >= DISCONNECT_PEER_AWAITING_RESPONSE_TICKS
+       }
+
        pub fn shutdown<SP: Deref>(
                &mut self, signer_provider: &SP, their_features: &InitFeatures, msg: &msgs::Shutdown
        ) -> Result<(Option<msgs::Shutdown>, Option<&ChannelMonitorUpdate>, Vec<(HTLCSource, PaymentHash)>), ChannelError>
@@ -5733,7 +5786,7 @@ impl<Signer: WriteableEcdsaChannelSigner> Channel<Signer> {
 
        /// May panic if called on a channel that wasn't immediately-previously
        /// self.remove_uncommitted_htlcs_and_mark_paused()'d
-       pub fn get_channel_reestablish<L: Deref>(&self, logger: &L) -> msgs::ChannelReestablish where L::Target: Logger {
+       pub fn get_channel_reestablish<L: Deref>(&mut self, logger: &L) -> msgs::ChannelReestablish where L::Target: Logger {
                assert_eq!(self.channel_state & ChannelState::PeerDisconnected as u32, ChannelState::PeerDisconnected as u32);
                assert_ne!(self.cur_counterparty_commitment_transaction_number, INITIAL_COMMITMENT_NUMBER);
                // Prior to static_remotekey, my_current_per_commitment_point was critical to claiming
@@ -5752,6 +5805,7 @@ impl<Signer: WriteableEcdsaChannelSigner> Channel<Signer> {
                        log_info!(logger, "Sending a data_loss_protect with no previous remote per_commitment_secret for channel {}", log_bytes!(self.channel_id()));
                        [0;32]
                };
+               self.mark_awaiting_response();
                msgs::ChannelReestablish {
                        channel_id: self.channel_id(),
                        // The protocol has two different commitment number concepts - the "commitment
@@ -7090,6 +7144,7 @@ impl<'a, 'b, 'c, ES: Deref, SP: Deref> ReadableArgs<(&'a ES, &'b SP, u32, &'c Ch
                        next_remote_commitment_tx_fee_info_cached: Mutex::new(None),
 
                        workaround_lnd_bug_4006: None,
+                       sent_message_awaiting_response: None,
 
                        latest_inbound_scid_alias,
                        // Later in the ChannelManager deserialization phase we scan for channels and assign scid aliases if its missing
index 30eab9066fd0b4190ef65167d8e82c98c51c9be1..f489003029afd847f5314fb3dba4e2a721f57c4b 100644 (file)
@@ -3921,6 +3921,20 @@ where
 
                                                chan.maybe_expire_prev_config();
 
+                                               if chan.should_disconnect_peer_awaiting_response() {
+                                                       log_debug!(self.logger, "Disconnecting peer {} due to not making any progress on channel {}",
+                                                                       counterparty_node_id, log_bytes!(*chan_id));
+                                                       pending_msg_events.push(MessageSendEvent::HandleError {
+                                                               node_id: counterparty_node_id,
+                                                               action: msgs::ErrorAction::DisconnectPeerWithWarning {
+                                                                       msg: msgs::WarningMessage {
+                                                                               channel_id: *chan_id,
+                                                                               data: "Disconnecting due to timeout awaiting response".to_owned(),
+                                                                       },
+                                                               },
+                                                       });
+                                               }
+
                                                true
                                        });
                                        if peer_state.ok_to_remove(true) {
index 98fc4bd95b6d58f100ae09aaa32519e57237598c..508c540dff756a01a8fe76946cc9e30632b3d1ca 100644 (file)
@@ -22,7 +22,7 @@ use crate::events::{Event, MessageSendEvent, MessageSendEventsProvider, PathFail
 use crate::ln::{PaymentPreimage, PaymentSecret, PaymentHash};
 use crate::ln::channel::{commitment_tx_base_weight, COMMITMENT_TX_WEIGHT_PER_HTLC, CONCURRENT_INBOUND_HTLC_FEE_BUFFER, FEE_SPIKE_BUFFER_FEE_INCREASE_MULTIPLE, MIN_AFFORDABLE_HTLC_COUNT};
 use crate::ln::channelmanager::{self, PaymentId, RAACommitmentOrder, PaymentSendFailure, RecipientOnionFields, BREAKDOWN_TIMEOUT, ENABLE_GOSSIP_TICKS, DISABLE_GOSSIP_TICKS, MIN_CLTV_EXPIRY_DELTA};
-use crate::ln::channel::{Channel, ChannelError};
+use crate::ln::channel::{DISCONNECT_PEER_AWAITING_RESPONSE_TICKS, Channel, ChannelError};
 use crate::ln::{chan_utils, onion_utils};
 use crate::ln::chan_utils::{OFFERED_HTLC_SCRIPT_WEIGHT, htlc_success_tx_weight, htlc_timeout_tx_weight, HTLCOutputInCommitment};
 use crate::routing::gossip::{NetworkGraph, NetworkUpdate};
@@ -9955,3 +9955,128 @@ fn test_payment_with_custom_min_cltv_expiry_delta() {
        do_payment_with_custom_min_final_cltv_expiry(true, false);
        do_payment_with_custom_min_final_cltv_expiry(true, true);
 }
+
+#[test]
+fn test_disconnects_peer_awaiting_response_ticks() {
+       // Tests that nodes which are awaiting on a response critical for channel responsiveness
+       // disconnect their counterparty after `DISCONNECT_PEER_AWAITING_RESPONSE_TICKS`.
+       let mut chanmon_cfgs = create_chanmon_cfgs(2);
+       let node_cfgs = create_node_cfgs(2, &chanmon_cfgs);
+       let node_chanmgrs = create_node_chanmgrs(2, &node_cfgs, &[None, None]);
+       let nodes = create_network(2, &node_cfgs, &node_chanmgrs);
+
+       // Asserts a disconnect event is queued to the user.
+       let check_disconnect_event = |node: &Node, should_disconnect: bool| {
+               let disconnect_event = node.node.get_and_clear_pending_msg_events().iter().find_map(|event|
+                       if let MessageSendEvent::HandleError { action, .. } = event {
+                               if let msgs::ErrorAction::DisconnectPeerWithWarning { .. } = action {
+                                       Some(())
+                               } else {
+                                       None
+                               }
+                       } else {
+                               None
+                       }
+               );
+               assert_eq!(disconnect_event.is_some(), should_disconnect);
+       };
+
+       // Fires timer ticks ensuring we only attempt to disconnect peers after reaching
+       // `DISCONNECT_PEER_AWAITING_RESPONSE_TICKS`.
+       let check_disconnect = |node: &Node| {
+               // No disconnect without any timer ticks.
+               check_disconnect_event(node, false);
+
+               // No disconnect with 1 timer tick less than required.
+               for _ in 0..DISCONNECT_PEER_AWAITING_RESPONSE_TICKS - 1 {
+                       node.node.timer_tick_occurred();
+                       check_disconnect_event(node, false);
+               }
+
+               // Disconnect after reaching the required ticks.
+               node.node.timer_tick_occurred();
+               check_disconnect_event(node, true);
+
+               // Disconnect again on the next tick if the peer hasn't been disconnected yet.
+               node.node.timer_tick_occurred();
+               check_disconnect_event(node, true);
+       };
+
+       create_chan_between_nodes(&nodes[0], &nodes[1]);
+
+       // We'll start by performing a fee update with Alice (nodes[0]) on the channel.
+       *nodes[0].fee_estimator.sat_per_kw.lock().unwrap() *= 2;
+       nodes[0].node.timer_tick_occurred();
+       check_added_monitors!(&nodes[0], 1);
+       let alice_fee_update = get_htlc_update_msgs(&nodes[0], &nodes[1].node.get_our_node_id());
+       nodes[1].node.handle_update_fee(&nodes[0].node.get_our_node_id(), alice_fee_update.update_fee.as_ref().unwrap());
+       nodes[1].node.handle_commitment_signed(&nodes[0].node.get_our_node_id(), &alice_fee_update.commitment_signed);
+       check_added_monitors!(&nodes[1], 1);
+
+       // This will prompt Bob (nodes[1]) to respond with his `CommitmentSigned` and `RevokeAndACK`.
+       let (bob_revoke_and_ack, bob_commitment_signed) = get_revoke_commit_msgs!(&nodes[1], nodes[0].node.get_our_node_id());
+       nodes[0].node.handle_revoke_and_ack(&nodes[1].node.get_our_node_id(), &bob_revoke_and_ack);
+       check_added_monitors!(&nodes[0], 1);
+       nodes[0].node.handle_commitment_signed(&nodes[1].node.get_our_node_id(), &bob_commitment_signed);
+       check_added_monitors(&nodes[0], 1);
+
+       // Alice then needs to send her final `RevokeAndACK` to complete the commitment dance. We
+       // pretend Bob hasn't received the message and check whether he'll disconnect Alice after
+       // reaching `DISCONNECT_PEER_AWAITING_RESPONSE_TICKS`.
+       let alice_revoke_and_ack = get_event_msg!(nodes[0], MessageSendEvent::SendRevokeAndACK, nodes[1].node.get_our_node_id());
+       check_disconnect(&nodes[1]);
+
+       // Now, we'll reconnect them to test awaiting a `ChannelReestablish` message.
+       //
+       // Note that since the commitment dance didn't complete above, Alice is expected to resend her
+       // final `RevokeAndACK` to Bob to complete it.
+       nodes[0].node.peer_disconnected(&nodes[1].node.get_our_node_id());
+       nodes[1].node.peer_disconnected(&nodes[0].node.get_our_node_id());
+       let bob_init = msgs::Init { features: nodes[1].node.init_features(), remote_network_address: None };
+       nodes[0].node.peer_connected(&nodes[1].node.get_our_node_id(), &bob_init, true).unwrap();
+       let alice_init = msgs::Init { features: nodes[0].node.init_features(), remote_network_address: None };
+       nodes[1].node.peer_connected(&nodes[0].node.get_our_node_id(), &alice_init, true).unwrap();
+
+       // Upon reconnection, Alice sends her `ChannelReestablish` to Bob. Alice, however, hasn't
+       // received Bob's yet, so she should disconnect him after reaching
+       // `DISCONNECT_PEER_AWAITING_RESPONSE_TICKS`.
+       let alice_channel_reestablish = get_event_msg!(
+               nodes[0], MessageSendEvent::SendChannelReestablish, nodes[1].node.get_our_node_id()
+       );
+       nodes[1].node.handle_channel_reestablish(&nodes[0].node.get_our_node_id(), &alice_channel_reestablish);
+       check_disconnect(&nodes[0]);
+
+       // Bob now sends his `ChannelReestablish` to Alice to resume the channel and consider it "live".
+       let bob_channel_reestablish = nodes[1].node.get_and_clear_pending_msg_events().iter().find_map(|event|
+               if let MessageSendEvent::SendChannelReestablish { node_id, msg } = event {
+                       assert_eq!(*node_id, nodes[0].node.get_our_node_id());
+                       Some(msg.clone())
+               } else {
+                       None
+               }
+       ).unwrap();
+       nodes[0].node.handle_channel_reestablish(&nodes[1].node.get_our_node_id(), &bob_channel_reestablish);
+
+       // Sanity check that Alice won't disconnect Bob since she's no longer waiting for any messages.
+       for _ in 0..DISCONNECT_PEER_AWAITING_RESPONSE_TICKS {
+               nodes[0].node.timer_tick_occurred();
+               check_disconnect_event(&nodes[0], false);
+       }
+
+       // However, Bob is still waiting on Alice's `RevokeAndACK`, so he should disconnect her after
+       // reaching `DISCONNECT_PEER_AWAITING_RESPONSE_TICKS`.
+       check_disconnect(&nodes[1]);
+
+       // Finally, have Bob process the last message.
+       nodes[1].node.handle_revoke_and_ack(&nodes[0].node.get_our_node_id(), &alice_revoke_and_ack);
+       check_added_monitors(&nodes[1], 1);
+
+       // At this point, neither node should attempt to disconnect each other, since they aren't
+       // waiting on any messages.
+       for node in &nodes {
+               for _ in 0..DISCONNECT_PEER_AWAITING_RESPONSE_TICKS {
+                       node.node.timer_tick_occurred();
+                       check_disconnect_event(node, false);
+               }
+       }
+}