From 920d96edb6595289902f287419de2d002e2dc2ee Mon Sep 17 00:00:00 2001
From: Matt Corallo <git@bluematt.me>
Date: Tue, 11 Jun 2024 01:39:48 +0000
Subject: [PATCH] Unblock channels awaiting monitor update based on `ChanMan`
 queue

When we have `ChannelMonitorUpdate`s which are completing both
synchronously and asynchronously, we need to consider a channel as
unblocked based on the `ChannelManager` monitor update queue,
rather than by checking the `update_id`s.

Consider the case where a channel is updated, leading to a
`ChannelMonitorUpdate` which completes asynchronously. The update
completes, but prior to the `ChannelManager` receiving the
`MonitorEvent::Completed` it generates a further
`ChannelMonitorUpdate`. This second update completes synchronously.
As a result, when the `MonitorEvent` is processed, the event's
`monitor_update_id` is the first update, but there are no updates
queued and the channel should be free to return to be unblocked.

Here we fix this by looking only at the `ChannelManager` update
queue, rather than the update_id of the `MonitorEvent`.

While we don't anticipate many users having both synchronous and
asynchronous persists in the same application, there isn't much
cost to supporting it, which we do here.

Found by the chanmon_consistency target.
---
 lightning/src/ln/chanmon_update_fail_tests.rs | 87 +++++++++++++++++++
 lightning/src/ln/channelmanager.rs            |  2 +-
 2 files changed, 88 insertions(+), 1 deletion(-)

diff --git a/lightning/src/ln/chanmon_update_fail_tests.rs b/lightning/src/ln/chanmon_update_fail_tests.rs
index b33e19620..5b821bfb8 100644
--- a/lightning/src/ln/chanmon_update_fail_tests.rs
+++ b/lightning/src/ln/chanmon_update_fail_tests.rs
@@ -3337,6 +3337,93 @@ fn test_durable_preimages_on_closed_channel() {
 	do_test_durable_preimages_on_closed_channel(false, false, false);
 }
 
+#[test]
+fn test_sync_async_persist_doesnt_hang() {
+	// Previously, we checked if a channel was a candidate for making forward progress based on if
+	// the `MonitorEvent::Completed` matched the channel's latest monitor update id. However, this
+	// could lead to a rare race when `ChannelMonitor`s were being persisted both synchronously and
+	// asynchronously leading to channel hangs.
+	//
+	// To hit this case, we need to generate a `MonitorEvent::Completed` prior to a new channel
+	// update, but which is only processed after the channel update.
+	let chanmon_cfgs = create_chanmon_cfgs(2);
+	let node_cfgs = create_node_cfgs(2, &chanmon_cfgs);
+	let node_chanmgrs = create_node_chanmgrs(2, &node_cfgs, &[None, None]);
+	let nodes = create_network(2, &node_cfgs, &node_chanmgrs);
+
+	let chan_id_ab = create_announced_chan_between_nodes(&nodes, 0, 1).2;
+
+	// Send two payments from A to B, then claim the first, marking the very last
+	// ChannelMonitorUpdate as InProgress...
+	let (payment_preimage_1, payment_hash_1, ..) = route_payment(&nodes[0], &[&nodes[1]], 1_000_000);
+	let (payment_preimage_2, payment_hash_2, ..) = route_payment(&nodes[0], &[&nodes[1]], 1_000_000);
+
+	nodes[1].node.claim_funds(payment_preimage_1);
+	check_added_monitors(&nodes[1], 1);
+	expect_payment_claimed!(nodes[1], payment_hash_1, 1_000_000);
+
+	let bs_updates = get_htlc_update_msgs(&nodes[1], &nodes[0].node.get_our_node_id());
+	nodes[0].node.handle_update_fulfill_htlc(&nodes[1].node.get_our_node_id(), &bs_updates.update_fulfill_htlcs[0]);
+	expect_payment_sent(&nodes[0], payment_preimage_1, None, false, false);
+	nodes[0].node.handle_commitment_signed(&nodes[1].node.get_our_node_id(), &bs_updates.commitment_signed);
+	check_added_monitors(&nodes[0], 1);
+	let (as_raa, as_cs) = get_revoke_commit_msgs!(nodes[0], nodes[1].node.get_our_node_id());
+
+	nodes[1].node.handle_revoke_and_ack(&nodes[0].node.get_our_node_id(), &as_raa);
+	check_added_monitors(&nodes[1], 1);
+	nodes[1].node.handle_commitment_signed(&nodes[0].node.get_our_node_id(), &as_cs);
+	check_added_monitors(&nodes[1], 1);
+
+	let bs_final_raa = get_event_msg!(nodes[1], MessageSendEvent::SendRevokeAndACK, nodes[0].node.get_our_node_id());
+	chanmon_cfgs[0].persister.set_update_ret(ChannelMonitorUpdateStatus::InProgress);
+	nodes[0].node.handle_revoke_and_ack(&nodes[1].node.get_our_node_id(), &bs_final_raa);
+	check_added_monitors(&nodes[0], 1);
+
+	// Immediately complete the monitor update, but before the ChannelManager has a chance to see
+	// the MonitorEvent::Completed, create a channel update by receiving a claim on the second
+	// payment.
+	let (outpoint, _, ab_update_id) = nodes[0].chain_monitor.latest_monitor_update_id.lock().unwrap().get(&chan_id_ab).unwrap().clone();
+	nodes[0].chain_monitor.chain_monitor.channel_monitor_updated(outpoint, ab_update_id).unwrap();
+
+	nodes[1].node.claim_funds(payment_preimage_2);
+	check_added_monitors(&nodes[1], 1);
+	expect_payment_claimed!(nodes[1], payment_hash_2, 1_000_000);
+
+	let bs_updates = get_htlc_update_msgs(&nodes[1], &nodes[0].node.get_our_node_id());
+	nodes[0].node.handle_update_fulfill_htlc(&nodes[1].node.get_our_node_id(), &bs_updates.update_fulfill_htlcs[0]);
+	nodes[0].node.handle_commitment_signed(&nodes[1].node.get_our_node_id(), &bs_updates.commitment_signed);
+	check_added_monitors(&nodes[0], 1);
+
+	// At this point, we have completed an extra `ChannelMonitorUpdate` but the `ChannelManager`
+	// hasn't yet seen our `MonitorEvent::Completed`. When we call
+	// `get_and_clear_pending_msg_events` here, the `ChannelManager` finally sees that event and
+	// should return the channel to normal operation.
+	let (as_raa, as_cs) = get_revoke_commit_msgs!(nodes[0], nodes[1].node.get_our_node_id());
+
+	// Now that we've completed our test, process the events we have queued up (which we were not
+	// able to check until now as they would have caused the `ChannelManager` to look at the
+	// pending `MonitorEvent`s).
+	let pending_events = nodes[0].node.get_and_clear_pending_events();
+	assert_eq!(pending_events.len(), 2);
+	if let Event::PaymentPathSuccessful { ref payment_hash, ..} = pending_events[1] {
+		assert_eq!(payment_hash.unwrap(), payment_hash_1);
+	} else { panic!(); }
+	if let Event::PaymentSent { ref payment_hash, ..} = pending_events[0] {
+		assert_eq!(*payment_hash, payment_hash_2);
+	} else { panic!(); }
+
+	// Finally, complete the claiming of the second payment
+	nodes[1].node.handle_revoke_and_ack(&nodes[0].node.get_our_node_id(), &as_raa);
+	check_added_monitors(&nodes[1], 1);
+	nodes[1].node.handle_commitment_signed(&nodes[0].node.get_our_node_id(), &as_cs);
+	check_added_monitors(&nodes[1], 1);
+
+	let bs_final_raa = get_event_msg!(nodes[1], MessageSendEvent::SendRevokeAndACK, nodes[0].node.get_our_node_id());
+	nodes[0].node.handle_revoke_and_ack(&nodes[1].node.get_our_node_id(), &bs_final_raa);
+	check_added_monitors(&nodes[0], 1);
+	expect_payment_path_successful!(nodes[0]);
+}
+
 fn do_test_reload_mon_update_completion_actions(close_during_reload: bool) {
 	// Test that if a `ChannelMonitorUpdate` completes but a `ChannelManager` isn't serialized
 	// before restart we run the monitor update completion action on startup.
diff --git a/lightning/src/ln/channelmanager.rs b/lightning/src/ln/channelmanager.rs
index 992fd3c6f..a106537ff 100644
--- a/lightning/src/ln/channelmanager.rs
+++ b/lightning/src/ln/channelmanager.rs
@@ -6613,7 +6613,7 @@ where
 		log_trace!(logger, "ChannelMonitor updated to {}. Current highest is {}. {} pending in-flight updates.",
 			highest_applied_update_id, channel.context.get_latest_monitor_update_id(),
 			remaining_in_flight);
-		if !channel.is_awaiting_monitor_update() || channel.context.get_latest_monitor_update_id() != highest_applied_update_id {
+		if !channel.is_awaiting_monitor_update() || remaining_in_flight != 0 {
 			return;
 		}
 		handle_monitor_update_completion!(self, peer_state_lock, peer_state, per_peer_state, channel);
-- 
2.39.5