From 93e645daf46f85949ae0edf60d36bf21e9fde8af Mon Sep 17 00:00:00 2001
From: Matt Corallo <git@bluematt.me>
Date: Wed, 6 Jul 2022 19:17:40 +0000
Subject: [PATCH] Track channels which a given payment part failed to traverse

When an HTLC fails, we currently rely on the scorer learning the
failed channel and assigning an infinite (`u64::max_value()`)
penalty to the channel so as to avoid retrying over the exact same
path (if there's only one available path). This is common when
trying to pay a mobile client behind an LSP if the mobile client is
currently offline.

This leads to the scorer being overly conservative in some cases -
returning `u64::max_value()` when a given path hasn't been tried
for a given payment may not be the best decision, even if that
channel failed 50 minutes ago.

By tracking channels which failed on a payment part level and
explicitly refusing to route over them we can relax the
requirements on the scorer, allowing it to make different decisions
on how to treat channels that failed relatively recently without
causing payments to retry the same path forever.

This does have the drawback that it could allow two separate part
of a payment to traverse the same path even though that path just
failed, however this should only occur if the payment is going to
fail anyway, at least as long as the scorer is properly learning.

Closes #1241, superseding #1252.
---
 lightning/src/ln/channelmanager.rs        |  9 +++-
 lightning/src/ln/functional_test_utils.rs |  5 ++-
 lightning/src/routing/router.rs           | 54 ++++++++++++++++++++---
 3 files changed, 59 insertions(+), 9 deletions(-)

diff --git a/lightning/src/ln/channelmanager.rs b/lightning/src/ln/channelmanager.rs
index bae408d29..611e8a562 100644
--- a/lightning/src/ln/channelmanager.rs
+++ b/lightning/src/ln/channelmanager.rs
@@ -3894,7 +3894,7 @@ impl<Signer: Sign, M: Deref, T: Deref, K: Deref, F: Deref, L: Deref> ChannelMana
 					return;
 				}
 				mem::drop(channel_state_lock);
-				let retry = if let Some(payment_params_data) = payment_params {
+				let mut retry = if let Some(payment_params_data) = payment_params {
 					let path_last_hop = path.last().expect("Outbound payments must have had a valid path");
 					Some(RouteParameters {
 						payment_params: payment_params_data.clone(),
@@ -3930,6 +3930,9 @@ impl<Signer: Sign, M: Deref, T: Deref, K: Deref, F: Deref, L: Deref> ChannelMana
 							// TODO: If we decided to blame ourselves (or one of our channels) in
 							// process_onion_failure we should close that channel as it implies our
 							// next-hop is needlessly blaming us!
+							if let Some(scid) = short_channel_id {
+								retry.as_mut().map(|r| r.payment_params.previously_failed_channels.push(scid));
+							}
 							events::Event::PaymentPathFailed {
 								payment_id: Some(payment_id),
 								payment_hash: payment_hash.clone(),
@@ -3959,6 +3962,8 @@ impl<Signer: Sign, M: Deref, T: Deref, K: Deref, F: Deref, L: Deref> ChannelMana
 						// ChannelDetails.
 						// TODO: For non-temporary failures, we really should be closing the
 						// channel here as we apparently can't relay through them anyway.
+						let scid = path.first().unwrap().short_channel_id;
+						retry.as_mut().map(|r| r.payment_params.previously_failed_channels.push(scid));
 						events::Event::PaymentPathFailed {
 							payment_id: Some(payment_id),
 							payment_hash: payment_hash.clone(),
@@ -3966,7 +3971,7 @@ impl<Signer: Sign, M: Deref, T: Deref, K: Deref, F: Deref, L: Deref> ChannelMana
 							network_update: None,
 							all_paths_failed,
 							path: path.clone(),
-							short_channel_id: Some(path.first().unwrap().short_channel_id),
+							short_channel_id: Some(scid),
 							retry,
 #[cfg(test)]
 							error_code: Some(*failure_code),
diff --git a/lightning/src/ln/functional_test_utils.rs b/lightning/src/ln/functional_test_utils.rs
index fe78395e2..a49e711b8 100644
--- a/lightning/src/ln/functional_test_utils.rs
+++ b/lightning/src/ln/functional_test_utils.rs
@@ -1492,7 +1492,7 @@ pub fn expect_payment_failed_conditions<'a, 'b, 'c, 'd, 'e>(
 	let mut events = node.node.get_and_clear_pending_events();
 	assert_eq!(events.len(), 1);
 	let expected_payment_id = match events.pop().unwrap() {
-		Event::PaymentPathFailed { payment_hash, rejected_by_dest, path, retry, payment_id, network_update,
+		Event::PaymentPathFailed { payment_hash, rejected_by_dest, path, retry, payment_id, network_update, short_channel_id,
 			#[cfg(test)]
 			error_code,
 			#[cfg(test)]
@@ -1502,6 +1502,9 @@ pub fn expect_payment_failed_conditions<'a, 'b, 'c, 'd, 'e>(
 			assert!(retry.is_some(), "expected retry.is_some()");
 			assert_eq!(retry.as_ref().unwrap().final_value_msat, path.last().unwrap().fee_msat, "Retry amount should match last hop in path");
 			assert_eq!(retry.as_ref().unwrap().payment_params.payee_pubkey, path.last().unwrap().pubkey, "Retry payee node_id should match last hop in path");
+			if let Some(scid) = short_channel_id {
+				assert!(retry.as_ref().unwrap().payment_params.previously_failed_channels.contains(&scid));
+			}
 
 			#[cfg(test)]
 			{
diff --git a/lightning/src/routing/router.rs b/lightning/src/routing/router.rs
index 27a21a189..a97e1b6ff 100644
--- a/lightning/src/routing/router.rs
+++ b/lightning/src/routing/router.rs
@@ -240,6 +240,11 @@ pub struct PaymentParameters {
 	///
 	/// Default value: 1
 	pub max_channel_saturation_power_of_half: u8,
+
+	/// A list of SCIDs which this payment was previously attempted over and which caused the
+	/// payment to fail. Future attempts for the same payment shouldn't be relayed through any of
+	/// these SCIDs.
+	pub previously_failed_channels: Vec<u64>,
 }
 
 impl_writeable_tlv_based!(PaymentParameters, {
@@ -250,6 +255,7 @@ impl_writeable_tlv_based!(PaymentParameters, {
 	(4, route_hints, vec_type),
 	(5, max_channel_saturation_power_of_half, (default_value, 1)),
 	(6, expiry_time, option),
+	(7, previously_failed_channels, vec_type),
 });
 
 impl PaymentParameters {
@@ -263,6 +269,7 @@ impl PaymentParameters {
 			max_total_cltv_expiry_delta: DEFAULT_MAX_TOTAL_CLTV_EXPIRY_DELTA,
 			max_path_count: DEFAULT_MAX_PATH_COUNT,
 			max_channel_saturation_power_of_half: 1,
+			previously_failed_channels: Vec::new(),
 		}
 	}
 
@@ -1002,7 +1009,7 @@ where L::Target: Logger {
 					let contributes_sufficient_value = available_value_contribution_msat >= minimal_value_contribution_msat;
 					// Do not consider candidate hops that would exceed the maximum path length.
 					let path_length_to_node = $next_hops_path_length + 1;
-					let doesnt_exceed_max_path_length = path_length_to_node <= MAX_PATH_LENGTH_ESTIMATE;
+					let exceeds_max_path_length = path_length_to_node > MAX_PATH_LENGTH_ESTIMATE;
 
 					// Do not consider candidates that exceed the maximum total cltv expiry limit.
 					// In order to already account for some of the privacy enhancing random CLTV
@@ -1013,7 +1020,7 @@ where L::Target: Logger {
 						.unwrap_or(payment_params.max_total_cltv_expiry_delta - final_cltv_expiry_delta);
 					let hop_total_cltv_delta = ($next_hops_cltv_delta as u32)
 						.saturating_add($candidate.cltv_expiry_delta());
-					let doesnt_exceed_cltv_delta_limit = hop_total_cltv_delta <= max_total_cltv_expiry_delta;
+					let exceeds_cltv_delta_limit = hop_total_cltv_delta > max_total_cltv_expiry_delta;
 
 					let value_contribution_msat = cmp::min(available_value_contribution_msat, $next_hops_value_contribution);
 					// Includes paying fees for the use of the following channels.
@@ -1033,15 +1040,19 @@ where L::Target: Logger {
 						 (amount_to_transfer_over_msat < $next_hops_path_htlc_minimum_msat &&
 						  recommended_value_msat > $next_hops_path_htlc_minimum_msat));
 
+					let payment_failed_on_this_channel =
+						payment_params.previously_failed_channels.contains(&short_channel_id);
+
 					// If HTLC minimum is larger than the amount we're going to transfer, we shouldn't
 					// bother considering this channel. If retrying with recommended_value_msat may
 					// allow us to hit the HTLC minimum limit, set htlc_minimum_limit so that we go
 					// around again with a higher amount.
-					if contributes_sufficient_value && doesnt_exceed_max_path_length &&
-						doesnt_exceed_cltv_delta_limit && may_overpay_to_meet_path_minimum_msat {
+					if !contributes_sufficient_value || exceeds_max_path_length ||
+						exceeds_cltv_delta_limit || payment_failed_on_this_channel {
+						// Path isn't useful, ignore it and move on.
+					} else if may_overpay_to_meet_path_minimum_msat {
 						hit_minimum_limit = true;
-					} else if contributes_sufficient_value && doesnt_exceed_max_path_length &&
-						doesnt_exceed_cltv_delta_limit && over_path_minimum_msat {
+					} else if over_path_minimum_msat {
 						// Note that low contribution here (limited by available_liquidity_msat)
 						// might violate htlc_minimum_msat on the hops which are next along the
 						// payment path (upstream to the payee). To avoid that, we recompute
@@ -1993,6 +2004,8 @@ mod tests {
 	use prelude::*;
 	use sync::{self, Arc};
 
+	use core::convert::TryInto;
+
 	fn get_channel_details(short_channel_id: Option<u64>, node_id: PublicKey,
 			features: InitFeatures, outbound_capacity_msat: u64) -> channelmanager::ChannelDetails {
 		channelmanager::ChannelDetails {
@@ -5573,6 +5586,35 @@ mod tests {
 		}
 	}
 
+	#[test]
+	fn avoids_recently_failed_paths() {
+		// Ensure that the router always avoids all of the `previously_failed_channels` channels by
+		// randomly inserting channels into it until we can't find a route anymore.
+		let (secp_ctx, network, _, _, logger) = build_graph();
+		let (_, our_id, _, nodes) = get_nodes(&secp_ctx);
+		let network_graph = network.read_only();
+
+		let scorer = test_utils::TestScorer::with_penalty(0);
+		let mut payment_params = PaymentParameters::from_node_id(nodes[6]).with_route_hints(last_hops(&nodes))
+			.with_max_path_count(1);
+		let keys_manager = test_utils::TestKeysInterface::new(&[0u8; 32], Network::Testnet);
+		let random_seed_bytes = keys_manager.get_secure_random_bytes();
+
+		// We should be able to find a route initially, and then after we fail a few random
+		// channels eventually we won't be able to any longer.
+		assert!(get_route(&our_id, &payment_params, &network_graph, None, 100, 0, Arc::clone(&logger), &scorer, &random_seed_bytes).is_ok());
+		loop {
+			if let Ok(route) = get_route(&our_id, &payment_params, &network_graph, None, 100, 0, Arc::clone(&logger), &scorer, &random_seed_bytes) {
+				for chan in route.paths[0].iter() {
+					assert!(!payment_params.previously_failed_channels.contains(&chan.short_channel_id));
+				}
+				let victim = (u64::from_ne_bytes(random_seed_bytes[0..8].try_into().unwrap()) as usize)
+					% route.paths[0].len();
+				payment_params.previously_failed_channels.push(route.paths[0][victim].short_channel_id);
+			} else { break; }
+		}
+	}
+
 	#[test]
 	fn limits_path_length() {
 		let (secp_ctx, network, _, _, logger) = build_line_graph();
-- 
2.39.5