Don't remove nodes if there's no channel_update for a temp failure 2023-04-dont-ban-cln
authorMatt Corallo <git@bluematt.me>
Sun, 23 Apr 2023 16:17:29 +0000 (16:17 +0000)
committerMatt Corallo <git@bluematt.me>
Mon, 24 Apr 2023 18:52:05 +0000 (18:52 +0000)
Previously, we were requiring any `UPDATE` onion errors to include
a `channel_update`, as the spec mandates[1]. If we see an onion
error which is missing one we treat it as a misbehaving node that
isn't behaving according to the spec and simply remove the node.

Sadly, it appears at least some versions of CLN are such nodes, and
opt to not include `channel_update` at all if they're returning a
`temporary_channel_failure`. This causes us to completely remove
CLN nodes from our graph after they fail to forward our HTLC.

While CLN is violating the spec here, there's not a lot of reason
to not allow it, so we go ahead and do so here, treating it simply
as any other failure by letting the scorer handle it.

[1] The spec says `Please note that the channel_update field is
mandatory in messages whose failure_code includes the UPDATE flag`
however doesn't repeat it in the requirements section so its not
crazy that someone missed it when implementing.

fuzz/src/router.rs
lightning/src/ln/onion_utils.rs
lightning/src/routing/gossip.rs

index 568dcdf0250057c68e1cf946b4b543f97c8933c0..fe6f1647f4d20c182558938b2ce475d6b7d1037b 100644 (file)
@@ -227,7 +227,7 @@ pub fn do_test<Out: test_logger::Output>(data: &[u8], out: Out) {
                        },
                        4 => {
                                let short_channel_id = slice_to_be64(get_slice!(8));
-                               net_graph.channel_failed(short_channel_id, false);
+                               net_graph.channel_failed_permanent(short_channel_id);
                        },
                        _ if node_pks.is_empty() => {},
                        _ => {
index b9f36bbd8dadf94592c9082a303ef378879a4819..54b6ecdeef86d2817f2f7b4a4ead3f3dd0ebdca0 100644 (file)
@@ -493,21 +493,28 @@ pub(super) fn process_onion_failure<T: secp256k1::Signing, L: Deref>(secp_ctx: &
                                                                        } else {
                                                                                log_trace!(logger, "Failure provided features a channel update without type prefix. Deprecated, but allowing for now.");
                                                                        }
-                                                                       if let Ok(chan_update) = msgs::ChannelUpdate::read(&mut Cursor::new(&update_slice)) {
+                                                                       let update_opt = msgs::ChannelUpdate::read(&mut Cursor::new(&update_slice));
+                                                                       if update_opt.is_ok() || update_slice.is_empty() {
                                                                                // if channel_update should NOT have caused the failure:
                                                                                // MAY treat the channel_update as invalid.
                                                                                let is_chan_update_invalid = match error_code & 0xff {
                                                                                        7 => false,
-                                                                                       11 => amt_to_forward > chan_update.contents.htlc_minimum_msat,
-                                                                                       12 => amt_to_forward
-                                                                                               .checked_mul(chan_update.contents.fee_proportional_millionths as u64)
+                                                                                       11 => update_opt.is_ok() &&
+                                                                                               amt_to_forward >
+                                                                                                       update_opt.as_ref().unwrap().contents.htlc_minimum_msat,
+                                                                                       12 => update_opt.is_ok() && amt_to_forward
+                                                                                               .checked_mul(update_opt.as_ref().unwrap()
+                                                                                                       .contents.fee_proportional_millionths as u64)
                                                                                                .map(|prop_fee| prop_fee / 1_000_000)
-                                                                                               .and_then(|prop_fee| prop_fee.checked_add(chan_update.contents.fee_base_msat as u64))
+                                                                                               .and_then(|prop_fee| prop_fee.checked_add(
+                                                                                                       update_opt.as_ref().unwrap().contents.fee_base_msat as u64))
                                                                                                .map(|fee_msats| route_hop.fee_msat >= fee_msats)
                                                                                                .unwrap_or(false),
-                                                                                       13 => route_hop.cltv_expiry_delta as u16 >= chan_update.contents.cltv_expiry_delta,
+                                                                                       13 => update_opt.is_ok() &&
+                                                                                               route_hop.cltv_expiry_delta as u16 >=
+                                                                                                       update_opt.as_ref().unwrap().contents.cltv_expiry_delta,
                                                                                        14 => false, // expiry_too_soon; always valid?
-                                                                                       20 => chan_update.contents.flags & 2 == 0,
+                                                                                       20 => update_opt.as_ref().unwrap().contents.flags & 2 == 0,
                                                                                        _ => false, // unknown error code; take channel_update as valid
                                                                                };
                                                                                if is_chan_update_invalid {
@@ -518,17 +525,31 @@ pub(super) fn process_onion_failure<T: secp256k1::Signing, L: Deref>(secp_ctx: &
                                                                                                is_permanent: true,
                                                                                        });
                                                                                } else {
-                                                                                       // Make sure the ChannelUpdate contains the expected
-                                                                                       // short channel id.
-                                                                                       if failing_route_hop.short_channel_id == chan_update.contents.short_channel_id {
-                                                                                               short_channel_id = Some(failing_route_hop.short_channel_id);
+                                                                                       if let Ok(chan_update) = update_opt {
+                                                                                               // Make sure the ChannelUpdate contains the expected
+                                                                                               // short channel id.
+                                                                                               if failing_route_hop.short_channel_id == chan_update.contents.short_channel_id {
+                                                                                                       short_channel_id = Some(failing_route_hop.short_channel_id);
+                                                                                               } else {
+                                                                                                       log_info!(logger, "Node provided a channel_update for which it was not authoritative, ignoring.");
+                                                                                               }
+                                                                                               network_update = Some(NetworkUpdate::ChannelUpdateMessage {
+                                                                                                       msg: chan_update,
+                                                                                               })
                                                                                        } else {
-                                                                                               log_info!(logger, "Node provided a channel_update for which it was not authoritative, ignoring.");
+                                                                                               network_update = Some(NetworkUpdate::ChannelFailure {
+                                                                                                       short_channel_id: route_hop.short_channel_id,
+                                                                                                       is_permanent: false,
+                                                                                               });
                                                                                        }
-                                                                                       network_update = Some(NetworkUpdate::ChannelUpdateMessage {
-                                                                                               msg: chan_update,
-                                                                                       })
                                                                                };
+                                                                       } else {
+                                                                               // If the channel_update had a non-zero length (i.e. was
+                                                                               // present) but we couldn't read it, treat it as a total
+                                                                               // node failure.
+                                                                               log_info!(logger,
+                                                                                       "Failed to read a channel_update of len {} in an onion",
+                                                                                       update_slice.len());
                                                                        }
                                                                }
                                                        }
index 59268b840cd4c411b7a1545905bf17a4b6a24be0..e5f5e63c93893b114137c888b941519e28e5488f 100644 (file)
@@ -212,7 +212,7 @@ pub enum NetworkUpdate {
                msg: ChannelUpdate,
        },
        /// An error indicating that a channel failed to route a payment, which should be applied via
-       /// [`NetworkGraph::channel_failed`].
+       /// [`NetworkGraph::channel_failed_permanent`] if permanent.
        ChannelFailure {
                /// The short channel id of the closed channel.
                short_channel_id: u64,
@@ -352,9 +352,10 @@ impl<L: Deref> NetworkGraph<L> where L::Target: Logger {
                                let _ = self.update_channel(msg);
                        },
                        NetworkUpdate::ChannelFailure { short_channel_id, is_permanent } => {
-                               let action = if is_permanent { "Removing" } else { "Disabling" };
-                               log_debug!(self.logger, "{} channel graph entry for {} due to a payment failure.", action, short_channel_id);
-                               self.channel_failed(short_channel_id, is_permanent);
+                               if is_permanent {
+                                       log_debug!(self.logger, "Removing channel graph entry for {} due to a payment failure.", short_channel_id);
+                                       self.channel_failed_permanent(short_channel_id);
+                               }
                        },
                        NetworkUpdate::NodeFailure { ref node_id, is_permanent } => {
                                if is_permanent {
@@ -1632,40 +1633,27 @@ impl<L: Deref> NetworkGraph<L> where L::Target: Logger {
                Ok(())
        }
 
-       /// Marks a channel in the graph as failed if a corresponding HTLC fail was sent.
-       /// If permanent, removes a channel from the local storage.
-       /// May cause the removal of nodes too, if this was their last channel.
-       /// If not permanent, makes channels unavailable for routing.
-       pub fn channel_failed(&self, short_channel_id: u64, is_permanent: bool) {
+       /// Marks a channel in the graph as failed permanently.
+       ///
+       /// The channel and any node for which this was their last channel are removed from the graph.
+       pub fn channel_failed_permanent(&self, short_channel_id: u64) {
                #[cfg(feature = "std")]
                let current_time_unix = Some(SystemTime::now().duration_since(UNIX_EPOCH).expect("Time must be > 1970").as_secs());
                #[cfg(not(feature = "std"))]
                let current_time_unix = None;
 
-               self.channel_failed_with_time(short_channel_id, is_permanent, current_time_unix)
+               self.channel_failed_permanent_with_time(short_channel_id, current_time_unix)
        }
 
-       /// Marks a channel in the graph as failed if a corresponding HTLC fail was sent.
-       /// If permanent, removes a channel from the local storage.
-       /// May cause the removal of nodes too, if this was their last channel.
-       /// If not permanent, makes channels unavailable for routing.
-       fn channel_failed_with_time(&self, short_channel_id: u64, is_permanent: bool, current_time_unix: Option<u64>) {
+       /// Marks a channel in the graph as failed permanently.
+       ///
+       /// The channel and any node for which this was their last channel are removed from the graph.
+       fn channel_failed_permanent_with_time(&self, short_channel_id: u64, current_time_unix: Option<u64>) {
                let mut channels = self.channels.write().unwrap();
-               if is_permanent {
-                       if let Some(chan) = channels.remove(&short_channel_id) {
-                               let mut nodes = self.nodes.write().unwrap();
-                               self.removed_channels.lock().unwrap().insert(short_channel_id, current_time_unix);
-                               Self::remove_channel_in_nodes(&mut nodes, &chan, short_channel_id);
-                       }
-               } else {
-                       if let Some(chan) = channels.get_mut(&short_channel_id) {
-                               if let Some(one_to_two) = chan.one_to_two.as_mut() {
-                                       one_to_two.enabled = false;
-                               }
-                               if let Some(two_to_one) = chan.two_to_one.as_mut() {
-                                       two_to_one.enabled = false;
-                               }
-                       }
+               if let Some(chan) = channels.remove(&short_channel_id) {
+                       let mut nodes = self.nodes.write().unwrap();
+                       self.removed_channels.lock().unwrap().insert(short_channel_id, current_time_unix);
+                       Self::remove_channel_in_nodes(&mut nodes, &chan, short_channel_id);
                }
        }
 
@@ -2450,7 +2438,7 @@ pub(crate) mod tests {
                        assert!(network_graph.read_only().channels().get(&short_channel_id).unwrap().one_to_two.is_some());
                }
 
-               // Non-permanent closing just disables a channel
+               // Non-permanent failure doesn't touch the channel at all
                {
                        match network_graph.read_only().channels().get(&short_channel_id) {
                                None => panic!(),
@@ -2467,7 +2455,7 @@ pub(crate) mod tests {
                        match network_graph.read_only().channels().get(&short_channel_id) {
                                None => panic!(),
                                Some(channel_info) => {
-                                       assert!(!channel_info.one_to_two.as_ref().unwrap().enabled);
+                                       assert!(channel_info.one_to_two.as_ref().unwrap().enabled);
                                }
                        };
                }
@@ -2601,7 +2589,7 @@ pub(crate) mod tests {
 
                        // Mark the channel as permanently failed. This will also remove the two nodes
                        // and all of the entries will be tracked as removed.
-                       network_graph.channel_failed_with_time(short_channel_id, true, Some(tracking_time));
+                       network_graph.channel_failed_permanent_with_time(short_channel_id, Some(tracking_time));
 
                        // Should not remove from tracking if insufficient time has passed
                        network_graph.remove_stale_channels_and_tracking_with_time(
@@ -2634,7 +2622,7 @@ pub(crate) mod tests {
 
                        // Mark the channel as permanently failed. This will also remove the two nodes
                        // and all of the entries will be tracked as removed.
-                       network_graph.channel_failed(short_channel_id, true);
+                       network_graph.channel_failed_permanent(short_channel_id);
 
                        // The first time we call the following, the channel will have a removal time assigned.
                        network_graph.remove_stale_channels_and_tracking_with_time(removal_time);