]> git.bitcoin.ninja Git - rust-lightning/commitdiff
Ignore lockorder violation on same callsite with different construction
authorWilmer Paulino <wilmer@wilmerpaulino.com>
Wed, 22 Mar 2023 18:46:05 +0000 (11:46 -0700)
committerWilmer Paulino <wilmer@wilmerpaulino.com>
Wed, 29 Mar 2023 00:27:47 +0000 (17:27 -0700)
As long as the lock order on such locks is still valid, we should allow
them regardless of whether they were constructed at the same location or
not. Note that we can only really enforce this if we require one lock
call per line, or if we have access to symbol columns (as we do on Linux
and macOS). We opt for a smaller patch by relying on the latter.

This was previously triggered by some recent test changes to
`test_manager_serialize_deserialize_inconsistent_monitor`. When the
test ends and a node is dropped causing us to persist each, we'd detect
a possible lockorder violation deadlock across three different `Mutex`
instances that are held at the same location when serializing our
`per_peer_states` in `ChannelManager::write`.

The presumed lockorder violation happens because the first `Mutex` held
shares the same construction location with the third one, while the
second `Mutex` has a different construction location. When we hold the
second one, we consider the first as a dependency, and then consider the
second as a dependency when holding the third, causing a circular
dependency (since the third shares the same construction location as the
first).

This isn't considered a lockorder violation that could result in a
deadlock as the comment suggests inline though, since we are under a
dependent write lock which no one else can have access to.

lightning/src/sync/debug_sync.rs

index 11824d5bc73182ebf3d77fc53dd7a53b2be59045..11557be82afec418be0abce3b38117ce04582fd0 100644 (file)
@@ -74,24 +74,31 @@ struct LockDep {
        _lockdep_trace: Backtrace,
 }
 
+// Locates the frame preceding the earliest `debug_sync` frame in the call stack. This ensures we
+// can properly detect a lock's construction and acquiral callsites, since the latter may contain
+// multiple `debug_sync` frames.
 #[cfg(feature = "backtrace")]
-fn get_construction_location(backtrace: &Backtrace) -> (String, Option<u32>) {
-       // Find the first frame that is after `debug_sync` (or that is in our tests) and use
-       // that as the mutex construction site. Note that the first few frames may be in
-       // the `backtrace` crate, so we have to ignore those.
+fn locate_call_symbol(backtrace: &Backtrace) -> (String, Option<u32>) {
+       // Find the earliest `debug_sync` frame (or that is in our tests) and use the frame preceding it
+       // as the callsite. Note that the first few frames may be in the `backtrace` crate, so we have
+       // to ignore those.
        let sync_mutex_constr_regex = regex::Regex::new(r"lightning.*debug_sync").unwrap();
        let mut found_debug_sync = false;
-       for frame in backtrace.frames() {
-               for symbol in frame.symbols() {
-                       let symbol_name = symbol.name().unwrap().as_str().unwrap();
-                       if !sync_mutex_constr_regex.is_match(symbol_name) {
-                               if found_debug_sync {
-                                       return (format!("{}:{}", symbol.filename().unwrap().display(), symbol.lineno().unwrap()), symbol.colno());
-                               }
-                       } else { found_debug_sync = true; }
+       let mut symbol_after_latest_debug_sync = None;
+       for frame in backtrace.frames().iter() {
+               for symbol in frame.symbols().iter() {
+                       if let Some(symbol_name) = symbol.name().map(|name| name.as_str()).flatten() {
+                               if !sync_mutex_constr_regex.is_match(symbol_name) {
+                                       if found_debug_sync {
+                                               symbol_after_latest_debug_sync = Some(symbol);
+                                               found_debug_sync = false;
+                                       }
+                               } else { found_debug_sync = true; }
+                       }
                }
        }
-       panic!("Couldn't find mutex construction callsite");
+       let symbol = symbol_after_latest_debug_sync.expect("Couldn't find lock call symbol");
+       (format!("{}:{}", symbol.filename().unwrap().display(), symbol.lineno().unwrap()), symbol.colno())
 }
 
 impl LockMetadata {
@@ -108,13 +115,13 @@ impl LockMetadata {
                #[cfg(feature = "backtrace")]
                {
                        let (lock_constr_location, lock_constr_colno) =
-                               get_construction_location(&res._lock_construction_bt);
+                               locate_call_symbol(&res._lock_construction_bt);
                        LOCKS_INIT.call_once(|| { unsafe { LOCKS = Some(StdMutex::new(HashMap::new())); } });
                        let mut locks = unsafe { LOCKS.as_ref() }.unwrap().lock().unwrap();
                        match locks.entry(lock_constr_location) {
                                hash_map::Entry::Occupied(e) => {
                                        assert_eq!(lock_constr_colno,
-                                               get_construction_location(&e.get()._lock_construction_bt).1,
+                                               locate_call_symbol(&e.get()._lock_construction_bt).1,
                                                "Because Windows doesn't support column number results in backtraces, we cannot construct two mutexes on the same line or we risk lockorder detection false positives.");
                                        return Arc::clone(e.get())
                                },
@@ -126,9 +133,8 @@ impl LockMetadata {
 
        fn pre_lock(this: &Arc<LockMetadata>, _double_lock_self_allowed: bool) {
                LOCKS_HELD.with(|held| {
-                       // For each lock which is currently locked, check that no lock's locked-before
-                       // set includes the lock we're about to lock, which would imply a lockorder
-                       // inversion.
+                       // For each lock that is currently held, check that no lock's `locked_before` set
+                       // includes the lock we're about to hold, which would imply a lockorder inversion.
                        for (locked_idx, _locked) in held.borrow().iter() {
                                if *locked_idx == this.lock_idx {
                                        // Note that with `feature = "backtrace"` set, we may be looking at different
@@ -138,23 +144,34 @@ impl LockMetadata {
                                        #[cfg(feature = "backtrace")]
                                        debug_assert!(_double_lock_self_allowed,
                                                "Tried to acquire a lock while it was held!\nLock constructed at {}",
-                                               get_construction_location(&this._lock_construction_bt).0);
+                                               locate_call_symbol(&this._lock_construction_bt).0);
                                        #[cfg(not(feature = "backtrace"))]
                                        panic!("Tried to acquire a lock while it was held!");
                                }
                        }
                        for (_locked_idx, locked) in held.borrow().iter() {
                                for (locked_dep_idx, _locked_dep) in locked.locked_before.lock().unwrap().iter() {
-                                       if *locked_dep_idx == this.lock_idx && *locked_dep_idx != locked.lock_idx {
-                                               #[cfg(feature = "backtrace")]
-                                               panic!("Tried to violate existing lockorder.\nMutex that should be locked after the current lock was created at the following backtrace.\nNote that to get a backtrace for the lockorder violation, you should set RUST_BACKTRACE=1\nLock being taken constructed at: {} ({}):\n{:?}\nLock constructed at: {} ({})\n{:?}\n\nLock dep created at:\n{:?}\n\n",
-                                                       get_construction_location(&this._lock_construction_bt).0,
-                                                       this.lock_idx, this._lock_construction_bt,
-                                                       get_construction_location(&locked._lock_construction_bt).0,
-                                                       locked.lock_idx, locked._lock_construction_bt,
-                                                       _locked_dep._lockdep_trace);
-                                               #[cfg(not(feature = "backtrace"))]
-                                               panic!("Tried to violate existing lockorder. Build with the backtrace feature for more info.");
+                                       let is_dep_this_lock = *locked_dep_idx == this.lock_idx;
+                                       let has_same_construction = *locked_dep_idx == locked.lock_idx;
+                                       if is_dep_this_lock && !has_same_construction {
+                                               #[allow(unused_mut, unused_assignments)]
+                                               let mut has_same_callsite = false;
+                                               #[cfg(feature = "backtrace")] {
+                                                       has_same_callsite = _double_lock_self_allowed &&
+                                                               locate_call_symbol(&_locked_dep._lockdep_trace) ==
+                                                                       locate_call_symbol(&Backtrace::new());
+                                               }
+                                               if !has_same_callsite {
+                                                       #[cfg(feature = "backtrace")]
+                                                       panic!("Tried to violate existing lockorder.\nMutex that should be locked after the current lock was created at the following backtrace.\nNote that to get a backtrace for the lockorder violation, you should set RUST_BACKTRACE=1\nLock being taken constructed at: {} ({}):\n{:?}\nLock constructed at: {} ({})\n{:?}\n\nLock dep created at:\n{:?}\n\n",
+                                                               locate_call_symbol(&this._lock_construction_bt).0,
+                                                               this.lock_idx, this._lock_construction_bt,
+                                                               locate_call_symbol(&locked._lock_construction_bt).0,
+                                                               locked.lock_idx, locked._lock_construction_bt,
+                                                               _locked_dep._lockdep_trace);
+                                                       #[cfg(not(feature = "backtrace"))]
+                                                       panic!("Tried to violate existing lockorder. Build with the backtrace feature for more info.");
+                                               }
                                        }
                                }
                                // Insert any already-held locks in our locked-before set.