From 2166c8a2c46f9a8c50c3dd909fc47a9558a8355b Mon Sep 17 00:00:00 2001 From: Wilmer Paulino Date: Wed, 22 Mar 2023 11:46:05 -0700 Subject: [PATCH] Ignore lockorder violation on same callsite with different construction As long as the lock order on such locks is still valid, we should allow them regardless of whether they were constructed at the same location or not. Note that we can only really enforce this if we require one lock call per line, or if we have access to symbol columns (as we do on Linux and macOS). We opt for a smaller patch by relying on the latter. This was previously triggered by some recent test changes to `test_manager_serialize_deserialize_inconsistent_monitor`. When the test ends and a node is dropped causing us to persist each, we'd detect a possible lockorder violation deadlock across three different `Mutex` instances that are held at the same location when serializing our `per_peer_states` in `ChannelManager::write`. The presumed lockorder violation happens because the first `Mutex` held shares the same construction location with the third one, while the second `Mutex` has a different construction location. When we hold the second one, we consider the first as a dependency, and then consider the second as a dependency when holding the third, causing a circular dependency (since the third shares the same construction location as the first). This isn't considered a lockorder violation that could result in a deadlock as the comment suggests inline though, since we are under a dependent write lock which no one else can have access to. --- lightning/src/sync/debug_sync.rs | 75 ++++++++++++++++++++------------ 1 file changed, 46 insertions(+), 29 deletions(-) diff --git a/lightning/src/sync/debug_sync.rs b/lightning/src/sync/debug_sync.rs index 11824d5bc..11557be82 100644 --- a/lightning/src/sync/debug_sync.rs +++ b/lightning/src/sync/debug_sync.rs @@ -74,24 +74,31 @@ struct LockDep { _lockdep_trace: Backtrace, } +// Locates the frame preceding the earliest `debug_sync` frame in the call stack. This ensures we +// can properly detect a lock's construction and acquiral callsites, since the latter may contain +// multiple `debug_sync` frames. #[cfg(feature = "backtrace")] -fn get_construction_location(backtrace: &Backtrace) -> (String, Option) { - // Find the first frame that is after `debug_sync` (or that is in our tests) and use - // that as the mutex construction site. Note that the first few frames may be in - // the `backtrace` crate, so we have to ignore those. +fn locate_call_symbol(backtrace: &Backtrace) -> (String, Option) { + // Find the earliest `debug_sync` frame (or that is in our tests) and use the frame preceding it + // as the callsite. Note that the first few frames may be in the `backtrace` crate, so we have + // to ignore those. let sync_mutex_constr_regex = regex::Regex::new(r"lightning.*debug_sync").unwrap(); let mut found_debug_sync = false; - for frame in backtrace.frames() { - for symbol in frame.symbols() { - let symbol_name = symbol.name().unwrap().as_str().unwrap(); - if !sync_mutex_constr_regex.is_match(symbol_name) { - if found_debug_sync { - return (format!("{}:{}", symbol.filename().unwrap().display(), symbol.lineno().unwrap()), symbol.colno()); - } - } else { found_debug_sync = true; } + let mut symbol_after_latest_debug_sync = None; + for frame in backtrace.frames().iter() { + for symbol in frame.symbols().iter() { + if let Some(symbol_name) = symbol.name().map(|name| name.as_str()).flatten() { + if !sync_mutex_constr_regex.is_match(symbol_name) { + if found_debug_sync { + symbol_after_latest_debug_sync = Some(symbol); + found_debug_sync = false; + } + } else { found_debug_sync = true; } + } } } - panic!("Couldn't find mutex construction callsite"); + let symbol = symbol_after_latest_debug_sync.expect("Couldn't find lock call symbol"); + (format!("{}:{}", symbol.filename().unwrap().display(), symbol.lineno().unwrap()), symbol.colno()) } impl LockMetadata { @@ -108,13 +115,13 @@ impl LockMetadata { #[cfg(feature = "backtrace")] { let (lock_constr_location, lock_constr_colno) = - get_construction_location(&res._lock_construction_bt); + locate_call_symbol(&res._lock_construction_bt); LOCKS_INIT.call_once(|| { unsafe { LOCKS = Some(StdMutex::new(HashMap::new())); } }); let mut locks = unsafe { LOCKS.as_ref() }.unwrap().lock().unwrap(); match locks.entry(lock_constr_location) { hash_map::Entry::Occupied(e) => { assert_eq!(lock_constr_colno, - get_construction_location(&e.get()._lock_construction_bt).1, + locate_call_symbol(&e.get()._lock_construction_bt).1, "Because Windows doesn't support column number results in backtraces, we cannot construct two mutexes on the same line or we risk lockorder detection false positives."); return Arc::clone(e.get()) }, @@ -126,9 +133,8 @@ impl LockMetadata { fn pre_lock(this: &Arc, _double_lock_self_allowed: bool) { LOCKS_HELD.with(|held| { - // For each lock which is currently locked, check that no lock's locked-before - // set includes the lock we're about to lock, which would imply a lockorder - // inversion. + // For each lock that is currently held, check that no lock's `locked_before` set + // includes the lock we're about to hold, which would imply a lockorder inversion. for (locked_idx, _locked) in held.borrow().iter() { if *locked_idx == this.lock_idx { // Note that with `feature = "backtrace"` set, we may be looking at different @@ -138,23 +144,34 @@ impl LockMetadata { #[cfg(feature = "backtrace")] debug_assert!(_double_lock_self_allowed, "Tried to acquire a lock while it was held!\nLock constructed at {}", - get_construction_location(&this._lock_construction_bt).0); + locate_call_symbol(&this._lock_construction_bt).0); #[cfg(not(feature = "backtrace"))] panic!("Tried to acquire a lock while it was held!"); } } for (_locked_idx, locked) in held.borrow().iter() { for (locked_dep_idx, _locked_dep) in locked.locked_before.lock().unwrap().iter() { - if *locked_dep_idx == this.lock_idx && *locked_dep_idx != locked.lock_idx { - #[cfg(feature = "backtrace")] - panic!("Tried to violate existing lockorder.\nMutex that should be locked after the current lock was created at the following backtrace.\nNote that to get a backtrace for the lockorder violation, you should set RUST_BACKTRACE=1\nLock being taken constructed at: {} ({}):\n{:?}\nLock constructed at: {} ({})\n{:?}\n\nLock dep created at:\n{:?}\n\n", - get_construction_location(&this._lock_construction_bt).0, - this.lock_idx, this._lock_construction_bt, - get_construction_location(&locked._lock_construction_bt).0, - locked.lock_idx, locked._lock_construction_bt, - _locked_dep._lockdep_trace); - #[cfg(not(feature = "backtrace"))] - panic!("Tried to violate existing lockorder. Build with the backtrace feature for more info."); + let is_dep_this_lock = *locked_dep_idx == this.lock_idx; + let has_same_construction = *locked_dep_idx == locked.lock_idx; + if is_dep_this_lock && !has_same_construction { + #[allow(unused_mut, unused_assignments)] + let mut has_same_callsite = false; + #[cfg(feature = "backtrace")] { + has_same_callsite = _double_lock_self_allowed && + locate_call_symbol(&_locked_dep._lockdep_trace) == + locate_call_symbol(&Backtrace::new()); + } + if !has_same_callsite { + #[cfg(feature = "backtrace")] + panic!("Tried to violate existing lockorder.\nMutex that should be locked after the current lock was created at the following backtrace.\nNote that to get a backtrace for the lockorder violation, you should set RUST_BACKTRACE=1\nLock being taken constructed at: {} ({}):\n{:?}\nLock constructed at: {} ({})\n{:?}\n\nLock dep created at:\n{:?}\n\n", + locate_call_symbol(&this._lock_construction_bt).0, + this.lock_idx, this._lock_construction_bt, + locate_call_symbol(&locked._lock_construction_bt).0, + locked.lock_idx, locked._lock_construction_bt, + _locked_dep._lockdep_trace); + #[cfg(not(feature = "backtrace"))] + panic!("Tried to violate existing lockorder. Build with the backtrace feature for more info."); + } } } // Insert any already-held locks in our locked-before set. -- 2.39.5