Merge pull request #1425 from valentinewallace/2021-04-wumbo
[rust-lightning] / lightning / src / chain / chainmonitor.rs
index 9e92264b0425dffe07049d9acaf4c80e75df20bc..aae260e735bdbae5c0a538af8024e7e2ef2a78cb 100644 (file)
 //! events. The remote server would make use of [`ChainMonitor`] for block processing and for
 //! servicing [`ChannelMonitor`] updates from the client.
 
-use bitcoin::blockdata::block::{Block, BlockHeader};
+use bitcoin::blockdata::block::BlockHeader;
 use bitcoin::hash_types::Txid;
 
 use chain;
 use chain::{ChannelMonitorUpdateErr, Filter, WatchedOutput};
 use chain::chaininterface::{BroadcasterInterface, FeeEstimator};
-use chain::channelmonitor::{ChannelMonitor, ChannelMonitorUpdate, Balance, MonitorEvent, TransactionOutputs};
+use chain::channelmonitor::{ChannelMonitor, ChannelMonitorUpdate, Balance, MonitorEvent, TransactionOutputs, LATENCY_GRACE_PERIOD_BLOCKS};
 use chain::transaction::{OutPoint, TransactionData};
 use chain::keysinterface::Sign;
+use util::atomic_counter::AtomicCounter;
 use util::logger::Logger;
+use util::errors::APIError;
 use util::events;
 use util::events::EventHandler;
 use ln::channelmanager::ChannelDetails;
 
 use prelude::*;
-use sync::{RwLock, RwLockReadGuard, Mutex};
+use sync::{RwLock, RwLockReadGuard, Mutex, MutexGuard};
 use core::ops::Deref;
+use core::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
+
+#[derive(Clone, Copy, Hash, PartialEq, Eq)]
+/// A specific update's ID stored in a `MonitorUpdateId`, separated out to make the contents
+/// entirely opaque.
+enum UpdateOrigin {
+       /// An update that was generated by the `ChannelManager` (via our `chain::Watch`
+       /// implementation). This corresponds to an actual [`ChannelMonitorUpdate::update_id`] field
+       /// and [`ChannelMonitor::get_latest_update_id`].
+       OffChain(u64),
+       /// An update that was generated during blockchain processing. The ID here is specific to the
+       /// generating [`ChainMonitor`] and does *not* correspond to any on-disk IDs.
+       ChainSync(u64),
+}
+
+/// An opaque identifier describing a specific [`Persist`] method call.
+#[derive(Clone, Copy, Hash, PartialEq, Eq)]
+pub struct MonitorUpdateId {
+       contents: UpdateOrigin,
+}
+
+impl MonitorUpdateId {
+       pub(crate) fn from_monitor_update(update: &ChannelMonitorUpdate) -> Self {
+               Self { contents: UpdateOrigin::OffChain(update.update_id) }
+       }
+       pub(crate) fn from_new_monitor<ChannelSigner: Sign>(monitor: &ChannelMonitor<ChannelSigner>) -> Self {
+               Self { contents: UpdateOrigin::OffChain(monitor.get_latest_update_id()) }
+       }
+}
 
 /// `Persist` defines behavior for persisting channel monitors: this could mean
 /// writing once to disk, and/or uploading to one or more backup services.
 ///
-/// Note that for every new monitor, you **must** persist the new `ChannelMonitor`
-/// to disk/backups. And, on every update, you **must** persist either the
-/// `ChannelMonitorUpdate` or the updated monitor itself. Otherwise, there is risk
-/// of situations such as revoking a transaction, then crashing before this
-/// revocation can be persisted, then unintentionally broadcasting a revoked
-/// transaction and losing money. This is a risk because previous channel states
-/// are toxic, so it's important that whatever channel state is persisted is
-/// kept up-to-date.
+/// Each method can return three possible values:
+///  * If persistence (including any relevant `fsync()` calls) happens immediately, the
+///    implementation should return `Ok(())`, indicating normal channel operation should continue.
+///  * If persistence happens asynchronously, implementations should first ensure the
+///    [`ChannelMonitor`] or [`ChannelMonitorUpdate`] are written durably to disk, and then return
+///    `Err(ChannelMonitorUpdateErr::TemporaryFailure)` while the update continues in the
+///    background. Once the update completes, [`ChainMonitor::channel_monitor_updated`] should be
+///    called with the corresponding [`MonitorUpdateId`].
+///
+///    Note that unlike the direct [`chain::Watch`] interface,
+///    [`ChainMonitor::channel_monitor_updated`] must be called once for *each* update which occurs.
+///
+///  * If persistence fails for some reason, implementations should return
+///    `Err(ChannelMonitorUpdateErr::PermanentFailure)`, in which case the channel will likely be
+///    closed without broadcasting the latest state. See
+///    [`ChannelMonitorUpdateErr::PermanentFailure`] for more details.
 pub trait Persist<ChannelSigner: Sign> {
-       /// Persist a new channel's data. The data can be stored any way you want, but
-       /// the identifier provided by Rust-Lightning is the channel's outpoint (and
-       /// it is up to you to maintain a correct mapping between the outpoint and the
-       /// stored channel data). Note that you **must** persist every new monitor to
-       /// disk. See the `Persist` trait documentation for more details.
+       /// Persist a new channel's data in response to a [`chain::Watch::watch_channel`] call. This is
+       /// called by [`ChannelManager`] for new channels, or may be called directly, e.g. on startup.
+       ///
+       /// The data can be stored any way you want, but the identifier provided by LDK is the
+       /// channel's outpoint (and it is up to you to maintain a correct mapping between the outpoint
+       /// and the stored channel data). Note that you **must** persist every new monitor to disk.
+       ///
+       /// The `update_id` is used to identify this call to [`ChainMonitor::channel_monitor_updated`],
+       /// if you return [`ChannelMonitorUpdateErr::TemporaryFailure`].
        ///
        /// See [`Writeable::write`] on [`ChannelMonitor`] for writing out a `ChannelMonitor`
        /// and [`ChannelMonitorUpdateErr`] for requirements when returning errors.
        ///
+       /// [`ChannelManager`]: crate::ln::channelmanager::ChannelManager
        /// [`Writeable::write`]: crate::util::ser::Writeable::write
-       fn persist_new_channel(&self, id: OutPoint, data: &ChannelMonitor<ChannelSigner>) -> Result<(), ChannelMonitorUpdateErr>;
+       fn persist_new_channel(&self, channel_id: OutPoint, data: &ChannelMonitor<ChannelSigner>, update_id: MonitorUpdateId) -> Result<(), ChannelMonitorUpdateErr>;
 
-       /// Update one channel's data. The provided `ChannelMonitor` has already
-       /// applied the given update.
+       /// Update one channel's data. The provided [`ChannelMonitor`] has already applied the given
+       /// update.
        ///
-       /// Note that on every update, you **must** persist either the
-       /// `ChannelMonitorUpdate` or the updated monitor itself to disk/backups. See
-       /// the `Persist` trait documentation for more details.
+       /// Note that on every update, you **must** persist either the [`ChannelMonitorUpdate`] or the
+       /// updated monitor itself to disk/backups. See the [`Persist`] trait documentation for more
+       /// details.
+       ///
+       /// During blockchain synchronization operations, this may be called with no
+       /// [`ChannelMonitorUpdate`], in which case the full [`ChannelMonitor`] needs to be persisted.
+       /// Note that after the full [`ChannelMonitor`] is persisted, any previous
+       /// [`ChannelMonitorUpdate`]s which were persisted should be discarded - they can no longer be
+       /// applied to the persisted [`ChannelMonitor`] as they were already applied.
        ///
        /// If an implementer chooses to persist the updates only, they need to make
        /// sure that all the updates are applied to the `ChannelMonitors` *before*
@@ -84,16 +134,61 @@ pub trait Persist<ChannelSigner: Sign> {
        /// them in batches. The size of each monitor grows `O(number of state updates)`
        /// whereas updates are small and `O(1)`.
        ///
+       /// The `update_id` is used to identify this call to [`ChainMonitor::channel_monitor_updated`],
+       /// if you return [`ChannelMonitorUpdateErr::TemporaryFailure`].
+       ///
        /// See [`Writeable::write`] on [`ChannelMonitor`] for writing out a `ChannelMonitor`,
        /// [`Writeable::write`] on [`ChannelMonitorUpdate`] for writing out an update, and
        /// [`ChannelMonitorUpdateErr`] for requirements when returning errors.
        ///
        /// [`Writeable::write`]: crate::util::ser::Writeable::write
-       fn update_persisted_channel(&self, id: OutPoint, update: &ChannelMonitorUpdate, data: &ChannelMonitor<ChannelSigner>) -> Result<(), ChannelMonitorUpdateErr>;
+       fn update_persisted_channel(&self, channel_id: OutPoint, update: &Option<ChannelMonitorUpdate>, data: &ChannelMonitor<ChannelSigner>, update_id: MonitorUpdateId) -> Result<(), ChannelMonitorUpdateErr>;
 }
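
As a rough illustration of the asynchronous flow described above (not part of this patch), a `Persist` implementation might write durably to local disk up front, hand a remote backup off to a background worker, and return `TemporaryFailure` until that backup completes. The storage helpers below (`LocalStore`, `RemoteBackupQueue` and their methods) are hypothetical; the LDK types are those imported at the top of this file, and `encode()` is `util::ser::Writeable::encode()`.

    struct AsyncBackupPersister {
        local_store: LocalStore,          // hypothetical synchronous, fsync'd on-disk store
        remote_backup: RemoteBackupQueue, // hypothetical background/remote backup queue
    }

    impl<ChannelSigner: Sign> Persist<ChannelSigner> for AsyncBackupPersister {
        fn persist_new_channel(&self, channel_id: OutPoint, data: &ChannelMonitor<ChannelSigner>,
            update_id: MonitorUpdateId) -> Result<(), ChannelMonitorUpdateErr>
        {
            // Write the full monitor durably to local disk before returning, as required above.
            self.local_store.write(channel_id, data.encode());
            // Kick off the remote copy in the background; its completion is reported later via
            // ChainMonitor::channel_monitor_updated(channel_id, update_id).
            self.remote_backup.enqueue(channel_id, data.encode(), update_id);
            Err(ChannelMonitorUpdateErr::TemporaryFailure)
        }

        fn update_persisted_channel(&self, channel_id: OutPoint, update: &Option<ChannelMonitorUpdate>,
            data: &ChannelMonitor<ChannelSigner>, update_id: MonitorUpdateId)
            -> Result<(), ChannelMonitorUpdateErr>
        {
            match update {
                // Off-chain update: persisting just the small update locally is sufficient.
                Some(upd) => self.local_store.write(channel_id, upd.encode()),
                // Chain-sync call: no update is provided, so the full monitor must be written.
                None => self.local_store.write(channel_id, data.encode()),
            }
            self.remote_backup.enqueue(channel_id, data.encode(), update_id);
            Err(ChannelMonitorUpdateErr::TemporaryFailure)
        }
    }

Returning `Ok(())` instead would signal that persistence (including any relevant `fsync()`) completed before the call returned, in which case no later `channel_monitor_updated` call is needed.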
 
 struct MonitorHolder<ChannelSigner: Sign> {
        monitor: ChannelMonitor<ChannelSigner>,
+       /// The full set of pending monitor updates for this Channel.
+       ///
+       /// Note that this lock must be held during updates to prevent a race where we call
+       /// update_persisted_channel, the user returns a TemporaryFailure, and then calls
+       /// channel_monitor_updated immediately, racing our insertion of the pending update into the
+       /// contained Vec.
+       ///
+       /// Beyond the synchronization of updates themselves, we cannot handle user events until after
+       /// any chain updates have been stored on disk. Thus, we scan this list when returning updates
+       /// to the ChannelManager, refusing to return any updates for a ChannelMonitor which is still
+       /// being persisted fully to disk after a chain update.
+       ///
+       /// This avoids the possibility of handling, e.g. an on-chain claim, generating a claim monitor
+       /// event, resulting in the relevant ChannelManager generating a PaymentSent event and dropping
+       /// the pending payment entry, and then reloading before the monitor is persisted, resulting in
+       /// the ChannelManager re-adding the same payment entry, before the same block is replayed,
+       /// resulting in a duplicate PaymentSent event.
+       pending_monitor_updates: Mutex<Vec<MonitorUpdateId>>,
+       /// When the user returns a PermanentFailure error from an update_persisted_channel call during
+       /// block processing, we inform the ChannelManager that the channel should be closed
+       /// asynchronously. In order to ensure no further changes happen before the ChannelManager has
+       /// processed the closure event, we set this to true and return PermanentFailure for any other
+       /// chain::Watch calls.
+       channel_perm_failed: AtomicBool,
+       /// The last block height at which no [`UpdateOrigin::ChainSync`] monitor updates were present
+       /// in `pending_monitor_updates`.
+       /// If it's been more than [`LATENCY_GRACE_PERIOD_BLOCKS`] since we started waiting on a chain
+       /// sync event, we let monitor events return to `ChannelManager` because we cannot hold them up
+       /// forever or we'll end up with HTLC preimages waiting to feed back into an upstream channel
+       /// forever, risking funds loss.
+       last_chain_persist_height: AtomicUsize,
+}
+
+impl<ChannelSigner: Sign> MonitorHolder<ChannelSigner> {
+       fn has_pending_offchain_updates(&self, pending_monitor_updates_lock: &MutexGuard<Vec<MonitorUpdateId>>) -> bool {
+               pending_monitor_updates_lock.iter().any(|update_id|
+                       if let UpdateOrigin::OffChain(_) = update_id.contents { true } else { false })
+       }
+       fn has_pending_chainsync_updates(&self, pending_monitor_updates_lock: &MutexGuard<Vec<MonitorUpdateId>>) -> bool {
+               pending_monitor_updates_lock.iter().any(|update_id|
+                       if let UpdateOrigin::ChainSync(_) = update_id.contents { true } else { false })
+       }
 }
 
 /// A read-only reference to a current ChannelMonitor.
@@ -129,12 +224,20 @@ pub struct ChainMonitor<ChannelSigner: Sign, C: Deref, T: Deref, F: Deref, L: De
         P::Target: Persist<ChannelSigner>,
 {
        monitors: RwLock<HashMap<OutPoint, MonitorHolder<ChannelSigner>>>,
+       /// When we generate a [`MonitorUpdateId`] for a chain-event monitor persistence, we need a
+       /// unique ID, which we calculate by simply getting the next value from this counter. Note
+       /// that these IDs are never persisted, so it's ok that they reset on restart.
+       sync_persistence_id: AtomicCounter,
        chain_source: Option<C>,
        broadcaster: T,
        logger: L,
        fee_estimator: F,
        persister: P,
+       /// "User-provided" (i.e. persistence-completion/-failed) [`MonitorEvent`]s. These came directly
+       /// from the user and not from a [`ChannelMonitor`].
        pending_monitor_events: Mutex<Vec<MonitorEvent>>,
+       /// The best block height seen, used as a proxy for the passage of time.
+       highest_chain_height: AtomicUsize,
 }
 
 impl<ChannelSigner: Sign, C: Deref, T: Deref, F: Deref, L: Deref, P: Deref> ChainMonitor<ChannelSigner, C, T, F, L, P>
@@ -153,31 +256,75 @@ where C::Target: chain::Filter,
        /// calls must not exclude any transactions matching the new outputs nor any in-block
        /// descendants of such transactions. It is not necessary to re-fetch the block to obtain
        /// updated `txdata`.
-       fn process_chain_data<FN>(&self, header: &BlockHeader, txdata: &TransactionData, process: FN)
+       ///
+       /// Calls which represent a new blockchain tip height should set `best_height`.
+       fn process_chain_data<FN>(&self, header: &BlockHeader, best_height: Option<u32>, txdata: &TransactionData, process: FN)
        where
                FN: Fn(&ChannelMonitor<ChannelSigner>, &TransactionData) -> Vec<TransactionOutputs>
        {
                let mut dependent_txdata = Vec::new();
-               let monitor_states = self.monitors.read().unwrap();
-               for monitor_state in monitor_states.values() {
-                       let mut txn_outputs = process(&monitor_state.monitor, txdata);
-
-                       // Register any new outputs with the chain source for filtering, storing any dependent
-                       // transactions from within the block that previously had not been included in txdata.
-                       if let Some(ref chain_source) = self.chain_source {
-                               let block_hash = header.block_hash();
-                               for (txid, mut outputs) in txn_outputs.drain(..) {
-                                       for (idx, output) in outputs.drain(..) {
-                                               // Register any new outputs with the chain source for filtering and recurse
-                                               // if it indicates that there are dependent transactions within the block
-                                               // that had not been previously included in txdata.
-                                               let output = WatchedOutput {
-                                                       block_hash: Some(block_hash),
-                                                       outpoint: OutPoint { txid, index: idx as u16 },
-                                                       script_pubkey: output.script_pubkey,
-                                               };
-                                               if let Some(tx) = chain_source.register_output(output) {
-                                                       dependent_txdata.push(tx);
+               {
+                       let monitor_states = self.monitors.write().unwrap();
+                       if let Some(height) = best_height {
+                               // If the best block height is being updated, update highest_chain_height under the
+                               // monitors write lock.
+                               let old_height = self.highest_chain_height.load(Ordering::Acquire);
+                               let new_height = height as usize;
+                               if new_height > old_height {
+                                       self.highest_chain_height.store(new_height, Ordering::Release);
+                               }
+                       }
+
+                       for (funding_outpoint, monitor_state) in monitor_states.iter() {
+                               let monitor = &monitor_state.monitor;
+                               let mut txn_outputs;
+                               {
+                                       txn_outputs = process(monitor, txdata);
+                                       let update_id = MonitorUpdateId {
+                                               contents: UpdateOrigin::ChainSync(self.sync_persistence_id.get_increment()),
+                                       };
+                                       let mut pending_monitor_updates = monitor_state.pending_monitor_updates.lock().unwrap();
+                                       if let Some(height) = best_height {
+                                               if !monitor_state.has_pending_chainsync_updates(&pending_monitor_updates) {
+                                                       // If there are no ChainSync persists awaiting completion, go ahead and
+                                                       // set last_chain_persist_height here - we wouldn't want the first
+                                                       // TemporaryFailure to always immediately be considered "overly delayed".
+                                                       monitor_state.last_chain_persist_height.store(height as usize, Ordering::Release);
+                                               }
+                                       }
+
+                                       log_trace!(self.logger, "Syncing Channel Monitor for channel {}", log_funding_info!(monitor));
+                                       match self.persister.update_persisted_channel(*funding_outpoint, &None, monitor, update_id) {
+                                               Ok(()) =>
+                                                       log_trace!(self.logger, "Finished syncing Channel Monitor for channel {}", log_funding_info!(monitor)),
+                                               Err(ChannelMonitorUpdateErr::PermanentFailure) => {
+                                                       monitor_state.channel_perm_failed.store(true, Ordering::Release);
+                                                       self.pending_monitor_events.lock().unwrap().push(MonitorEvent::UpdateFailed(*funding_outpoint));
+                                               },
+                                               Err(ChannelMonitorUpdateErr::TemporaryFailure) => {
+                                                       log_debug!(self.logger, "Channel Monitor sync for channel {} in progress, holding events until completion!", log_funding_info!(monitor));
+                                                       pending_monitor_updates.push(update_id);
+                                               },
+                                       }
+                               }
+
+                               // Register any new outputs with the chain source for filtering, storing any dependent
+                               // transactions from within the block that previously had not been included in txdata.
+                               if let Some(ref chain_source) = self.chain_source {
+                                       let block_hash = header.block_hash();
+                                       for (txid, mut outputs) in txn_outputs.drain(..) {
+                                               for (idx, output) in outputs.drain(..) {
+                                                       // Register any new outputs with the chain source for filtering and recurse
+                                                       // if it indicates that there are dependent transactions within the block
+                                                       // that had not been previously included in txdata.
+                                                       let output = WatchedOutput {
+                                                               block_hash: Some(block_hash),
+                                                               outpoint: OutPoint { txid, index: idx as u16 },
+                                                               script_pubkey: output.script_pubkey,
+                                                       };
+                                                       if let Some(tx) = chain_source.register_output(output) {
+                                                               dependent_txdata.push(tx);
+                                                       }
                                                }
                                        }
                                }
@@ -189,7 +336,7 @@ where C::Target: chain::Filter,
                        dependent_txdata.sort_unstable_by_key(|(index, _tx)| *index);
                        dependent_txdata.dedup_by_key(|(index, _tx)| *index);
                        let txdata: Vec<_> = dependent_txdata.iter().map(|(index, tx)| (*index, tx)).collect();
-                       self.process_chain_data(header, &txdata, process);
+                       self.process_chain_data(header, None, &txdata, process); // We skip the best height the second go-around
                }
        }
 
@@ -203,12 +350,14 @@ where C::Target: chain::Filter,
        pub fn new(chain_source: Option<C>, broadcaster: T, logger: L, feeest: F, persister: P) -> Self {
                Self {
                        monitors: RwLock::new(HashMap::new()),
+                       sync_persistence_id: AtomicCounter::new(),
                        chain_source,
                        broadcaster,
                        logger,
                        fee_estimator: feeest,
                        persister,
                        pending_monitor_events: Mutex::new(Vec::new()),
+                       highest_chain_height: AtomicUsize::new(0),
                }
        }
 
@@ -267,27 +416,73 @@ where C::Target: chain::Filter,
        /// Indicates the persistence of a [`ChannelMonitor`] has completed after
        /// [`ChannelMonitorUpdateErr::TemporaryFailure`] was returned from an update operation.
        ///
-       /// All ChannelMonitor updates up to and including highest_applied_update_id must have been
-       /// fully committed in every copy of the given channels' ChannelMonitors.
-       ///
-       /// Note that there is no effect to calling with a highest_applied_update_id other than the
-       /// current latest ChannelMonitorUpdate and one call to this function after multiple
-       /// ChannelMonitorUpdateErr::TemporaryFailures is fine. The highest_applied_update_id field
-       /// exists largely only to prevent races between this and concurrent update_monitor calls.
-       ///
        /// Thus, the anticipated use is, at a high level:
        ///  1) This [`ChainMonitor`] calls [`Persist::update_persisted_channel`] which stores the
        ///     update to disk and begins updating any remote (e.g. watchtower/backup) copies,
        ///     returning [`ChannelMonitorUpdateErr::TemporaryFailure`],
-       ///  2) once all remote copies are updated, you call this function with the update_id that
-       ///     completed, and once it is the latest the Channel will be re-enabled.
-       pub fn channel_monitor_updated(&self, funding_txo: OutPoint, highest_applied_update_id: u64) {
+       ///  2) once all remote copies are updated, you call this function with the
+       ///     `completed_update_id` that completed, and once all pending updates have completed the
+       ///     channel will be re-enabled.
+       //      Note that we re-enable only after `UpdateOrigin::OffChain` updates complete; we don't
+       //      care about `UpdateOrigin::ChainSync` updates for the channel state being updated. We
+       //      only care about `UpdateOrigin::ChainSync` for returning `MonitorEvent`s.
+       ///
+       /// Returns an [`APIError::APIMisuseError`] if `funding_txo` does not match any currently
+       /// registered [`ChannelMonitor`]s.
+       pub fn channel_monitor_updated(&self, funding_txo: OutPoint, completed_update_id: MonitorUpdateId) -> Result<(), APIError> {
+               let monitors = self.monitors.read().unwrap();
+               let monitor_data = if let Some(mon) = monitors.get(&funding_txo) { mon } else {
+                       return Err(APIError::APIMisuseError { err: format!("No ChannelMonitor matching funding outpoint {:?} found", funding_txo) });
+               };
+               let mut pending_monitor_updates = monitor_data.pending_monitor_updates.lock().unwrap();
+               pending_monitor_updates.retain(|update_id| *update_id != completed_update_id);
+
+               match completed_update_id {
+                       MonitorUpdateId { contents: UpdateOrigin::OffChain(_) } => {
+                               // Note that we only check for `UpdateOrigin::OffChain` failures here - if
+                               // we're being told that an `UpdateOrigin::OffChain` monitor update completed,
+                               // we only care about ensuring we don't tell the `ChannelManager` to restore
+                               // the channel to normal operation until all `UpdateOrigin::OffChain` updates
+                               // complete.
+                               // If there's some `UpdateOrigin::ChainSync` update still pending, that's okay
+                               // - we can still update our channel state, just as long as we don't return
+                               // `MonitorEvent`s from the monitor back to the `ChannelManager` until they
+                               // complete.
+                               let monitor_is_pending_updates = monitor_data.has_pending_offchain_updates(&pending_monitor_updates);
+                               if monitor_is_pending_updates || monitor_data.channel_perm_failed.load(Ordering::Acquire) {
+                                       // If there are still monitor updates pending (or an old monitor update
+                                       // finished after a later one perm-failed), we cannot yet construct an
+                                       // UpdateCompleted event.
+                                       return Ok(());
+                               }
+                               self.pending_monitor_events.lock().unwrap().push(MonitorEvent::UpdateCompleted {
+                                       funding_txo,
+                                       monitor_update_id: monitor_data.monitor.get_latest_update_id(),
+                               });
+                       },
+                       MonitorUpdateId { contents: UpdateOrigin::ChainSync(_) } => {
+                               if !monitor_data.has_pending_chainsync_updates(&pending_monitor_updates) {
+                                       monitor_data.last_chain_persist_height.store(self.highest_chain_height.load(Ordering::Acquire), Ordering::Release);
+                                       // The next time release_pending_monitor_events is called, any events for this
+                                       // ChannelMonitor will be returned.
+                               }
+                       },
+               }
+               Ok(())
+       }
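
Continuing the sketch above (illustrative only): once a queued background write finishes, completion is reported back to the `ChainMonitor` roughly as follows. `chain_monitor` and the `completed_writes` iterator stand in for however an implementation tracks its in-flight persists.

    // Hypothetical completion handling: `completed_writes` yields the (OutPoint, MonitorUpdateId)
    // pairs recorded when TemporaryFailure was returned, once each backup has durably completed.
    for (funding_txo, update_id) in completed_writes {
        if let Err(e) = chain_monitor.channel_monitor_updated(funding_txo, update_id) {
            // APIMisuseError: no ChannelMonitor is currently registered for this outpoint.
            eprintln!("channel_monitor_updated failed: {:?}", e);
        }
    }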
+
+       /// This wrapper avoids having to update some of our tests for now as they assume the direct
+       /// chain::Watch API wherein we mark a monitor fully-updated by just calling
+       /// channel_monitor_updated once with the highest ID.
+       #[cfg(any(test, fuzzing))]
+       pub fn force_channel_monitor_updated(&self, funding_txo: OutPoint, monitor_update_id: u64) {
                self.pending_monitor_events.lock().unwrap().push(MonitorEvent::UpdateCompleted {
-                       funding_txo, monitor_update_id: highest_applied_update_id
+                       funding_txo,
+                       monitor_update_id,
                });
        }
 
-       #[cfg(any(test, feature = "fuzztarget", feature = "_test_utils"))]
+       #[cfg(any(test, fuzzing, feature = "_test_utils"))]
        pub fn get_and_clear_pending_events(&self) -> Vec<events::Event> {
                use util::events::EventsProvider;
                let events = core::cell::RefCell::new(Vec::new());
@@ -306,11 +501,9 @@ where
        L::Target: Logger,
        P::Target: Persist<ChannelSigner>,
 {
-       fn block_connected(&self, block: &Block, height: u32) {
-               let header = &block.header;
-               let txdata: Vec<_> = block.txdata.iter().enumerate().collect();
+       fn filtered_block_connected(&self, header: &BlockHeader, txdata: &TransactionData, height: u32) {
                log_debug!(self.logger, "New best block {} at height {} provided via block_connected", header.block_hash(), height);
-               self.process_chain_data(header, &txdata, |monitor, txdata| {
+               self.process_chain_data(header, Some(height), &txdata, |monitor, txdata| {
                        monitor.block_connected(
                                header, txdata, height, &*self.broadcaster, &*self.fee_estimator, &*self.logger)
                });
@@ -337,7 +530,7 @@ where
 {
        fn transactions_confirmed(&self, header: &BlockHeader, txdata: &TransactionData, height: u32) {
                log_debug!(self.logger, "{} provided transactions confirmed at height {} in block {}", txdata.len(), height, header.block_hash());
-               self.process_chain_data(header, txdata, |monitor, txdata| {
+               self.process_chain_data(header, None, txdata, |monitor, txdata| {
                        monitor.transactions_confirmed(
                                header, txdata, height, &*self.broadcaster, &*self.fee_estimator, &*self.logger)
                });
@@ -353,7 +546,7 @@ where
 
        fn best_block_updated(&self, header: &BlockHeader, height: u32) {
                log_debug!(self.logger, "New best block {} at height {} provided via best_block_updated", header.block_hash(), height);
-               self.process_chain_data(header, &[], |monitor, txdata| {
+               self.process_chain_data(header, Some(height), &[], |monitor, txdata| {
                        // While in practice there shouldn't be any recursive calls when given empty txdata,
                        // it's still possible if a chain::Filter implementation returns a transaction.
                        debug_assert!(txdata.is_empty());
@@ -397,22 +590,29 @@ where C::Target: chain::Filter,
                                return Err(ChannelMonitorUpdateErr::PermanentFailure)},
                        hash_map::Entry::Vacant(e) => e,
                };
-               let persist_res = self.persister.persist_new_channel(funding_outpoint, &monitor);
+               log_trace!(self.logger, "Got new ChannelMonitor for channel {}", log_funding_info!(monitor));
+               let update_id = MonitorUpdateId::from_new_monitor(&monitor);
+               let mut pending_monitor_updates = Vec::new();
+               let persist_res = self.persister.persist_new_channel(funding_outpoint, &monitor, update_id);
                if persist_res.is_err() {
-                       log_error!(self.logger, "Failed to persist new channel data: {:?}", persist_res);
+                       log_error!(self.logger, "Failed to persist new ChannelMonitor for channel {}: {:?}", log_funding_info!(monitor), persist_res);
+               } else {
+                       log_trace!(self.logger, "Finished persisting new ChannelMonitor for channel {}", log_funding_info!(monitor));
                }
                if persist_res == Err(ChannelMonitorUpdateErr::PermanentFailure) {
                        return persist_res;
+               } else if persist_res.is_err() {
+                       pending_monitor_updates.push(update_id);
                }
-               {
-                       let funding_txo = monitor.get_funding_txo();
-                       log_trace!(self.logger, "Got new Channel Monitor for channel {}", log_bytes!(funding_txo.0.to_channel_id()[..]));
-
-                       if let Some(ref chain_source) = self.chain_source {
-                               monitor.load_outputs_to_watch(chain_source);
-                       }
+               if let Some(ref chain_source) = self.chain_source {
+                       monitor.load_outputs_to_watch(chain_source);
                }
-               entry.insert(MonitorHolder { monitor });
+               entry.insert(MonitorHolder {
+                       monitor,
+                       pending_monitor_updates: Mutex::new(pending_monitor_updates),
+                       channel_perm_failed: AtomicBool::new(false),
+                       last_chain_persist_height: AtomicUsize::new(self.highest_chain_height.load(Ordering::Acquire)),
+               });
                persist_res
        }
 
@@ -428,26 +628,37 @@ where C::Target: chain::Filter,
                                // We should never ever trigger this from within ChannelManager. Technically a
                                // user could use this object with some proxying in between which makes this
                                // possible, but in tests and fuzzing, this should be a panic.
-                               #[cfg(any(test, feature = "fuzztarget"))]
+                               #[cfg(any(test, fuzzing))]
                                panic!("ChannelManager generated a channel update for a channel that was not yet registered!");
-                               #[cfg(not(any(test, feature = "fuzztarget")))]
+                               #[cfg(not(any(test, fuzzing)))]
                                Err(ChannelMonitorUpdateErr::PermanentFailure)
                        },
                        Some(monitor_state) => {
                                let monitor = &monitor_state.monitor;
-                               log_trace!(self.logger, "Updating Channel Monitor for channel {}", log_funding_info!(monitor));
+                               log_trace!(self.logger, "Updating ChannelMonitor for channel {}", log_funding_info!(monitor));
                                let update_res = monitor.update_monitor(&update, &self.broadcaster, &self.fee_estimator, &self.logger);
-                               if let Err(e) = &update_res {
-                                       log_error!(self.logger, "Failed to update channel monitor: {:?}", e);
+                               if update_res.is_err() {
+                                       log_error!(self.logger, "Failed to update ChannelMonitor for channel {}.", log_funding_info!(monitor));
                                }
                                // Even if updating the monitor returns an error, the monitor's state will
                                // still be changed. So, persist the updated monitor despite the error.
-                               let persist_res = self.persister.update_persisted_channel(funding_txo, &update, monitor);
-                               if let Err(ref e) = persist_res {
-                                       log_error!(self.logger, "Failed to persist channel monitor update: {:?}", e);
+                               let update_id = MonitorUpdateId::from_monitor_update(&update);
+                               let mut pending_monitor_updates = monitor_state.pending_monitor_updates.lock().unwrap();
+                               let persist_res = self.persister.update_persisted_channel(funding_txo, &Some(update), monitor, update_id);
+                               if let Err(e) = persist_res {
+                                       if e == ChannelMonitorUpdateErr::TemporaryFailure {
+                                               pending_monitor_updates.push(update_id);
+                                       } else {
+                                               monitor_state.channel_perm_failed.store(true, Ordering::Release);
+                                       }
+                                       log_error!(self.logger, "Failed to persist ChannelMonitor update for channel {}: {:?}", log_funding_info!(monitor), e);
+                               } else {
+                                       log_trace!(self.logger, "Finished persisting ChannelMonitor update for channel {}", log_funding_info!(monitor));
                                }
                                if update_res.is_err() {
                                        Err(ChannelMonitorUpdateErr::PermanentFailure)
+                               } else if monitor_state.channel_perm_failed.load(Ordering::Acquire) {
+                                       Err(ChannelMonitorUpdateErr::PermanentFailure)
                                } else {
                                        persist_res
                                }
@@ -458,7 +669,31 @@ where C::Target: chain::Filter,
        fn release_pending_monitor_events(&self) -> Vec<MonitorEvent> {
                let mut pending_monitor_events = self.pending_monitor_events.lock().unwrap().split_off(0);
                for monitor_state in self.monitors.read().unwrap().values() {
-                       pending_monitor_events.append(&mut monitor_state.monitor.get_and_clear_pending_monitor_events());
+                       let is_pending_monitor_update = monitor_state.has_pending_chainsync_updates(&monitor_state.pending_monitor_updates.lock().unwrap());
+                       if is_pending_monitor_update &&
+                                       monitor_state.last_chain_persist_height.load(Ordering::Acquire) + LATENCY_GRACE_PERIOD_BLOCKS as usize
+                                               > self.highest_chain_height.load(Ordering::Acquire)
+                       {
+                               log_info!(self.logger, "A Channel Monitor sync is still in progress, refusing to provide monitor events!");
+                       } else {
+                               if monitor_state.channel_perm_failed.load(Ordering::Acquire) {
+                                       // If an `UpdateOrigin::ChainSync` persistence failed with `PermanentFailure`,
+                                       // we don't really know if the latest `ChannelMonitor` state is on disk or not.
+                                       // We're supposed to hold monitor updates until the latest state is on disk to
+                                       // avoid duplicate events, but the user told us persistence is screw-y and may
+                                       // not complete. We can't hold events forever because we may learn some payment
+                                       // preimage, so instead we just log and hope the user complied with the
+                                       // `PermanentFailure` requirements of having at least the local-disk copy
+                                       // updated.
+                                       log_info!(self.logger, "A Channel Monitor sync returned PermanentFailure. Returning monitor events but duplicate events may appear after reload!");
+                               }
+                               if is_pending_monitor_update {
+                                       log_error!(self.logger, "A ChannelMonitor sync took longer than {} blocks to complete.", LATENCY_GRACE_PERIOD_BLOCKS);
+                                       log_error!(self.logger, "   To avoid funds-loss, we are allowing monitor updates to be released.");
+                                       log_error!(self.logger, "   This may cause duplicate payment events to be generated.");
+                               }
+                               pending_monitor_events.append(&mut monitor_state.monitor.get_and_clear_pending_monitor_events());
+                       }
                }
                pending_monitor_events
        }
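
Put differently, the loop above withholds a monitor's events only while a chain-sync persist is still pending and fewer than `LATENCY_GRACE_PERIOD_BLOCKS` blocks have passed since its last fully-persisted chain update; past that window the events are released anyway, since withholding an HTLC preimage indefinitely risks losing funds in an upstream channel. A condensed, illustrative restatement of that condition (not part of this patch):

    // Condensed restatement of the hold/release rule applied in release_pending_monitor_events.
    fn should_hold_monitor_events(has_pending_chainsync_updates: bool,
        last_chain_persist_height: usize, highest_chain_height: usize) -> bool
    {
        has_pending_chainsync_updates &&
            last_chain_persist_height + LATENCY_GRACE_PERIOD_BLOCKS as usize > highest_chain_height
    }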
@@ -490,10 +725,18 @@ impl<ChannelSigner: Sign, C: Deref, T: Deref, F: Deref, L: Deref, P: Deref> even
 
 #[cfg(test)]
 mod tests {
-       use ::{check_added_monitors, get_local_commitment_txn};
+       use bitcoin::BlockHeader;
+       use ::{check_added_monitors, check_closed_broadcast, check_closed_event};
+       use ::{expect_payment_sent, expect_payment_sent_without_paths, expect_payment_path_successful, get_event_msg};
+       use ::{get_htlc_update_msgs, get_local_commitment_txn, get_revoke_commit_msgs, get_route_and_payment_hash, unwrap_send_err};
+       use chain::{ChannelMonitorUpdateErr, Confirm, Watch};
+       use chain::channelmonitor::LATENCY_GRACE_PERIOD_BLOCKS;
+       use ln::channelmanager::PaymentSendFailure;
        use ln::features::InitFeatures;
        use ln::functional_test_utils::*;
-       use util::events::MessageSendEventsProvider;
+       use ln::msgs::ChannelMessageHandler;
+       use util::errors::APIError;
+       use util::events::{ClosureReason, MessageSendEvent, MessageSendEventsProvider};
        use util::test_utils::{OnRegisterOutput, TxOutReference};
 
        /// Tests that in-block dependent transactions are processed by `block_connected` when not
@@ -538,4 +781,180 @@ mod tests {
                nodes[1].node.get_and_clear_pending_msg_events();
                nodes[1].node.get_and_clear_pending_events();
        }
+
+       #[test]
+       fn test_async_ooo_offchain_updates() {
+               // Test that if we have multiple offchain updates being persisted and they complete
+               // out-of-order, the ChainMonitor waits until all have completed before informing the
+               // ChannelManager.
+               let chanmon_cfgs = create_chanmon_cfgs(2);
+               let node_cfgs = create_node_cfgs(2, &chanmon_cfgs);
+               let node_chanmgrs = create_node_chanmgrs(2, &node_cfgs, &[None, None]);
+               let nodes = create_network(2, &node_cfgs, &node_chanmgrs);
+               create_announced_chan_between_nodes(&nodes, 0, 1, InitFeatures::known(), InitFeatures::known());
+
+               // Route two payments to be claimed at the same time.
+               let payment_preimage_1 = route_payment(&nodes[0], &[&nodes[1]], 1_000_000).0;
+               let payment_preimage_2 = route_payment(&nodes[0], &[&nodes[1]], 1_000_000).0;
+
+               chanmon_cfgs[1].persister.offchain_monitor_updates.lock().unwrap().clear();
+               chanmon_cfgs[1].persister.set_update_ret(Err(ChannelMonitorUpdateErr::TemporaryFailure));
+
+               nodes[1].node.claim_funds(payment_preimage_1);
+               check_added_monitors!(nodes[1], 1);
+               nodes[1].node.claim_funds(payment_preimage_2);
+               check_added_monitors!(nodes[1], 1);
+
+               chanmon_cfgs[1].persister.set_update_ret(Ok(()));
+
+               let persistences = chanmon_cfgs[1].persister.offchain_monitor_updates.lock().unwrap().clone();
+               assert_eq!(persistences.len(), 1);
+               let (funding_txo, updates) = persistences.iter().next().unwrap();
+               assert_eq!(updates.len(), 2);
+
+               // Note that updates is a HashMap so the ordering here is actually random. This shouldn't
+               // fail either way but if it fails intermittently it's depending on the ordering of updates.
+               let mut update_iter = updates.iter();
+               nodes[1].chain_monitor.chain_monitor.channel_monitor_updated(*funding_txo, update_iter.next().unwrap().clone()).unwrap();
+               assert!(nodes[1].chain_monitor.release_pending_monitor_events().is_empty());
+               assert!(nodes[1].node.get_and_clear_pending_msg_events().is_empty());
+               nodes[1].chain_monitor.chain_monitor.channel_monitor_updated(*funding_txo, update_iter.next().unwrap().clone()).unwrap();
+
+               // Now manually walk the commitment signed dance - because we claimed two payments
+               // back-to-back it doesn't fit into the neat walk commitment_signed_dance does.
+
+               let updates = get_htlc_update_msgs!(nodes[1], nodes[0].node.get_our_node_id());
+               nodes[0].node.handle_update_fulfill_htlc(&nodes[1].node.get_our_node_id(), &updates.update_fulfill_htlcs[0]);
+               expect_payment_sent_without_paths!(nodes[0], payment_preimage_1);
+               nodes[0].node.handle_commitment_signed(&nodes[1].node.get_our_node_id(), &updates.commitment_signed);
+               check_added_monitors!(nodes[0], 1);
+               let (as_first_raa, as_first_update) = get_revoke_commit_msgs!(nodes[0], nodes[1].node.get_our_node_id());
+
+               nodes[1].node.handle_revoke_and_ack(&nodes[0].node.get_our_node_id(), &as_first_raa);
+               check_added_monitors!(nodes[1], 1);
+               let bs_second_updates = get_htlc_update_msgs!(nodes[1], nodes[0].node.get_our_node_id());
+               nodes[1].node.handle_commitment_signed(&nodes[0].node.get_our_node_id(), &as_first_update);
+               check_added_monitors!(nodes[1], 1);
+               let bs_first_raa = get_event_msg!(nodes[1], MessageSendEvent::SendRevokeAndACK, nodes[0].node.get_our_node_id());
+
+               nodes[0].node.handle_update_fulfill_htlc(&nodes[1].node.get_our_node_id(), &bs_second_updates.update_fulfill_htlcs[0]);
+               expect_payment_sent_without_paths!(nodes[0], payment_preimage_2);
+               nodes[0].node.handle_commitment_signed(&nodes[1].node.get_our_node_id(), &bs_second_updates.commitment_signed);
+               check_added_monitors!(nodes[0], 1);
+               nodes[0].node.handle_revoke_and_ack(&nodes[1].node.get_our_node_id(), &bs_first_raa);
+               expect_payment_path_successful!(nodes[0]);
+               check_added_monitors!(nodes[0], 1);
+               let (as_second_raa, as_second_update) = get_revoke_commit_msgs!(nodes[0], nodes[1].node.get_our_node_id());
+
+               nodes[1].node.handle_revoke_and_ack(&nodes[0].node.get_our_node_id(), &as_second_raa);
+               check_added_monitors!(nodes[1], 1);
+               nodes[1].node.handle_commitment_signed(&nodes[0].node.get_our_node_id(), &as_second_update);
+               check_added_monitors!(nodes[1], 1);
+               let bs_second_raa = get_event_msg!(nodes[1], MessageSendEvent::SendRevokeAndACK, nodes[0].node.get_our_node_id());
+
+               nodes[0].node.handle_revoke_and_ack(&nodes[1].node.get_our_node_id(), &bs_second_raa);
+               expect_payment_path_successful!(nodes[0]);
+               check_added_monitors!(nodes[0], 1);
+       }
+
+       fn do_chainsync_pauses_events(block_timeout: bool) {
+               // When a chainsync monitor update occurs, any MonitorEvents should be held before being
+               // passed upstream to a `ChannelManager` via `Watch::release_pending_monitor_events`. This
+               // tests that behavior, as well as some ways it might go wrong.
+               let chanmon_cfgs = create_chanmon_cfgs(2);
+               let node_cfgs = create_node_cfgs(2, &chanmon_cfgs);
+               let node_chanmgrs = create_node_chanmgrs(2, &node_cfgs, &[None, None]);
+               let nodes = create_network(2, &node_cfgs, &node_chanmgrs);
+               let channel = create_announced_chan_between_nodes(
+                       &nodes, 0, 1, InitFeatures::known(), InitFeatures::known());
+
+               // Get a route for later and rebalance the channel somewhat
+               send_payment(&nodes[0], &[&nodes[1]], 10_000_000);
+               let (route, second_payment_hash, _, second_payment_secret) = get_route_and_payment_hash!(nodes[0], nodes[1], 100_000);
+
+               // First route a payment that we will claim on chain and give the recipient the preimage.
+               let payment_preimage = route_payment(&nodes[0], &[&nodes[1]], 1_000_000).0;
+               nodes[1].node.claim_funds(payment_preimage);
+               nodes[1].node.get_and_clear_pending_msg_events();
+               check_added_monitors!(nodes[1], 1);
+               let remote_txn = get_local_commitment_txn!(nodes[1], channel.2);
+               assert_eq!(remote_txn.len(), 2);
+
+               // Temp-fail the block connection which will hold the channel-closed event
+               chanmon_cfgs[0].persister.chain_sync_monitor_persistences.lock().unwrap().clear();
+               chanmon_cfgs[0].persister.set_update_ret(Err(ChannelMonitorUpdateErr::TemporaryFailure));
+
+               // Connect B's commitment transaction, but only to the ChainMonitor/ChannelMonitor. The
+               // channel is now closed, but the ChannelManager doesn't know that yet.
+               let new_header = BlockHeader {
+                       version: 2, time: 0, bits: 0, nonce: 0,
+                       prev_blockhash: nodes[0].best_block_info().0,
+                       merkle_root: Default::default() };
+               nodes[0].chain_monitor.chain_monitor.transactions_confirmed(&new_header,
+                       &[(0, &remote_txn[0]), (1, &remote_txn[1])], nodes[0].best_block_info().1 + 1);
+               assert!(nodes[0].chain_monitor.release_pending_monitor_events().is_empty());
+               nodes[0].chain_monitor.chain_monitor.best_block_updated(&new_header, nodes[0].best_block_info().1 + 1);
+               assert!(nodes[0].chain_monitor.release_pending_monitor_events().is_empty());
+
+               // If the ChannelManager tries to update the channel, however, the ChainMonitor will pass
+               // the update through to the ChannelMonitor which will refuse it (as the channel is closed).
+               chanmon_cfgs[0].persister.set_update_ret(Ok(()));
+               unwrap_send_err!(nodes[0].node.send_payment(&route, second_payment_hash, &Some(second_payment_secret)),
+                       true, APIError::ChannelUnavailable { ref err },
+                       assert!(err.contains("ChannelMonitor storage failure")));
+               check_added_monitors!(nodes[0], 2); // After the failure we generate a close-channel monitor update
+               check_closed_broadcast!(nodes[0], true);
+               check_closed_event!(nodes[0], 1, ClosureReason::ProcessingError { err: "ChannelMonitor storage failure".to_string() });
+
+               // However, as the ChainMonitor is still waiting for the original persistence to complete,
+               // it won't yet release the MonitorEvents.
+               assert!(nodes[0].chain_monitor.release_pending_monitor_events().is_empty());
+
+               if block_timeout {
+                       // After three blocks, pending MonitorEvents should be released either way.
+                       let latest_header = BlockHeader {
+                               version: 2, time: 0, bits: 0, nonce: 0,
+                               prev_blockhash: nodes[0].best_block_info().0,
+                               merkle_root: Default::default() };
+                       nodes[0].chain_monitor.chain_monitor.best_block_updated(&latest_header, nodes[0].best_block_info().1 + LATENCY_GRACE_PERIOD_BLOCKS);
+               } else {
+                       let persistences = chanmon_cfgs[0].persister.chain_sync_monitor_persistences.lock().unwrap().clone();
+                       for (funding_outpoint, update_ids) in persistences {
+                               for update_id in update_ids {
+                                       nodes[0].chain_monitor.chain_monitor.channel_monitor_updated(funding_outpoint, update_id).unwrap();
+                               }
+                       }
+               }
+
+               expect_payment_sent!(nodes[0], payment_preimage);
+       }
+
+       #[test]
+       fn chainsync_pauses_events() {
+               do_chainsync_pauses_events(false);
+               do_chainsync_pauses_events(true);
+       }
+
+       #[test]
+       fn update_during_chainsync_fails_channel() {
+               let chanmon_cfgs = create_chanmon_cfgs(2);
+               let node_cfgs = create_node_cfgs(2, &chanmon_cfgs);
+               let node_chanmgrs = create_node_chanmgrs(2, &node_cfgs, &[None, None]);
+               let nodes = create_network(2, &node_cfgs, &node_chanmgrs);
+               create_announced_chan_between_nodes(&nodes, 0, 1, InitFeatures::known(), InitFeatures::known());
+
+               chanmon_cfgs[0].persister.chain_sync_monitor_persistences.lock().unwrap().clear();
+               chanmon_cfgs[0].persister.set_update_ret(Err(ChannelMonitorUpdateErr::PermanentFailure));
+
+               connect_blocks(&nodes[0], 1);
+               // Before processing events, the ChannelManager will still think the Channel is open and
+               // there won't be any ChannelMonitorUpdates
+               assert_eq!(nodes[0].node.list_channels().len(), 1);
+               check_added_monitors!(nodes[0], 0);
+               // ... however once we get events once, the channel will close, creating a channel-closed
+               // ChannelMonitorUpdate.
+               check_closed_broadcast!(nodes[0], true);
+               check_closed_event!(nodes[0], 1, ClosureReason::ProcessingError { err: "Failed to persist ChannelMonitor update during chain sync".to_string() });
+               check_added_monitors!(nodes[0], 1);
+       }
 }