Merge pull request #213 from TheBlueMatt/2018-10-monitor-fail-pause

Add ChannelManager support for monitor update failure in one place
2025-02-25 07:17:40 +01:00 · 2018-10-23 16:45:50 -04:00 · 2018-10-23 16:45:50 -04:00 · 3bcd911fcb
commit 3bcd911fcb
parent 65b23d8d8a 497643a65b
8 changed files with 984 additions and 217 deletions
--- a/src/ln/channel.rs
+++ b/src/ln/channel.rs
@ -248,28 +248,32 @@ enum ChannelState {
 	/// "disconnected" and no updates are allowed until after we've done a channel_reestablish
 	/// dance.
 	PeerDisconnected = (1 << 7),
+	/// Flag which is set on ChannelFunded and FundingSent indicating the user has told us they
+	/// failed to update our ChannelMonitor somewhere and we should pause sending any outbound
+	/// messages until they've managed to do so.
+	MonitorUpdateFailed = (1 << 8),
 	/// Flag which implies that we have sent a commitment_signed but are awaiting the responding
 	/// revoke_and_ack message. During this time period, we can't generate new commitment_signed
 	/// messages as then we will be unable to determine which HTLCs they included in their
 	/// revoke_and_ack implicit ACK, so instead we have to hold them away temporarily to be sent
 	/// later.
 	/// Flag is set on ChannelFunded.
-	AwaitingRemoteRevoke = (1 << 8),
+	AwaitingRemoteRevoke = (1 << 9),
 	/// Flag which is set on ChannelFunded or FundingSent after receiving a shutdown message from
 	/// the remote end. If set, they may not add any new HTLCs to the channel, and we are expected
 	/// to respond with our own shutdown message when possible.
-	RemoteShutdownSent = (1 << 9),
+	RemoteShutdownSent = (1 << 10),
 	/// Flag which is set on ChannelFunded or FundingSent after sending a shutdown message. At this
 	/// point, we may not add any new HTLCs to the channel.
 	/// TODO: Investigate some kind of timeout mechanism by which point the remote end must provide
 	/// us their shutdown.
-	LocalShutdownSent = (1 << 10),
+	LocalShutdownSent = (1 << 11),
 	/// We've successfully negotiated a closing_signed dance. At this point ChannelManager is about
 	/// to drop us, but we store this anyway.
-	ShutdownComplete = 2048,
+	ShutdownComplete = 4096,
 }
 const BOTH_SIDES_SHUTDOWN_MASK: u32 = (ChannelState::LocalShutdownSent as u32 | ChannelState::RemoteShutdownSent as u32);
-const MULTI_STATE_FLAGS: u32 = (BOTH_SIDES_SHUTDOWN_MASK | ChannelState::PeerDisconnected as u32);
+const MULTI_STATE_FLAGS: u32 = (BOTH_SIDES_SHUTDOWN_MASK | ChannelState::PeerDisconnected as u32 | ChannelState::MonitorUpdateFailed as u32);

 const INITIAL_COMMITMENT_NUMBER: u64 = (1 << 48) - 1;

@ -306,6 +310,12 @@ pub(super) struct Channel {
 	pending_outbound_htlcs: Vec<OutboundHTLCOutput>,
 	holding_cell_htlc_updates: Vec<HTLCUpdateAwaitingACK>,

+	monitor_pending_revoke_and_ack: bool,
+	monitor_pending_commitment_signed: bool,
+	monitor_pending_order: Option<RAACommitmentOrder>,
+	monitor_pending_forwards: Vec<(PendingForwardHTLCInfo, u64)>,
+	monitor_pending_failures: Vec<(HTLCSource, [u8; 32], HTLCFailReason)>,
+
 	// pending_update_fee is filled when sending and receiving update_fee
 	// For outbound channel, feerate_per_kw is updated with the value from
 	// pending_update_fee when revoke_and_ack is received
@ -509,6 +519,12 @@ impl Channel {
 			next_remote_htlc_id: 0,
 			channel_update_count: 1,

+			monitor_pending_revoke_and_ack: false,
+			monitor_pending_commitment_signed: false,
+			monitor_pending_order: None,
+			monitor_pending_forwards: Vec::new(),
+			monitor_pending_failures: Vec::new(),
+
 			#[cfg(debug_assertions)]
 			max_commitment_tx_output_local: ::std::sync::Mutex::new((channel_value_satoshis * 1000 - push_msat, push_msat)),
 			#[cfg(debug_assertions)]
@ -666,6 +682,12 @@ impl Channel {
 			next_remote_htlc_id: 0,
 			channel_update_count: 1,

+			monitor_pending_revoke_and_ack: false,
+			monitor_pending_commitment_signed: false,
+			monitor_pending_order: None,
+			monitor_pending_forwards: Vec::new(),
+			monitor_pending_failures: Vec::new(),
+
 			#[cfg(debug_assertions)]
 			max_commitment_tx_output_local: ::std::sync::Mutex::new((msg.push_msat, msg.funding_satoshis * 1000 - msg.push_msat)),
 			#[cfg(debug_assertions)]
@ -1166,7 +1188,7 @@ impl Channel {
 		// can claim it even if the channel hits the chain before we see their next commitment.
 		self.channel_monitor.provide_payment_preimage(&payment_hash_calc, &payment_preimage_arg);

-		if (self.channel_state & (ChannelState::AwaitingRemoteRevoke as u32 | ChannelState::PeerDisconnected as u32)) != 0 {
+		if (self.channel_state & (ChannelState::AwaitingRemoteRevoke as u32 | ChannelState::PeerDisconnected as u32 | ChannelState::MonitorUpdateFailed as u32)) != 0 {
 			for pending_update in self.holding_cell_htlc_updates.iter() {
 				match pending_update {
 					&HTLCUpdateAwaitingACK::ClaimHTLC { htlc_id, .. } => {
@ -1243,7 +1265,7 @@ impl Channel {
 		}

 		// Now update local state:
-		if (self.channel_state & (ChannelState::AwaitingRemoteRevoke as u32 | ChannelState::PeerDisconnected as u32)) != 0 {
+		if (self.channel_state & (ChannelState::AwaitingRemoteRevoke as u32 | ChannelState::PeerDisconnected as u32 | ChannelState::MonitorUpdateFailed as u32)) != 0 {
 			for pending_update in self.holding_cell_htlc_updates.iter() {
 				match pending_update {
 					&HTLCUpdateAwaitingACK::ClaimHTLC { htlc_id, .. } => {
@ -1461,11 +1483,13 @@ impl Channel {
 		if self.channel_state & (ChannelState::PeerDisconnected as u32) == ChannelState::PeerDisconnected as u32 {
 			return Err(ChannelError::Close("Peer sent funding_locked when we needed a channel_reestablish"));
 		}
-		let non_shutdown_state = self.channel_state & (!BOTH_SIDES_SHUTDOWN_MASK);
+
+		let non_shutdown_state = self.channel_state & (!MULTI_STATE_FLAGS);
+
 		if non_shutdown_state == ChannelState::FundingSent as u32 {
 			self.channel_state |= ChannelState::TheirFundingLocked as u32;
 		} else if non_shutdown_state == (ChannelState::FundingSent as u32 | ChannelState::OurFundingLocked as u32) {
-			self.channel_state = ChannelState::ChannelFunded as u32 | (self.channel_state & BOTH_SIDES_SHUTDOWN_MASK);
+			self.channel_state = ChannelState::ChannelFunded as u32 | (self.channel_state & MULTI_STATE_FLAGS);
 			self.channel_update_count += 1;
 		} else if self.channel_state & (ChannelState::ChannelFunded as u32) != 0 &&
 				// Note that funding_signed/funding_created will have decremented both by 1!
@ -1685,6 +1709,11 @@ impl Channel {
 				}
 			}
 		}
+		if self.channel_state & (ChannelState::MonitorUpdateFailed as u32) == 0 {
+			// This is a response to our post-monitor-failed unfreeze messages, so we can clear the
+			// monitor_pending_order requirement as we won't re-send the monitor_pending messages.
+			self.monitor_pending_order = None;
+		}

 		self.channel_monitor.provide_latest_local_commitment_tx_info(local_commitment_tx.0, local_keys, self.feerate_per_kw, htlcs_and_sigs);

@ -1708,6 +1737,12 @@ impl Channel {
 		self.last_local_commitment_txn = new_local_commitment_txn;
 		self.received_commitment_while_awaiting_raa = (self.channel_state & (ChannelState::AwaitingRemoteRevoke as u32)) != 0;

+		if (self.channel_state & ChannelState::MonitorUpdateFailed as u32) != 0 {
+			self.monitor_pending_revoke_and_ack = true;
+			self.monitor_pending_commitment_signed |= need_our_commitment;
+			return Err(HandleError{err: "Previous monitor update failure prevented generation of RAA", action: Some(ErrorAction::IgnoreError)});
+		}
+
 		let (our_commitment_signed, monitor_update) = if need_our_commitment && (self.channel_state & (ChannelState::AwaitingRemoteRevoke as u32)) == 0 {
 			// If we're AwaitingRemoteRevoke we can't send a new commitment here, but that's ok -
 			// we'll send one right away when we get the revoke_and_ack when we
@ -1726,6 +1761,7 @@ impl Channel {
 	/// Used to fulfill holding_cell_htlcs when we get a remote ack (or implicitly get it by them
 	/// fulfilling or failing the last pending HTLC)
 	fn free_holding_cell_htlcs(&mut self) -> Result<Option<(msgs::CommitmentUpdate, ChannelMonitor)>, HandleError> {
+		assert_eq!(self.channel_state & ChannelState::MonitorUpdateFailed as u32, 0);
 		if self.holding_cell_htlc_updates.len() != 0 || self.holding_cell_update_fee.is_some() {
 			let mut htlc_updates = Vec::new();
 			mem::swap(&mut htlc_updates, &mut self.holding_cell_htlc_updates);
@ -1827,6 +1863,7 @@ impl Channel {
 		if self.channel_state & (ChannelState::PeerDisconnected as u32) == ChannelState::PeerDisconnected as u32 {
 			return Err(HandleError{err: "Peer sent revoke_and_ack when we needed a channel_reestablish", action: Some(msgs::ErrorAction::SendErrorMessage{msg: msgs::ErrorMessage{data: "Peer sent revoke_and_ack when we needed a channel_reestablish".to_string(), channel_id: msg.channel_id}})});
 		}
+
 		if let Some(their_prev_commitment_point) = self.their_prev_commitment_point {
 			if PublicKey::from_secret_key(&self.secp_ctx, &secp_call!(SecretKey::from_slice(&self.secp_ctx, &msg.per_commitment_secret), "Peer provided an invalid per_commitment_secret", self.channel_id())) != their_prev_commitment_point {
 				return Err(HandleError{err: "Got a revoke commitment secret which didn't correspond to their current pubkey", action: None});
@ -1843,6 +1880,11 @@ impl Channel {
 		self.their_cur_commitment_point = Some(msg.next_per_commitment_point);
 		self.cur_remote_commitment_transaction_number -= 1;
 		self.received_commitment_while_awaiting_raa = false;
+		if self.channel_state & (ChannelState::MonitorUpdateFailed as u32) == 0 {
+			// This is a response to our post-monitor-failed unfreeze messages, so we can clear the
+			// monitor_pending_order requirement as we won't re-send the monitor_pending messages.
+			self.monitor_pending_order = None;
+		}

 		let mut to_forward_infos = Vec::new();
 		let mut revoked_htlcs = Vec::new();
@ -1934,6 +1976,17 @@ impl Channel {
 			}
 		}

+		if (self.channel_state & ChannelState::MonitorUpdateFailed as u32) == ChannelState::MonitorUpdateFailed as u32 {
+			// We can't actually generate a new commitment transaction (incl by freeing holding
+			// cells) while we can't update the monitor, so we just return what we have.
+			if require_commitment {
+				self.monitor_pending_commitment_signed = true;
+			}
+			self.monitor_pending_forwards.append(&mut to_forward_infos);
+			self.monitor_pending_failures.append(&mut revoked_htlcs);
+			return Ok((None, Vec::new(), Vec::new(), self.channel_monitor.clone()));
+		}
+
 		match self.free_holding_cell_htlcs()? {
 			Some(mut commitment_update) => {
 				commitment_update.0.update_fail_htlcs.reserve(update_fail_htlcs.len());
@ -1976,7 +2029,7 @@ impl Channel {
 			panic!("Cannot update fee until channel is fully established and we haven't started shutting down");
 		}
 		if !self.is_live() {
-			panic!("Cannot update fee while peer is disconnected (ChannelManager should have caught this)");
+			panic!("Cannot update fee while peer is disconnected/we're awaiting a monitor update (ChannelManager should have caught this)");
 		}

 		if (self.channel_state & (ChannelState::AwaitingRemoteRevoke as u32)) == (ChannelState::AwaitingRemoteRevoke as u32) {
@ -2068,6 +2121,60 @@ impl Channel {
 		outbound_drops
 	}

+	/// Records that the client failed to store a ChannelMonitor update, partially pausing
+	/// further updates on this channel.
+	///
+	/// Call this immediately after the call which produced the failed ChannelMonitor update,
+	/// passing as `order` the kind of call it was (ie a commitment update or a revoke_and_ack
+	/// generation). Any messages produced by that original call must *not* have been sent to the
+	/// remote end and must instead have been dropped; they will be regenerated when
+	/// monitor_updating_restored is called.
+	///
+	/// Panics if the MonitorUpdateFailed flag is already set.
+	pub fn monitor_update_failed(&mut self, order: RAACommitmentOrder) {
+		assert_eq!(self.channel_state & ChannelState::MonitorUpdateFailed as u32, 0);
+		// Exactly one of the two messages is marked for regeneration, selected by `order`.
+		let (resend_raa, resend_commitment) = match order {
+			RAACommitmentOrder::CommitmentFirst => (false, true),
+			RAACommitmentOrder::RevokeAndACKFirst => (true, false),
+		};
+		self.monitor_pending_revoke_and_ack = resend_raa;
+		self.monitor_pending_commitment_signed = resend_commitment;
+		self.monitor_pending_order = Some(order);
+		self.channel_state |= ChannelState::MonitorUpdateFailed as u32;
+	}
+
+	/// Marks the latest ChannelMonitor update as successfully committed by the client and
+	/// resumes normal operation.
+	///
+	/// Returns, in order: the revoke_and_ack and the commitment update to send to the remote
+	/// side (each only if it was flagged as pending), the order in which to send them, and the
+	/// drained contents of monitor_pending_forwards and monitor_pending_failures. If the peer is
+	/// currently disconnected no messages are returned.
+	///
+	/// Panics if the MonitorUpdateFailed flag is not currently set.
+	pub fn monitor_updating_restored(&mut self) -> (Option<msgs::RevokeAndACK>, Option<msgs::CommitmentUpdate>, RAACommitmentOrder, Vec<(PendingForwardHTLCInfo, u64)>, Vec<(HTLCSource, [u8; 32], HTLCFailReason)>) {
+		assert_eq!(self.channel_state & ChannelState::MonitorUpdateFailed as u32, ChannelState::MonitorUpdateFailed as u32);
+		self.channel_state &= !(ChannelState::MonitorUpdateFailed as u32);
+
+		// Hand the queued forwards/failures to the caller, leaving empty queues behind.
+		let forwards = mem::replace(&mut self.monitor_pending_forwards, Vec::new());
+		let failures = mem::replace(&mut self.monitor_pending_failures, Vec::new());
+
+		let peer_disconnected = self.channel_state & (ChannelState::PeerDisconnected as u32) != 0;
+
+		// Messages are only regenerated while the peer is connected.
+		let raa = if !peer_disconnected && self.monitor_pending_revoke_and_ack {
+			Some(self.get_last_revoke_and_ack())
+		} else { None };
+		let commitment_update = if !peer_disconnected && self.monitor_pending_commitment_signed {
+			Some(self.get_last_commitment_update())
+		} else { None };
+
+		self.monitor_pending_revoke_and_ack = false;
+		self.monitor_pending_commitment_signed = false;
+
+		let order = if peer_disconnected {
+			// Leave monitor_pending_order so we can order our channel_reestablish responses
+			RAACommitmentOrder::RevokeAndACKFirst
+		} else {
+			self.monitor_pending_order.clone().unwrap()
+		};
+		(raa, commitment_update, order, forwards, failures)
+	}
+
 	pub fn update_fee(&mut self, fee_estimator: &FeeEstimator, msg: &msgs::UpdateFee) -> Result<(), ChannelError> {
 		if self.channel_outbound {
 			return Err(ChannelError::Close("Non-funding remote tried to update channel fee"));
@ -2082,6 +2189,71 @@ impl Channel {
 		Ok(())
 	}

+	/// Rebuilds a revoke_and_ack message from current channel state: next_per_commitment_point
+	/// is derived from the local commitment secret for cur_local_commitment_transaction_number,
+	/// and per_commitment_secret is the commitment secret for that number + 2.
+	fn get_last_revoke_and_ack(&self) -> msgs::RevokeAndACK {
+		let point_secret = self.build_local_commitment_secret(self.cur_local_commitment_transaction_number);
+		let next_point = PublicKey::from_secret_key(&self.secp_ctx, &point_secret);
+		let commitment_secret = chan_utils::build_commitment_secret(self.local_keys.commitment_seed, self.cur_local_commitment_transaction_number + 2);
+		msgs::RevokeAndACK {
+			channel_id: self.channel_id,
+			per_commitment_secret: commitment_secret,
+			next_per_commitment_point: next_point,
+		}
+	}
+
+	/// Regenerates a CommitmentUpdate from current channel state.
+	///
+	/// Emits an update_add_htlc for every outbound HTLC in the LocalAnnounced state and an
+	/// update_fail_htlc / update_fail_malformed_htlc / update_fulfill_htlc (chosen by removal
+	/// reason) for every inbound HTLC in the LocalRemoved state, then re-creates the
+	/// commitment_signed via send_commitment_no_state_update.
+	///
+	/// Panics if send_commitment_no_state_update returns an Err.
+	fn get_last_commitment_update(&self) -> msgs::CommitmentUpdate {
+		let mut update_add_htlcs = Vec::new();
+		for htlc in self.pending_outbound_htlcs.iter() {
+			match htlc.state {
+				OutboundHTLCState::LocalAnnounced(ref onion_packet) => {
+					update_add_htlcs.push(msgs::UpdateAddHTLC {
+						channel_id: self.channel_id(),
+						htlc_id: htlc.htlc_id,
+						amount_msat: htlc.amount_msat,
+						payment_hash: htlc.payment_hash,
+						cltv_expiry: htlc.cltv_expiry,
+						onion_routing_packet: (**onion_packet).clone(),
+					});
+				},
+				_ => {},
+			}
+		}
+
+		let mut update_fulfill_htlcs = Vec::new();
+		let mut update_fail_htlcs = Vec::new();
+		let mut update_fail_malformed_htlcs = Vec::new();
+		for htlc in self.pending_inbound_htlcs.iter() {
+			// Only inbound HTLCs we have removed locally produce a message here.
+			let reason = match htlc.state {
+				InboundHTLCState::LocalRemoved(ref reason) => reason,
+				_ => continue,
+			};
+			match *reason {
+				InboundHTLCRemovalReason::FailRelay(ref err_packet) => {
+					update_fail_htlcs.push(msgs::UpdateFailHTLC {
+						channel_id: self.channel_id(),
+						htlc_id: htlc.htlc_id,
+						reason: err_packet.clone()
+					});
+				},
+				InboundHTLCRemovalReason::FailMalformed((ref sha256_of_onion, ref failure_code)) => {
+					update_fail_malformed_htlcs.push(msgs::UpdateFailMalformedHTLC {
+						channel_id: self.channel_id(),
+						htlc_id: htlc.htlc_id,
+						sha256_of_onion: sha256_of_onion.clone(),
+						failure_code: failure_code.clone(),
+					});
+				},
+				InboundHTLCRemovalReason::Fulfill(ref payment_preimage) => {
+					update_fulfill_htlcs.push(msgs::UpdateFulfillHTLC {
+						channel_id: self.channel_id(),
+						htlc_id: htlc.htlc_id,
+						payment_preimage: payment_preimage.clone(),
+					});
+				},
+			}
+		}
+
+		// Regenerate commitment_signed last, once every update message has been collected.
+		let commitment_signed = self.send_commitment_no_state_update().expect("It looks like we failed to re-generate a commitment_signed we had previously sent?").0;
+		msgs::CommitmentUpdate {
+			update_add_htlcs, update_fulfill_htlcs, update_fail_htlcs, update_fail_malformed_htlcs,
+			update_fee: None, //TODO: We need to support re-generating any update_fees in the last commitment_signed!
+			commitment_signed,
+		}
+	}
+
 	/// May panic if some calls other than message-handling calls (which will all Err immediately)
 	/// have been called between remove_uncommitted_htlcs_and_mark_paused and this call.
 	pub fn channel_reestablish(&mut self, msg: &msgs::ChannelReestablish) -> Result<(Option<msgs::FundingLocked>, Option<msgs::RevokeAndACK>, Option<msgs::CommitmentUpdate>, Option<ChannelMonitor>, RAACommitmentOrder), ChannelError> {
@ -2106,13 +2278,12 @@ impl Channel {
 			// Note that if we need to repeat our FundingLocked we'll do that in the next if block.
 			None
 		} else if msg.next_remote_commitment_number == (INITIAL_COMMITMENT_NUMBER - 1) - self.cur_local_commitment_transaction_number {
-			let next_per_commitment_point = PublicKey::from_secret_key(&self.secp_ctx, &self.build_local_commitment_secret(self.cur_local_commitment_transaction_number));
-			let per_commitment_secret = chan_utils::build_commitment_secret(self.local_keys.commitment_seed, self.cur_local_commitment_transaction_number + 2);
-			Some(msgs::RevokeAndACK {
-				channel_id: self.channel_id,
-				per_commitment_secret,
-				next_per_commitment_point,
-			})
+			if self.channel_state & (ChannelState::MonitorUpdateFailed as u32) != 0 {
+				self.monitor_pending_revoke_and_ack = true;
+				None
+			} else {
+				Some(self.get_last_revoke_and_ack())
+			}
 		} else {
 			return Err(ChannelError::Close("Peer attempted to reestablish channel with a very old local commitment transaction"));
 		};
@ -2124,6 +2295,7 @@ impl Channel {
 		let our_next_remote_commitment_number = INITIAL_COMMITMENT_NUMBER - self.cur_remote_commitment_transaction_number + if (self.channel_state & ChannelState::AwaitingRemoteRevoke as u32) != 0 { 1 } else { 0 };

 		let resend_funding_locked = if msg.next_local_commitment_number == 1 && INITIAL_COMMITMENT_NUMBER - self.cur_local_commitment_transaction_number == 1 {
+			// We should never have to worry about MonitorUpdateFailed resending FundingLocked
 			let next_per_commitment_secret = self.build_local_commitment_secret(self.cur_local_commitment_transaction_number);
 			let next_per_commitment_point = PublicKey::from_secret_key(&self.secp_ctx, &next_per_commitment_secret);
 			Some(msgs::FundingLocked {
@ -2132,11 +2304,11 @@ impl Channel {
 			})
 		} else { None };

-		let order = if self.received_commitment_while_awaiting_raa {
-			RAACommitmentOrder::CommitmentFirst
-		} else {
-			RAACommitmentOrder::RevokeAndACKFirst
-		};
+		let order = self.monitor_pending_order.clone().unwrap_or(if self.received_commitment_while_awaiting_raa {
+				RAACommitmentOrder::CommitmentFirst
+			} else {
+				RAACommitmentOrder::RevokeAndACKFirst
+			});

 		if msg.next_local_commitment_number == our_next_remote_commitment_number {
 			if required_revoke.is_some() {
@ -2145,7 +2317,8 @@ impl Channel {
 				log_debug!(self, "Reconnected channel {} with no loss", log_bytes!(self.channel_id()));
 			}

-			if (self.channel_state & (ChannelState::AwaitingRemoteRevoke as u32)) == 0 {
+			if (self.channel_state & (ChannelState::AwaitingRemoteRevoke as u32 | ChannelState::MonitorUpdateFailed as u32)) == 0 &&
+					self.monitor_pending_order.is_none() { // monitor_pending_order indicates we're waiting on a response to an unfreeze
 				// We're up-to-date and not waiting on a remote revoke (if we are our
 				// channel_reestablish should result in them sending a revoke_and_ack), but we may
 				// have received some updates while we were disconnected. Free the holding cell
@ -2172,59 +2345,16 @@ impl Channel {
 			} else {
 				log_debug!(self, "Reconnected channel {} with only lost remote commitment tx", log_bytes!(self.channel_id()));
 			}
-			let mut update_add_htlcs = Vec::new();
-			let mut update_fulfill_htlcs = Vec::new();
-			let mut update_fail_htlcs = Vec::new();
-			let mut update_fail_malformed_htlcs = Vec::new();

-			for htlc in self.pending_outbound_htlcs.iter() {
-				if let &OutboundHTLCState::LocalAnnounced(ref onion_packet) = &htlc.state {
-					update_add_htlcs.push(msgs::UpdateAddHTLC {
-						channel_id: self.channel_id(),
-						htlc_id: htlc.htlc_id,
-						amount_msat: htlc.amount_msat,
-						payment_hash: htlc.payment_hash,
-						cltv_expiry: htlc.cltv_expiry,
-						onion_routing_packet: (**onion_packet).clone(),
-					});
-				}
+			// If monitor_pending_order is set, it must be CommitmentFirst if we have no RAA
+			debug_assert!(self.monitor_pending_order != Some(RAACommitmentOrder::RevokeAndACKFirst) || required_revoke.is_some());
+
+			if self.channel_state & (ChannelState::MonitorUpdateFailed as u32) != 0 {
+				self.monitor_pending_commitment_signed = true;
+				return Ok((resend_funding_locked, None, None, None, order));
 			}

-			for htlc in self.pending_inbound_htlcs.iter() {
-				if let &InboundHTLCState::LocalRemoved(ref reason) = &htlc.state {
-					match reason {
-						&InboundHTLCRemovalReason::FailRelay(ref err_packet) => {
-							update_fail_htlcs.push(msgs::UpdateFailHTLC {
-								channel_id: self.channel_id(),
-								htlc_id: htlc.htlc_id,
-								reason: err_packet.clone()
-							});
-						},
-						&InboundHTLCRemovalReason::FailMalformed((ref sha256_of_onion, ref failure_code)) => {
-							update_fail_malformed_htlcs.push(msgs::UpdateFailMalformedHTLC {
-								channel_id: self.channel_id(),
-								htlc_id: htlc.htlc_id,
-								sha256_of_onion: sha256_of_onion.clone(),
-								failure_code: failure_code.clone(),
-							});
-						},
-						&InboundHTLCRemovalReason::Fulfill(ref payment_preimage) => {
-							update_fulfill_htlcs.push(msgs::UpdateFulfillHTLC {
-								channel_id: self.channel_id(),
-								htlc_id: htlc.htlc_id,
-								payment_preimage: payment_preimage.clone(),
-							});
-						},
-					}
-				}
-			}
-
-			return Ok((resend_funding_locked, required_revoke,
-					Some(msgs::CommitmentUpdate {
-						update_add_htlcs, update_fulfill_htlcs, update_fail_htlcs, update_fail_malformed_htlcs,
-						update_fee: None, //TODO: We need to support re-generating any update_fees in the last commitment_signed!
-						commitment_signed: self.send_commitment_no_state_update().expect("It looks like we failed to re-generate a commitment_signed we had previously sent?").0,
-					}), None, order));
+			return Ok((resend_funding_locked, required_revoke, Some(self.get_last_commitment_update()), None, order));
 		} else {
 			return Err(ChannelError::Close("Peer attempted to reestablish channel with a very old remote commitment transaction"));
 		}
@ -2561,7 +2691,13 @@ impl Channel {
 	/// is_usable() and considers things like the channel being temporarily disabled.
 	/// Allowed in any state (including after shutdown)
 	pub fn is_live(&self) -> bool {
-		self.is_usable() && (self.channel_state & (ChannelState::PeerDisconnected as u32) == 0)
+		self.is_usable() && (self.channel_state & (ChannelState::PeerDisconnected as u32 | ChannelState::MonitorUpdateFailed as u32) == 0)
+	}
+
+	/// Reports whether the MonitorUpdateFailed flag is set on this channel, ie whether it is
+	/// waiting on a monitor update before it can move forward.
+	/// Allowed in any state (including after shutdown)
+	pub fn is_awaiting_monitor_update(&self) -> bool {
+		let flag = ChannelState::MonitorUpdateFailed as u32;
+		self.channel_state & flag == flag
 	}

 	/// Returns true if funding_created was sent/received.
@ -2875,14 +3011,14 @@ impl Channel {
 			return Err(HandleError{err: "Cannot send less than their minimum HTLC value", action: None});
 		}

-		if (self.channel_state & (ChannelState::PeerDisconnected as u32)) == (ChannelState::PeerDisconnected as u32) {
+		if (self.channel_state & (ChannelState::PeerDisconnected as u32 | ChannelState::MonitorUpdateFailed as u32)) != 0 {
 			// Note that this should never really happen, if we're !is_live() on receipt of an
 			// incoming HTLC for relay will result in us rejecting the HTLC and we won't allow
 			// the user to send directly into a !is_live() channel. However, if we
 			// disconnected during the time the previous hop was doing the commitment dance we may
 			// end up getting here after the forwarding delay. In any case, returning an
 			// IgnoreError will get ChannelManager to do the right thing and fail backwards now.
-			return Err(HandleError{err: "Cannot send an HTLC while disconnected", action: Some(ErrorAction::IgnoreError)});
+			return Err(HandleError{err: "Cannot send an HTLC while disconnected/frozen for channel monitor update", action: Some(ErrorAction::IgnoreError)});
 		}

 		let (outbound_htlc_count, htlc_outbound_value_msat) = self.get_outbound_pending_htlc_stats();
@ -2964,6 +3100,9 @@ impl Channel {
 		if (self.channel_state & (ChannelState::PeerDisconnected as u32)) == (ChannelState::PeerDisconnected as u32) {
 			panic!("Cannot create commitment tx while disconnected, as send_htlc will have returned an Err so a send_commitment precondition has been violated");
 		}
+		// Compare the masked state against the MonitorUpdateFailed bit itself. Comparing it
+		// against PeerDisconnected (a different bit) can never be true, which would leave this
+		// precondition check dead.
+		if (self.channel_state & (ChannelState::MonitorUpdateFailed as u32)) == (ChannelState::MonitorUpdateFailed as u32) {
+			panic!("Cannot create commitment tx while awaiting monitor update unfreeze, as send_htlc will have returned an Err so a send_commitment precondition has been violated");
+		}
 		let mut have_updates = self.pending_update_fee.is_some();
 		for htlc in self.pending_outbound_htlcs.iter() {
 			if let OutboundHTLCState::LocalAnnounced(_) = htlc.state {
@ -3072,8 +3211,8 @@ impl Channel {
 			}
 		}
 		assert_eq!(self.channel_state & ChannelState::ShutdownComplete as u32, 0);
-		if self.channel_state & (ChannelState::PeerDisconnected as u32) == ChannelState::PeerDisconnected as u32 {
-			return Err(APIError::ChannelUnavailable{err: "Cannot begin shutdown while peer is disconnected, maybe force-close instead?"});
+		if self.channel_state & (ChannelState::PeerDisconnected as u32 | ChannelState::MonitorUpdateFailed as u32) != 0 {
+			return Err(APIError::ChannelUnavailable{err: "Cannot begin shutdown while peer is disconnected or we're waiting on a monitor update, maybe force-close instead?"});
 		}

 		let our_closing_script = self.get_closing_scriptpubkey();
--- a/src/ln/channelmanager.rs
+++ b/src/ln/channelmanager.rs
--- a/src/ln/channelmonitor.rs
+++ b/src/ln/channelmonitor.rs
@ -39,6 +39,7 @@ use std::sync::{Arc,Mutex};
 use std::{hash,cmp};

 /// An error enum representing a failure to persist a channel monitor update.
+#[derive(Clone)]
 pub enum ChannelMonitorUpdateErr {
 	/// Used to indicate a temporary failure (eg connection to a watchtower failed, but is expected
 	/// to succeed at some point in the future).
@ -47,6 +48,22 @@ pub enum ChannelMonitorUpdateErr {
 	/// submitting new commitment transactions to the remote party.
 	/// ChannelManager::test_restore_channel_monitor can be used to retry the update(s) and restore
 	/// the channel to an operational state.
+	///
+	/// Note that continuing to operate when no copy of the updated ChannelMonitor could be
+	/// persisted is unsafe - if you failed to store the update on your own local disk you should
+	/// instead return PermanentFailure to force closure of the channel ASAP.
+	///
+	/// Even when a channel has been "frozen" updates to the ChannelMonitor can continue to occur
+	/// (eg if an inbound HTLC which we forwarded was claimed upstream resulting in us attempting
+	/// to claim it on this channel) and those updates must be applied wherever they can be. At
+	/// least one such updated ChannelMonitor must be persisted otherwise PermanentFailure should
+	/// be returned to get things on-chain ASAP using only the in-memory copy. Obviously updates to
+	/// the channel which would invalidate previous ChannelMonitors are not made when a channel has
+	/// been "frozen".
+	///
+	/// Note that even if updates made after TemporaryFailure succeed you must still call
+	/// test_restore_channel_monitor to ensure you have the latest monitor and re-enable normal
+	/// channel operation.
 	TemporaryFailure,
 	/// Used to indicate no further channel monitor updates will be allowed (eg we've moved on to a
 	/// different watchtower and cannot update with all watchtowers that were previously informed
--- a/src/ln/msgs.rs
+++ b/src/ln/msgs.rs
@ -224,7 +224,7 @@ pub struct FundingSigned {
 }

 /// A funding_locked message to be sent or received from a peer
-#[derive(Clone)]
+#[derive(Clone, PartialEq)]
 pub struct FundingLocked {
 	pub(crate) channel_id: [u8; 32],
 	pub(crate) next_per_commitment_point: PublicKey,
@ -244,7 +244,7 @@ pub struct ClosingSigned {
 }

 /// An update_add_htlc message to be sent or received from a peer
-#[derive(Clone)]
+#[derive(Clone, PartialEq)]
 pub struct UpdateAddHTLC {
 	pub(crate) channel_id: [u8; 32],
 	pub(crate) htlc_id: u64,
@ -255,7 +255,7 @@ pub struct UpdateAddHTLC {
 }

 /// An update_fulfill_htlc message to be sent or received from a peer
-#[derive(Clone)]
+#[derive(Clone, PartialEq)]
 pub struct UpdateFulfillHTLC {
 	pub(crate) channel_id: [u8; 32],
 	pub(crate) htlc_id: u64,
@ -263,7 +263,7 @@ pub struct UpdateFulfillHTLC {
 }

 /// An update_fail_htlc message to be sent or received from a peer
-#[derive(Clone)]
+#[derive(Clone, PartialEq)]
 pub struct UpdateFailHTLC {
 	pub(crate) channel_id: [u8; 32],
 	pub(crate) htlc_id: u64,
@ -271,7 +271,7 @@ pub struct UpdateFailHTLC {
 }

 /// An update_fail_malformed_htlc message to be sent or received from a peer
-#[derive(Clone)]
+#[derive(Clone, PartialEq)]
 pub struct UpdateFailMalformedHTLC {
 	pub(crate) channel_id: [u8; 32],
 	pub(crate) htlc_id: u64,
@ -280,7 +280,7 @@ pub struct UpdateFailMalformedHTLC {
 }

 /// A commitment_signed message to be sent or received from a peer
-#[derive(Clone)]
+#[derive(Clone, PartialEq)]
 pub struct CommitmentSigned {
 	pub(crate) channel_id: [u8; 32],
 	pub(crate) signature: Signature,
@ -288,6 +288,7 @@ pub struct CommitmentSigned {
 }

 /// A revoke_and_ack message to be sent or received from a peer
+#[derive(Clone, PartialEq)]
 pub struct RevokeAndACK {
 	pub(crate) channel_id: [u8; 32],
 	pub(crate) per_commitment_secret: [u8; 32],
@ -295,17 +296,20 @@ pub struct RevokeAndACK {
 }

 /// An update_fee message to be sent or received from a peer
+#[derive(PartialEq)]
 pub struct UpdateFee {
 	pub(crate) channel_id: [u8; 32],
 	pub(crate) feerate_per_kw: u32,
 }

+#[derive(PartialEq)]
 pub(crate) struct DataLossProtect {
 	pub(crate) your_last_per_commitment_secret: [u8; 32],
 	pub(crate) my_current_per_commitment_point: PublicKey,
 }

 /// A channel_reestablish message to be sent or received from a peer
+#[derive(PartialEq)]
 pub struct ChannelReestablish {
 	pub(crate) channel_id: [u8; 32],
 	pub(crate) next_local_commitment_number: u64,
@ -463,6 +467,7 @@ pub struct HandleError { //TODO: rename me

 /// Struct used to return values from revoke_and_ack messages, containing a bunch of commitment
 /// transaction updates if they were pending.
+#[derive(PartialEq)]
 pub struct CommitmentUpdate {
 	pub(crate) update_add_htlcs: Vec<UpdateAddHTLC>,
 	pub(crate) update_fulfill_htlcs: Vec<UpdateFulfillHTLC>,
@ -629,7 +634,18 @@ pub(crate) struct OnionPacket {
 	pub(crate) hmac: [u8; 32],
 }

-#[derive(Clone)]
+impl PartialEq for OnionPacket {
+	/// Field-by-field equality; `hop_data` is compared as a slice, so its length must match too.
+	fn eq(&self, other: &OnionPacket) -> bool {
+		// Compare the other fields first so `&&` can short-circuit before walking `hop_data`.
+		self.version == other.version &&
+			self.public_key == other.public_key &&
+			self.hmac == other.hmac &&
+			self.hop_data[..] == other.hop_data[..]
+	}
+}
+
+#[derive(Clone, PartialEq)]
 pub(crate) struct OnionErrorPacket {
 	// This really should be a constant size slice, but the spec lets these things be up to 128KB?
 	// (TODO) We limit it in decode to much lower...
--- a/src/ln/peer_handler.rs
+++ b/src/ln/peer_handler.rs
@ -866,6 +866,17 @@ impl<Descriptor: SocketDescriptor> PeerManager<Descriptor> {
 						Self::do_attempt_write_data(&mut descriptor, peer);
 						continue;
 					},
+					Event::SendRevokeAndACK { ref node_id, ref msg } => {
+						log_trace!(self, "Handling SendRevokeAndACK event in peer_handler for node {} for channel {}",
+								log_pubkey!(node_id),
+								log_bytes!(msg.channel_id));
+						let (mut descriptor, peer) = get_peer_for_forwarding!(node_id, {
+								//TODO: Do whatever we're gonna do for handling dropped messages
+							});
+						peer.pending_outbound_buffer.push_back(peer.channel_encryptor.encrypt_message(&encode_msg!(msg, 133)));
+						Self::do_attempt_write_data(&mut descriptor, peer);
+						continue;
+					},
 					Event::SendShutdown { ref node_id, ref msg } => {
 						log_trace!(self, "Handling Shutdown event in peer_handler for node {} for channel {}",
 								log_pubkey!(node_id),
--- a/src/util/errors.rs
+++ b/src/util/errors.rs
@ -32,7 +32,10 @@ pub enum APIError {
 	ChannelUnavailable {
 		/// A human-readable error message
 		err: &'static str
-	}
+	},
+	/// An attempt to call add_update_monitor returned an Err (ie you did this!), causing the
+	/// attempted action to fail.
+	MonitorUpdateFailed,
 }

 impl fmt::Debug for APIError {
@ -42,6 +45,7 @@ impl fmt::Debug for APIError {
 			APIError::FeeRateTooHigh {ref err, ref feerate} => write!(f, "{} feerate: {}", err, feerate),
 			APIError::RouteError {ref err} => f.write_str(err),
 			APIError::ChannelUnavailable {ref err} => f.write_str(err),
+			APIError::MonitorUpdateFailed => f.write_str("Client indicated a channel monitor update failed"),
 		}
 	}
 }
--- a/src/util/events.rs
+++ b/src/util/events.rs
@ -129,6 +129,15 @@ pub enum Event {
 		/// The update messages which should be sent. ALL messages in the struct should be sent!
 		updates: msgs::CommitmentUpdate,
 	},
+	/// Used to indicate that a revoke_and_ack message should be sent to the peer with the given node_id.
+	///
+	/// This event is handled by PeerManager::process_events if you are using a PeerManager.
+	SendRevokeAndACK {
+		/// The node_id of the node which should receive this message
+		node_id: PublicKey,
+		/// The message which should be sent.
+		msg: msgs::RevokeAndACK,
+	},
 	/// Used to indicate that a shutdown message should be sent to the peer with the given node_id.
 	///
 	/// This event is handled by PeerManager::process_events if you are using a PeerManager.
--- a/src/util/test_utils.rs
+++ b/src/util/test_utils.rs
@ -38,12 +38,14 @@ impl chaininterface::FeeEstimator for TestFeeEstimator {
 pub struct TestChannelMonitor {
 	pub added_monitors: Mutex<Vec<(OutPoint, channelmonitor::ChannelMonitor)>>,
 	pub simple_monitor: Arc<channelmonitor::SimpleManyChannelMonitor<OutPoint>>,
+	pub update_ret: Mutex<Result<(), channelmonitor::ChannelMonitorUpdateErr>>, // cloned and returned by this type's ManyChannelMonitor impl once the simple_monitor update is asserted Ok
 }
 impl TestChannelMonitor {
 	pub fn new(chain_monitor: Arc<chaininterface::ChainWatchInterface>, broadcaster: Arc<chaininterface::BroadcasterInterface>) -> Self {
 		Self {
 			added_monitors: Mutex::new(Vec::new()),
 			simple_monitor: channelmonitor::SimpleManyChannelMonitor::new(chain_monitor, broadcaster),
+			update_ret: Mutex::new(Ok(())), // starts as Ok(()) until a caller stores something else through the public Mutex
 		}
 	}
 }
@ -57,7 +59,8 @@ impl channelmonitor::ManyChannelMonitor for TestChannelMonitor {
 		w.0.clear();
 		monitor.write_for_watchtower(&mut w).unwrap(); // This at least shouldn't crash...
 		self.added_monitors.lock().unwrap().push((funding_txo, monitor.clone()));
-		self.simple_monitor.add_update_monitor(funding_txo, monitor)
+		assert!(self.simple_monitor.add_update_monitor(funding_txo, monitor).is_ok());
+		self.update_ret.lock().unwrap().clone()
 	}
 }