Merge pull request #957 from TheBlueMatt/2021-06-p2p-no-deadlock

Fix P2P Deadlocks
This commit is contained in:
Matt Corallo 2021-06-23 01:34:55 +00:00 committed by GitHub
commit 073afbb244
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 116 additions and 92 deletions

View file

@ -83,7 +83,7 @@ use lightning::ln::peer_handler::SocketDescriptor as LnSocketTrait;
use lightning::ln::msgs::{ChannelMessageHandler, RoutingMessageHandler};
use lightning::util::logger::Logger;
use std::{task, thread};
use std::task;
use std::net::SocketAddr;
use std::net::TcpStream as StdTcpStream;
use std::sync::{Arc, Mutex};
@ -114,11 +114,6 @@ struct Connection {
// socket. To wake it up (without otherwise changing its state, we can push a value into this
// Sender.
read_waker: mpsc::Sender<()>,
// When we are told by rust-lightning to disconnect, we can't return to rust-lightning until we
// are sure we won't call any more read/write PeerManager functions with the same connection.
// This is set to true if we're in such a condition (with disconnect checked before with the
// top-level mutex held) and false when we can return.
block_disconnect_socket: bool,
read_paused: bool,
rl_requested_disconnect: bool,
id: u64,
@ -153,31 +148,24 @@ impl Connection {
} }
}
macro_rules! prepare_read_write_call {
() => { {
let mut us_lock = us.lock().unwrap();
if us_lock.rl_requested_disconnect {
shutdown_socket!("disconnect_socket() call from RL", Disconnect::CloseConnection);
}
us_lock.block_disconnect_socket = true;
} }
}
let read_paused = us.lock().unwrap().read_paused;
let read_paused = {
let us_lock = us.lock().unwrap();
if us_lock.rl_requested_disconnect {
shutdown_socket!("disconnect_socket() call from RL", Disconnect::CloseConnection);
}
us_lock.read_paused
};
tokio::select! {
v = write_avail_receiver.recv() => {
assert!(v.is_some()); // We can't have dropped the sending end, its in the us Arc!
prepare_read_write_call!();
if let Err(e) = peer_manager.write_buffer_space_avail(&mut our_descriptor) {
shutdown_socket!(e, Disconnect::CloseConnection);
}
us.lock().unwrap().block_disconnect_socket = false;
},
_ = read_wake_receiver.recv() => {},
read = reader.read(&mut buf), if !read_paused => match read {
Ok(0) => shutdown_socket!("Connection closed", Disconnect::PeerDisconnected),
Ok(len) => {
prepare_read_write_call!();
let read_res = peer_manager.read_event(&mut our_descriptor, &buf[0..len]);
let mut us_lock = us.lock().unwrap();
match read_res {
@ -188,7 +176,6 @@ impl Connection {
},
Err(e) => shutdown_socket!(e, Disconnect::CloseConnection),
}
us_lock.block_disconnect_socket = false;
},
Err(e) => shutdown_socket!(e, Disconnect::PeerDisconnected),
},
@ -223,7 +210,7 @@ impl Connection {
(reader, write_receiver, read_receiver,
Arc::new(Mutex::new(Self {
writer: Some(writer), write_avail, read_waker, read_paused: false,
block_disconnect_socket: false, rl_requested_disconnect: false,
rl_requested_disconnect: false,
id: ID_COUNTER.fetch_add(1, Ordering::AcqRel)
})))
}
@ -450,18 +437,10 @@ impl peer_handler::SocketDescriptor for SocketDescriptor {
}
fn disconnect_socket(&mut self) {
{
let mut us = self.conn.lock().unwrap();
us.rl_requested_disconnect = true;
us.read_paused = true;
// Wake up the sending thread, assuming it is still alive
let _ = us.write_avail.try_send(());
// Happy-path return:
if !us.block_disconnect_socket { return; }
}
while self.conn.lock().unwrap().block_disconnect_socket {
thread::yield_now();
}
let mut us = self.conn.lock().unwrap();
us.rl_requested_disconnect = true;
// Wake up the sending thread, assuming it is still alive
let _ = us.write_avail.try_send(());
}
}
impl Clone for SocketDescriptor {

View file

@ -160,10 +160,15 @@ pub struct MessageHandler<CM: Deref, RM: Deref> where
CM::Target: ChannelMessageHandler,
RM::Target: RoutingMessageHandler {
/// A message handler which handles messages specific to channels. Usually this is just a
/// ChannelManager object or a ErroringMessageHandler.
/// [`ChannelManager`] object or an [`ErroringMessageHandler`].
///
/// [`ChannelManager`]: crate::ln::channelmanager::ChannelManager
pub chan_handler: CM,
/// A message handler which handles messages updating our knowledge of the network channel
/// graph. Usually this is just a NetGraphMsgHandlerMonitor object or an IgnoringMessageHandler.
/// graph. Usually this is just a [`NetGraphMsgHandler`] object or an
/// [`IgnoringMessageHandler`].
///
/// [`NetGraphMsgHandler`]: crate::routing::network_graph::NetGraphMsgHandler
pub route_handler: RM,
}
@ -173,32 +178,35 @@ pub struct MessageHandler<CM: Deref, RM: Deref> where
///
/// For efficiency, Clone should be relatively cheap for this type.
///
/// You probably want to just extend an int and put a file descriptor in a struct and implement
/// send_data. Note that if you are using a higher-level net library that may call close() itself,
/// be careful to ensure you don't have races whereby you might register a new connection with an
/// fd which is the same as a previous one which has yet to be removed via
/// PeerManager::socket_disconnected().
/// Two descriptors may compare equal (by [`cmp::Eq`] and [`hash::Hash`]) as long as the original
/// has been disconnected, the [`PeerManager`] has been informed of the disconnection (either by it
/// having triggered the disconnection or a call to [`PeerManager::socket_disconnected`]), and no
/// further calls to the [`PeerManager`] related to the original socket occur. This allows you to
/// use a file descriptor for your SocketDescriptor directly, however for simplicity you may wish
/// to simply use another value which is guaranteed to be globally unique instead.
pub trait SocketDescriptor : cmp::Eq + hash::Hash + Clone {
/// Attempts to send some data from the given slice to the peer.
///
/// Returns the amount of data which was sent, possibly 0 if the socket has since disconnected.
/// Note that in the disconnected case, socket_disconnected must still fire and further write
/// attempts may occur until that time.
/// Note that in the disconnected case, [`PeerManager::socket_disconnected`] must still be
/// called and further write attempts may occur until that time.
///
/// If the returned size is smaller than data.len(), a write_available event must
/// trigger the next time more data can be written. Additionally, until the a send_data event
/// completes fully, no further read_events should trigger on the same peer!
/// If the returned size is smaller than `data.len()`, a
/// [`PeerManager::write_buffer_space_avail`] call must be made the next time more data can be
/// written. Additionally, until a `send_data` event completes fully, no further
/// [`PeerManager::read_event`] calls should be made for the same peer! Because this is to
/// prevent denial-of-service issues, you should not read or buffer any data from the socket
/// until then.
///
/// If a read_event on this descriptor had previously returned true (indicating that read
/// events should be paused to prevent DoS in the send buffer), resume_read may be set
/// indicating that read events on this descriptor should resume. A resume_read of false does
/// *not* imply that further read events should be paused.
/// If a [`PeerManager::read_event`] call on this descriptor had previously returned true
/// (indicating that read events should be paused to prevent DoS in the send buffer),
/// `resume_read` may be set indicating that read events on this descriptor should resume. A
/// `resume_read` of false carries no meaning, and should not cause any action.
fn send_data(&mut self, data: &[u8], resume_read: bool) -> usize;
/// Disconnect the socket pointed to by this SocketDescriptor. Once this function returns, no
/// more calls to write_buffer_space_avail, read_event or socket_disconnected may be made with
/// this descriptor. No socket_disconnected call should be generated as a result of this call,
/// though races may occur whereby disconnect_socket is called after a call to
/// socket_disconnected but prior to socket_disconnected returning.
/// Disconnect the socket pointed to by this SocketDescriptor.
///
/// You do *not* need to call [`PeerManager::socket_disconnected`] with this socket after this
/// call (doing so is a noop).
fn disconnect_socket(&mut self);
}
@ -312,14 +320,25 @@ pub type SimpleArcPeerManager<SD, M, T, F, C, L> = PeerManager<SD, Arc<SimpleArc
/// helps with issues such as long function definitions.
pub type SimpleRefPeerManager<'a, 'b, 'c, 'd, 'e, 'f, 'g, SD, M, T, F, C, L> = PeerManager<SD, SimpleRefChannelManager<'a, 'b, 'c, 'd, 'e, M, T, F, L>, &'e NetGraphMsgHandler<&'g C, &'f L>, &'f L>;
/// A PeerManager manages a set of peers, described by their SocketDescriptor and marshalls socket
/// events into messages which it passes on to its MessageHandlers.
/// A PeerManager manages a set of peers, described by their [`SocketDescriptor`] and marshalls
/// socket events into messages which it passes on to its [`MessageHandler`].
///
/// Locks are taken internally, so you must never assume that reentrancy from a
/// [`SocketDescriptor`] call back into [`PeerManager`] methods will not deadlock.
///
/// Calls to [`read_event`] will decode relevant messages and pass them to the
/// [`ChannelMessageHandler`], likely doing message processing in-line. Thus, the primary form of
/// parallelism in Rust-Lightning is in calls to [`read_event`]. Note, however, that calls to any
/// [`PeerManager`] functions related to the same connection must occur only in serial, making new
/// calls only after previous ones have returned.
///
/// Rather than using a plain PeerManager, it is preferable to use either a SimpleArcPeerManager
/// a SimpleRefPeerManager, for conciseness. See their documentation for more details, but
/// essentially you should default to using a SimpleRefPeerManager, and use a
/// SimpleArcPeerManager when you require a PeerManager with a static lifetime, such as when
/// you're using lightning-net-tokio.
///
/// [`read_event`]: PeerManager::read_event
pub struct PeerManager<Descriptor: SocketDescriptor, CM: Deref, RM: Deref, L: Deref> where
CM::Target: ChannelMessageHandler,
RM::Target: RoutingMessageHandler,
@ -400,8 +419,6 @@ impl<Descriptor: SocketDescriptor, RM: Deref, L: Deref> PeerManager<Descriptor,
}
}
/// Manages and reacts to connection events. You probably want to use file descriptors as PeerIds.
/// PeerIds may repeat, but only after socket_disconnected() has been called.
impl<Descriptor: SocketDescriptor, CM: Deref, RM: Deref, L: Deref> PeerManager<Descriptor, CM, RM, L> where
CM::Target: ChannelMessageHandler,
RM::Target: RoutingMessageHandler,
@ -461,8 +478,10 @@ impl<Descriptor: SocketDescriptor, CM: Deref, RM: Deref, L: Deref> PeerManager<D
///
/// Returns a small number of bytes to send to the remote node (currently always 50).
///
/// Panics if descriptor is duplicative with some other descriptor which has not yet had a
/// socket_disconnected().
/// Panics if descriptor is duplicative with some other descriptor which has not yet been
/// [`socket_disconnected()`].
///
/// [`socket_disconnected()`]: PeerManager::socket_disconnected
pub fn new_outbound_connection(&self, their_node_id: PublicKey, descriptor: Descriptor) -> Result<Vec<u8>, PeerHandleError> {
let mut peer_encryptor = PeerChannelEncryptor::new_outbound(their_node_id.clone(), self.get_ephemeral_key());
let res = peer_encryptor.get_act_one().to_vec();
@ -498,8 +517,10 @@ impl<Descriptor: SocketDescriptor, CM: Deref, RM: Deref, L: Deref> PeerManager<D
/// call socket_disconnected for the new descriptor but must disconnect the connection
/// immediately.
///
/// Panics if descriptor is duplicative with some other descriptor which has not yet had
/// socket_disconnected called.
/// Panics if descriptor is duplicative with some other descriptor which has not yet been
/// [`socket_disconnected()`].
///
/// [`socket_disconnected()`]: PeerManager::socket_disconnected
pub fn new_inbound_connection(&self, descriptor: Descriptor) -> Result<(), PeerHandleError> {
let peer_encryptor = PeerChannelEncryptor::new_inbound(&self.our_node_secret);
let pending_read_buffer = [0; 50].to_vec(); // Noise act one is 50 bytes
@ -607,16 +628,23 @@ impl<Descriptor: SocketDescriptor, CM: Deref, RM: Deref, L: Deref> PeerManager<D
///
/// May return an Err to indicate that the connection should be closed.
///
/// Will most likely call send_data on the descriptor passed in (or the descriptor handed into
/// new_*\_connection) before returning. Thus, be very careful with reentrancy issues! The
/// invariants around calling write_buffer_space_avail in case a write did not fully complete
/// must still hold - be ready to call write_buffer_space_avail again if a write call generated
/// here isn't sufficient! Panics if the descriptor was not previously registered in a
/// new_\*_connection event.
/// May call [`send_data`] on the descriptor passed in (or an equal descriptor) before
/// returning. Thus, be very careful with reentrancy issues! The invariants around calling
/// [`write_buffer_space_avail`] in case a write did not fully complete must still hold - be
/// ready to call `[write_buffer_space_avail`] again if a write call generated here isn't
/// sufficient!
///
/// [`send_data`]: SocketDescriptor::send_data
/// [`write_buffer_space_avail`]: PeerManager::write_buffer_space_avail
pub fn write_buffer_space_avail(&self, descriptor: &mut Descriptor) -> Result<(), PeerHandleError> {
let mut peers = self.peers.lock().unwrap();
match peers.peers.get_mut(descriptor) {
None => panic!("Descriptor for write_event is not already known to PeerManager"),
None => {
// This is most likely a simple race condition where the user found that the socket
// was writeable, then we told the user to `disconnect_socket()`, then they called
// this method. Return an error to make sure we get disconnected.
return Err(PeerHandleError { no_connection_possible: false });
},
Some(peer) => {
peer.awaiting_write_event = false;
self.do_attempt_write_data(descriptor, peer);
@ -629,14 +657,16 @@ impl<Descriptor: SocketDescriptor, CM: Deref, RM: Deref, L: Deref> PeerManager<D
///
/// May return an Err to indicate that the connection should be closed.
///
/// Will *not* call back into send_data on any descriptors to avoid reentrancy complexity.
/// Thus, however, you almost certainly want to call process_events() after any read_event to
/// generate send_data calls to handle responses.
/// Will *not* call back into [`send_data`] on any descriptors to avoid reentrancy complexity.
/// Thus, however, you should call [`process_events`] after any `read_event` to generate
/// [`send_data`] calls to handle responses.
///
/// If Ok(true) is returned, further read_events should not be triggered until a send_data call
/// on this file descriptor has resume_read set (preventing DoS issues in the send buffer).
/// If `Ok(true)` is returned, further read_events should not be triggered until a
/// [`send_data`] call on this descriptor has `resume_read` set (preventing DoS issues in the
/// send buffer).
///
/// Panics if the descriptor was not previously registered in a new_*_connection event.
/// [`send_data`]: SocketDescriptor::send_data
/// [`process_events`]: PeerManager::process_events
pub fn read_event(&self, peer_descriptor: &mut Descriptor, data: &[u8]) -> Result<bool, PeerHandleError> {
match self.do_read_event(peer_descriptor, data) {
Ok(res) => Ok(res),
@ -664,7 +694,12 @@ impl<Descriptor: SocketDescriptor, CM: Deref, RM: Deref, L: Deref> PeerManager<D
let mut msgs_to_forward = Vec::new();
let mut peer_node_id = None;
let pause_read = match peers.peers.get_mut(peer_descriptor) {
None => panic!("Descriptor for read_event is not already known to PeerManager"),
None => {
// This is most likely a simple race condition where the user read some bytes
// from the socket, then we told the user to `disconnect_socket()`, then they
// called this method. Return an error to make sure we get disconnected.
return Err(PeerHandleError { no_connection_possible: false });
},
Some(peer) => {
assert!(peer.pending_read_buffer.len() > 0);
assert!(peer.pending_read_buffer.len() > peer.pending_read_buffer_pos);
@ -1079,7 +1114,14 @@ impl<Descriptor: SocketDescriptor, CM: Deref, RM: Deref, L: Deref> PeerManager<D
/// Checks for any events generated by our handlers and processes them. Includes sending most
/// response messages as well as messages generated by calls to handler functions directly (eg
/// functions like ChannelManager::process_pending_htlc_forward or send_payment).
/// functions like [`ChannelManager::process_pending_htlc_forwards`] or [`send_payment`]).
///
/// May call [`send_data`] on [`SocketDescriptor`]s. Thus, be very careful with reentrancy
/// issues!
///
/// [`send_payment`]: crate::ln::channelmanager::ChannelManager::send_payment
/// [`ChannelManager::process_pending_htlc_forwards`]: crate::ln::channelmanager::ChannelManager::process_pending_htlc_forwards
/// [`send_data`]: SocketDescriptor::send_data
pub fn process_events(&self) {
{
// TODO: There are some DoS attacks here where you can flood someone's outbound send
@ -1291,13 +1333,6 @@ impl<Descriptor: SocketDescriptor, CM: Deref, RM: Deref, L: Deref> PeerManager<D
}
/// Indicates that the given socket descriptor's connection is now closed.
///
/// This must only be called if the socket has been disconnected by the peer or your own
/// decision to disconnect it and must NOT be called in any case where other parts of this
/// library (eg PeerHandleError, explicit disconnect_socket calls) instruct you to disconnect
/// the peer.
///
/// Panics if the descriptor was not previously registered in a successful new_*_connection event.
pub fn socket_disconnected(&self, descriptor: &Descriptor) {
self.disconnect_event_internal(descriptor, false);
}
@ -1306,7 +1341,11 @@ impl<Descriptor: SocketDescriptor, CM: Deref, RM: Deref, L: Deref> PeerManager<D
let mut peers = self.peers.lock().unwrap();
let peer_option = peers.peers.remove(descriptor);
match peer_option {
None => panic!("Descriptor for disconnect_event is not already known to PeerManager"),
None => {
// This is most likely a simple race condition where the user found that the socket
// was disconnected, then we told the user to `disconnect_socket()`, then they
// called this method. Either way we're disconnected, return.
},
Some(peer) => {
match peer.their_node_id {
Some(node_id) => {
@ -1321,11 +1360,13 @@ impl<Descriptor: SocketDescriptor, CM: Deref, RM: Deref, L: Deref> PeerManager<D
/// Disconnect a peer given its node id.
///
/// Set no_connection_possible to true to prevent any further connection with this peer,
/// Set `no_connection_possible` to true to prevent any further connection with this peer,
/// force-closing any channels we have with it.
///
/// If a peer is connected, this will call `disconnect_socket` on the descriptor for the peer,
/// so be careful about reentrancy issues.
/// If a peer is connected, this will call [`disconnect_socket`] on the descriptor for the
/// peer. Thus, be very careful about reentrancy issues.
///
/// [`disconnect_socket`]: SocketDescriptor::disconnect_socket
pub fn disconnect_by_node_id(&self, node_id: PublicKey, no_connection_possible: bool) {
let mut peers_lock = self.peers.lock().unwrap();
if let Some(mut descriptor) = peers_lock.node_id_to_descriptor.remove(&node_id) {
@ -1337,9 +1378,13 @@ impl<Descriptor: SocketDescriptor, CM: Deref, RM: Deref, L: Deref> PeerManager<D
}
/// This function should be called roughly once every 30 seconds.
/// It will send pings to each peer and disconnect those which did not respond to the last round of pings.
/// Will most likely call send_data on all of the registered descriptors, thus, be very careful with reentrancy issues!
/// It will send pings to each peer and disconnect those which did not respond to the last
/// round of pings.
///
/// May call [`send_data`] on all [`SocketDescriptor`]s. Thus, be very careful with reentrancy
/// issues!
///
/// [`send_data`]: SocketDescriptor::send_data
pub fn timer_tick_occurred(&self) {
let mut peers_lock = self.peers.lock().unwrap();
{