From 4703d4e72565ddfd150b9368ea036f4973fd7590 Mon Sep 17 00:00:00 2001 From: Matt Corallo Date: Thu, 17 Jun 2021 22:30:09 +0000 Subject: [PATCH 1/3] Do not require that no calls are made post-disconnect_socket The only practical way to meet this requirement is to block disconnect_socket until any pending events are fully processed, leading to this trivial deadlock: * Thread 1: select() woken up due to a read event * Thread 2: Event processing causes a disconnect_socket call to fire while the PeerManager lock is held. * Thread 2: disconnect_socket blocks until the read event in thread 1 completes. * Thread 1: bytes are read from the socket and PeerManager::read_event is called, waiting on the lock still held by thread 2. There isn't a trivial way to address this deadlock without simply making the final read_event call return immediately, which we do here. This also implies that users can freely call event methods after disconnect_socket, but only so far as the socket descriptor is different from any later socket descriptor (ie until the file descriptor is re-used). --- lightning/src/ln/peer_handler.rs | 37 +++++++++++++++++++------------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/lightning/src/ln/peer_handler.rs b/lightning/src/ln/peer_handler.rs index 2eb9b2e78..a701200be 100644 --- a/lightning/src/ln/peer_handler.rs +++ b/lightning/src/ln/peer_handler.rs @@ -194,11 +194,8 @@ pub trait SocketDescriptor : cmp::Eq + hash::Hash + Clone { /// indicating that read events on this descriptor should resume. A resume_read of false does /// *not* imply that further read events should be paused. fn send_data(&mut self, data: &[u8], resume_read: bool) -> usize; - /// Disconnect the socket pointed to by this SocketDescriptor. Once this function returns, no - /// more calls to write_buffer_space_avail, read_event or socket_disconnected may be made with - /// this descriptor. No socket_disconnected call should be generated as a result of this call, - /// though races may occur whereby disconnect_socket is called after a call to - /// socket_disconnected but prior to socket_disconnected returning. + /// Disconnect the socket pointed to by this SocketDescriptor. + /// No [`PeerManager::socket_disconnected`] call need be generated as a result of this call. fn disconnect_socket(&mut self); } @@ -616,7 +613,12 @@ impl PeerManager Result<(), PeerHandleError> { let mut peers = self.peers.lock().unwrap(); match peers.peers.get_mut(descriptor) { - None => panic!("Descriptor for write_event is not already known to PeerManager"), + None => { + // This is most likely a simple race condition where the user found that the socket + // was writeable, then we told the user to `disconnect_socket()`, then they called + // this method. Return an error to make sure we get disconnected. + return Err(PeerHandleError { no_connection_possible: false }); + }, Some(peer) => { peer.awaiting_write_event = false; self.do_attempt_write_data(descriptor, peer); @@ -636,7 +638,6 @@ impl PeerManager Result { match self.do_read_event(peer_descriptor, data) { Ok(res) => Ok(res), @@ -664,7 +665,12 @@ impl PeerManager panic!("Descriptor for read_event is not already known to PeerManager"), + None => { + // This is most likely a simple race condition where the user read some bytes + // from the socket, then we told the user to `disconnect_socket()`, then they + // called this method. Return an error to make sure we get disconnected. + return Err(PeerHandleError { no_connection_possible: false }); + }, Some(peer) => { assert!(peer.pending_read_buffer.len() > 0); assert!(peer.pending_read_buffer.len() > peer.pending_read_buffer_pos); @@ -1292,12 +1298,9 @@ impl PeerManager PeerManager panic!("Descriptor for disconnect_event is not already known to PeerManager"), + None => { + // This is most likely a simple race condition where the user found that the socket + // was disconnected, then we told the user to `disconnect_socket()`, then they + // called this method. Either way we're disconnected, return. + }, Some(peer) => { match peer.their_node_id { Some(node_id) => { From 656ed89388db8efab98021508ca462c7e460c278 Mon Sep 17 00:00:00 2001 From: Matt Corallo Date: Thu, 17 Jun 2021 22:54:06 +0000 Subject: [PATCH 2/3] No longer block disconnect_socket calls in lightning-net-tokio See the previous commit for more information. --- lightning-net-tokio/src/lib.rs | 47 ++++++++++------------------------ 1 file changed, 13 insertions(+), 34 deletions(-) diff --git a/lightning-net-tokio/src/lib.rs b/lightning-net-tokio/src/lib.rs index d102778a4..5f5fece0d 100644 --- a/lightning-net-tokio/src/lib.rs +++ b/lightning-net-tokio/src/lib.rs @@ -83,7 +83,7 @@ use lightning::ln::peer_handler::SocketDescriptor as LnSocketTrait; use lightning::ln::msgs::{ChannelMessageHandler, RoutingMessageHandler}; use lightning::util::logger::Logger; -use std::{task, thread}; +use std::task; use std::net::SocketAddr; use std::net::TcpStream as StdTcpStream; use std::sync::{Arc, Mutex}; @@ -114,11 +114,6 @@ struct Connection { // socket. To wake it up (without otherwise changing its state, we can push a value into this // Sender. read_waker: mpsc::Sender<()>, - // When we are told by rust-lightning to disconnect, we can't return to rust-lightning until we - // are sure we won't call any more read/write PeerManager functions with the same connection. - // This is set to true if we're in such a condition (with disconnect checked before with the - // top-level mutex held) and false when we can return. - block_disconnect_socket: bool, read_paused: bool, rl_requested_disconnect: bool, id: u64, @@ -153,31 +148,24 @@ impl Connection { } } } - macro_rules! prepare_read_write_call { - () => { { - let mut us_lock = us.lock().unwrap(); - if us_lock.rl_requested_disconnect { - shutdown_socket!("disconnect_socket() call from RL", Disconnect::CloseConnection); - } - us_lock.block_disconnect_socket = true; - } } - } - - let read_paused = us.lock().unwrap().read_paused; + let read_paused = { + let us_lock = us.lock().unwrap(); + if us_lock.rl_requested_disconnect { + shutdown_socket!("disconnect_socket() call from RL", Disconnect::CloseConnection); + } + us_lock.read_paused + }; tokio::select! { v = write_avail_receiver.recv() => { assert!(v.is_some()); // We can't have dropped the sending end, its in the us Arc! - prepare_read_write_call!(); if let Err(e) = peer_manager.write_buffer_space_avail(&mut our_descriptor) { shutdown_socket!(e, Disconnect::CloseConnection); } - us.lock().unwrap().block_disconnect_socket = false; }, _ = read_wake_receiver.recv() => {}, read = reader.read(&mut buf), if !read_paused => match read { Ok(0) => shutdown_socket!("Connection closed", Disconnect::PeerDisconnected), Ok(len) => { - prepare_read_write_call!(); let read_res = peer_manager.read_event(&mut our_descriptor, &buf[0..len]); let mut us_lock = us.lock().unwrap(); match read_res { @@ -188,7 +176,6 @@ impl Connection { }, Err(e) => shutdown_socket!(e, Disconnect::CloseConnection), } - us_lock.block_disconnect_socket = false; }, Err(e) => shutdown_socket!(e, Disconnect::PeerDisconnected), }, @@ -223,7 +210,7 @@ impl Connection { (reader, write_receiver, read_receiver, Arc::new(Mutex::new(Self { writer: Some(writer), write_avail, read_waker, read_paused: false, - block_disconnect_socket: false, rl_requested_disconnect: false, + rl_requested_disconnect: false, id: ID_COUNTER.fetch_add(1, Ordering::AcqRel) }))) } @@ -450,18 +437,10 @@ impl peer_handler::SocketDescriptor for SocketDescriptor { } fn disconnect_socket(&mut self) { - { - let mut us = self.conn.lock().unwrap(); - us.rl_requested_disconnect = true; - us.read_paused = true; - // Wake up the sending thread, assuming it is still alive - let _ = us.write_avail.try_send(()); - // Happy-path return: - if !us.block_disconnect_socket { return; } - } - while self.conn.lock().unwrap().block_disconnect_socket { - thread::yield_now(); - } + let mut us = self.conn.lock().unwrap(); + us.rl_requested_disconnect = true; + // Wake up the sending thread, assuming it is still alive + let _ = us.write_avail.try_send(()); } } impl Clone for SocketDescriptor { From 05157b1755c1cf33bb80bc7ac41dfb52b58d2a74 Mon Sep 17 00:00:00 2001 From: Matt Corallo Date: Thu, 17 Jun 2021 22:54:46 +0000 Subject: [PATCH 3/3] Clean up docs on peer_handler significantly. There are various typo and grammatical fixes here, as well as concrete updates to correctness. --- lightning/src/ln/peer_handler.rs | 132 ++++++++++++++++++++----------- 1 file changed, 85 insertions(+), 47 deletions(-) diff --git a/lightning/src/ln/peer_handler.rs b/lightning/src/ln/peer_handler.rs index a701200be..5546227ef 100644 --- a/lightning/src/ln/peer_handler.rs +++ b/lightning/src/ln/peer_handler.rs @@ -160,10 +160,15 @@ pub struct MessageHandler where CM::Target: ChannelMessageHandler, RM::Target: RoutingMessageHandler { /// A message handler which handles messages specific to channels. Usually this is just a - /// ChannelManager object or a ErroringMessageHandler. + /// [`ChannelManager`] object or an [`ErroringMessageHandler`]. + /// + /// [`ChannelManager`]: crate::ln::channelmanager::ChannelManager pub chan_handler: CM, /// A message handler which handles messages updating our knowledge of the network channel - /// graph. Usually this is just a NetGraphMsgHandlerMonitor object or an IgnoringMessageHandler. + /// graph. Usually this is just a [`NetGraphMsgHandler`] object or an + /// [`IgnoringMessageHandler`]. + /// + /// [`NetGraphMsgHandler`]: crate::routing::network_graph::NetGraphMsgHandler pub route_handler: RM, } @@ -173,29 +178,35 @@ pub struct MessageHandler where /// /// For efficiency, Clone should be relatively cheap for this type. /// -/// You probably want to just extend an int and put a file descriptor in a struct and implement -/// send_data. Note that if you are using a higher-level net library that may call close() itself, -/// be careful to ensure you don't have races whereby you might register a new connection with an -/// fd which is the same as a previous one which has yet to be removed via -/// PeerManager::socket_disconnected(). +/// Two descriptors may compare equal (by [`cmp::Eq`] and [`hash::Hash`]) as long as the original +/// has been disconnected, the [`PeerManager`] has been informed of the disconnection (either by it +/// having triggered the disconnection or a call to [`PeerManager::socket_disconnected`]), and no +/// further calls to the [`PeerManager`] related to the original socket occur. This allows you to +/// use a file descriptor for your SocketDescriptor directly, however for simplicity you may wish +/// to simply use another value which is guaranteed to be globally unique instead. pub trait SocketDescriptor : cmp::Eq + hash::Hash + Clone { /// Attempts to send some data from the given slice to the peer. /// /// Returns the amount of data which was sent, possibly 0 if the socket has since disconnected. - /// Note that in the disconnected case, socket_disconnected must still fire and further write - /// attempts may occur until that time. + /// Note that in the disconnected case, [`PeerManager::socket_disconnected`] must still be + /// called and further write attempts may occur until that time. /// - /// If the returned size is smaller than data.len(), a write_available event must - /// trigger the next time more data can be written. Additionally, until the a send_data event - /// completes fully, no further read_events should trigger on the same peer! + /// If the returned size is smaller than `data.len()`, a + /// [`PeerManager::write_buffer_space_avail`] call must be made the next time more data can be + /// written. Additionally, until a `send_data` event completes fully, no further + /// [`PeerManager::read_event`] calls should be made for the same peer! Because this is to + /// prevent denial-of-service issues, you should not read or buffer any data from the socket + /// until then. /// - /// If a read_event on this descriptor had previously returned true (indicating that read - /// events should be paused to prevent DoS in the send buffer), resume_read may be set - /// indicating that read events on this descriptor should resume. A resume_read of false does - /// *not* imply that further read events should be paused. + /// If a [`PeerManager::read_event`] call on this descriptor had previously returned true + /// (indicating that read events should be paused to prevent DoS in the send buffer), + /// `resume_read` may be set indicating that read events on this descriptor should resume. A + /// `resume_read` of false carries no meaning, and should not cause any action. fn send_data(&mut self, data: &[u8], resume_read: bool) -> usize; /// Disconnect the socket pointed to by this SocketDescriptor. - /// No [`PeerManager::socket_disconnected`] call need be generated as a result of this call. + /// + /// You do *not* need to call [`PeerManager::socket_disconnected`] with this socket after this + /// call (doing so is a noop). fn disconnect_socket(&mut self); } @@ -309,14 +320,25 @@ pub type SimpleArcPeerManager = PeerManager = PeerManager, &'e NetGraphMsgHandler<&'g C, &'f L>, &'f L>; -/// A PeerManager manages a set of peers, described by their SocketDescriptor and marshalls socket -/// events into messages which it passes on to its MessageHandlers. +/// A PeerManager manages a set of peers, described by their [`SocketDescriptor`] and marshalls +/// socket events into messages which it passes on to its [`MessageHandler`]. +/// +/// Locks are taken internally, so you must never assume that reentrancy from a +/// [`SocketDescriptor`] call back into [`PeerManager`] methods will not deadlock. +/// +/// Calls to [`read_event`] will decode relevant messages and pass them to the +/// [`ChannelMessageHandler`], likely doing message processing in-line. Thus, the primary form of +/// parallelism in Rust-Lightning is in calls to [`read_event`]. Note, however, that calls to any +/// [`PeerManager`] functions related to the same connection must occur only in serial, making new +/// calls only after previous ones have returned. /// /// Rather than using a plain PeerManager, it is preferable to use either a SimpleArcPeerManager /// a SimpleRefPeerManager, for conciseness. See their documentation for more details, but /// essentially you should default to using a SimpleRefPeerManager, and use a /// SimpleArcPeerManager when you require a PeerManager with a static lifetime, such as when /// you're using lightning-net-tokio. +/// +/// [`read_event`]: PeerManager::read_event pub struct PeerManager where CM::Target: ChannelMessageHandler, RM::Target: RoutingMessageHandler, @@ -397,8 +419,6 @@ impl PeerManager PeerManager where CM::Target: ChannelMessageHandler, RM::Target: RoutingMessageHandler, @@ -458,8 +478,10 @@ impl PeerManager Result, PeerHandleError> { let mut peer_encryptor = PeerChannelEncryptor::new_outbound(their_node_id.clone(), self.get_ephemeral_key()); let res = peer_encryptor.get_act_one().to_vec(); @@ -495,8 +517,10 @@ impl PeerManager Result<(), PeerHandleError> { let peer_encryptor = PeerChannelEncryptor::new_inbound(&self.our_node_secret); let pending_read_buffer = [0; 50].to_vec(); // Noise act one is 50 bytes @@ -604,12 +628,14 @@ impl PeerManager Result<(), PeerHandleError> { let mut peers = self.peers.lock().unwrap(); match peers.peers.get_mut(descriptor) { @@ -631,13 +657,16 @@ impl PeerManager Result { match self.do_read_event(peer_descriptor, data) { Ok(res) => Ok(res), @@ -1085,7 +1114,14 @@ impl PeerManager PeerManager PeerManager PeerManager