Merge pull request #2280 from TheBlueMatt/2023-05-event-deadlock
Never block a thread on the `PeerManager` event handling lock
Commit 64c58a565b
2 changed files with 383 additions and 334 deletions
@@ -36,7 +36,7 @@ use crate::prelude::*;
 use crate::io;
 use alloc::collections::LinkedList;
 use crate::sync::{Arc, Mutex, MutexGuard, FairRwLock};
-use core::sync::atomic::{AtomicBool, AtomicU32, Ordering};
+use core::sync::atomic::{AtomicBool, AtomicU32, AtomicI32, Ordering};
 use core::{cmp, hash, fmt, mem};
 use core::ops::Deref;
 use core::convert::Infallible;
@@ -696,15 +696,18 @@ pub struct PeerManager<Descriptor: SocketDescriptor, CM: Deref, RM: Deref, OM: D
 	/// lock held. Entries may be added with only the `peers` read lock held (though the
 	/// `Descriptor` value must already exist in `peers`).
 	node_id_to_descriptor: Mutex<HashMap<PublicKey, Descriptor>>,
-	/// We can only have one thread processing events at once, but we don't usually need the full
-	/// `peers` write lock to do so, so instead we block on this empty mutex when entering
-	/// `process_events`.
-	event_processing_lock: Mutex<()>,
-	/// Because event processing is global and always does all available work before returning,
-	/// there is no reason for us to have many event processors waiting on the lock at once.
-	/// Instead, we limit the total blocked event processors to always exactly one by setting this
-	/// when an event process call is waiting.
-	blocked_event_processors: AtomicBool,
+	/// We can only have one thread processing events at once, but if a second call to
+	/// `process_events` happens while a first call is in progress, one of the two calls needs to
+	/// start from the top to ensure any new messages are also handled.
+	///
+	/// Because the event handler calls into user code which may block, we don't want to block a
+	/// second thread waiting for another thread to handle events which is then blocked on user
+	/// code, so we store an atomic counter here:
+	///  * 0 indicates no event processor is running
+	///  * 1 indicates an event processor is running
+	///  * > 1 indicates an event processor is running but needs to start again from the top once
+	///    it finishes as another thread tried to start processing events but returned early.
+	event_processing_state: AtomicI32,
 
 	/// Used to track the last value sent in a node_announcement "timestamp" field. We ensure this
 	/// value increases strictly since we don't assume access to a time source.
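
The new field above replaces the old `event_processing_lock` / `blocked_event_processors` pair with a single `AtomicI32` counter. A minimal, self-contained sketch of the same three-state protocol follows; the names `EventCoordinator` and `run_exclusive` are illustrative only and are not part of the rust-lightning API:

```rust
use std::sync::atomic::{AtomicI32, Ordering};

// Hypothetical stand-in for the PeerManager's event_processing_state field.
struct EventCoordinator {
	// 0 = idle, 1 = one processor running, >1 = running but must go around again.
	state: AtomicI32,
}

impl EventCoordinator {
	fn new() -> Self {
		Self { state: AtomicI32::new(0) }
	}

	// Runs `handle_events` without ever blocking the caller: late callers just
	// bump the counter and return, and the running caller does one more pass.
	fn run_exclusive(&self, mut handle_events: impl FnMut()) {
		if self.state.fetch_add(1, Ordering::AcqRel) > 0 {
			// Someone else is already processing; our increment tells them to
			// go around again, so we can return immediately.
			return;
		}
		loop {
			handle_events();
			if self.state.fetch_sub(1, Ordering::AcqRel) != 1 {
				// Another caller arrived while we were working; collapse any
				// number of pending requests into exactly one extra pass.
				self.state.store(1, Ordering::Release);
				continue;
			}
			break;
		}
	}
}

fn main() {
	let coord = EventCoordinator::new();
	coord.run_exclusive(|| println!("processing events"));
}
```
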
@@ -874,8 +877,7 @@ impl<Descriptor: SocketDescriptor, CM: Deref, RM: Deref, OM: Deref, L: Deref, CM
 			message_handler,
 			peers: FairRwLock::new(HashMap::new()),
 			node_id_to_descriptor: Mutex::new(HashMap::new()),
-			event_processing_lock: Mutex::new(()),
-			blocked_event_processors: AtomicBool::new(false),
+			event_processing_state: AtomicI32::new(0),
 			ephemeral_key_midstate,
 			peer_counter: AtomicCounter::new(),
 			gossip_processing_backlogged: AtomicBool::new(false),
@@ -1814,27 +1816,13 @@ impl<Descriptor: SocketDescriptor, CM: Deref, RM: Deref, OM: Deref, L: Deref, CM
 	/// [`ChannelManager::process_pending_htlc_forwards`]: crate::ln::channelmanager::ChannelManager::process_pending_htlc_forwards
 	/// [`send_data`]: SocketDescriptor::send_data
 	pub fn process_events(&self) {
-		let mut _single_processor_lock = self.event_processing_lock.try_lock();
-		if _single_processor_lock.is_err() {
-			// While we could wake the older sleeper here with a CV and make more even waiting
-			// times, that would be a lot of overengineering for a simple "reduce total waiter
-			// count" goal.
-			match self.blocked_event_processors.compare_exchange(false, true, Ordering::AcqRel, Ordering::Acquire) {
-				Err(val) => {
-					debug_assert!(val, "compare_exchange failed spuriously?");
+		if self.event_processing_state.fetch_add(1, Ordering::AcqRel) > 0 {
+			// If we're not the first event processor to get here, just return early, the increment
+			// we just did will be treated as "go around again" at the end.
 			return;
-				},
-				Ok(val) => {
-					debug_assert!(!val, "compare_exchange succeeded spuriously?");
-					// We're the only waiter, as the running process_events may have emptied the
-					// pending events "long" ago and there are new events for us to process, wait until
-					// its done and process any leftover events before returning.
-					_single_processor_lock = Ok(self.event_processing_lock.lock().unwrap());
-					self.blocked_event_processors.store(false, Ordering::Release);
-				}
-			}
 		}
 
+		loop {
 			self.update_gossip_backlogged();
 			let flush_read_disabled = self.gossip_processing_backlog_lifted.swap(false, Ordering::Relaxed);
 
@@ -2165,6 +2153,15 @@ impl<Descriptor: SocketDescriptor, CM: Deref, RM: Deref, OM: Deref, L: Deref, CM
 					}
 				}
 			}
 
+			if self.event_processing_state.fetch_sub(1, Ordering::AcqRel) != 1 {
+				// If another thread incremented the state while we were running we should go
+				// around again, but only once.
+				self.event_processing_state.store(1, Ordering::Release);
+				continue;
+			}
+			break;
+		}
 	}
 
 	/// Indicates that the given socket descriptor's connection is now closed.
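
The `fetch_sub` above pairs with the `fetch_add` at the top of `process_events`: a remainder other than one means another caller bumped the counter while we were working, so the state is reset to 1 and exactly one more pass is made. As a rough illustration (reusing the hypothetical `EventCoordinator` sketch from earlier on this page, so the names are again not rust-lightning API), hammering that protocol from several threads never blocks any caller, always runs the handler at least once, and never runs it more times than it was requested:

```rust
use std::sync::Arc;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::thread;

// Assumes the EventCoordinator type from the earlier sketch is in scope.
fn main() {
	let coord = Arc::new(EventCoordinator::new());
	let runs = Arc::new(AtomicUsize::new(0));
	let calls_per_thread = 1000usize;

	let handles: Vec<_> = (0..4).map(|_| {
		let coord = Arc::clone(&coord);
		let runs = Arc::clone(&runs);
		thread::spawn(move || {
			for _ in 0..calls_per_thread {
				// Never blocks: either we process, or the current processor
				// is told to go around once more.
				coord.run_exclusive(|| { runs.fetch_add(1, Ordering::AcqRel); });
			}
		})
	}).collect();
	for handle in handles { handle.join().unwrap(); }

	let total_runs = runs.load(Ordering::Acquire);
	assert!(total_runs >= 1);
	assert!(total_runs <= 4 * calls_per_thread);
	println!("handler ran {} times for {} calls", total_runs, 4 * calls_per_thread);
}
```

The `test_process_events_multithreaded` test added below checks the analogous bounds against the real `PeerManager`.
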
@@ -3004,4 +3001,53 @@ mod tests {
 		// For (None)
 		assert_eq!(filter_addresses(None), None);
 	}
+
+	#[test]
+	#[cfg(feature = "std")]
+	fn test_process_events_multithreaded() {
+		use std::time::{Duration, Instant};
+		// Test that `process_events` getting called on multiple threads doesn't generate too many
+		// loop iterations.
+		// Each time `process_events` goes around the loop we call
+		// `get_and_clear_pending_msg_events`, which we count using the `TestMessageHandler`.
+		// Because the loop should go around once more after a call which fails to take the
+		// single-threaded lock, if we write zero to the counter before calling `process_events` we
+		// should never observe there having been more than 2 loop iterations.
+		// Further, because the last thread to exit will call `process_events` before returning, we
+		// should always have at least one count at the end.
+		let cfg = Arc::new(create_peermgr_cfgs(1));
+		// Until we have std::thread::scoped we have to unsafe { turn off the borrow checker }.
+		let peer = Arc::new(create_network(1, unsafe { &*(&*cfg as *const _) as &'static _ }).pop().unwrap());
+
+		let exit_flag = Arc::new(AtomicBool::new(false));
+		macro_rules! spawn_thread { () => { {
+			let thread_cfg = Arc::clone(&cfg);
+			let thread_peer = Arc::clone(&peer);
+			let thread_exit = Arc::clone(&exit_flag);
+			std::thread::spawn(move || {
+				while !thread_exit.load(Ordering::Acquire) {
+					thread_cfg[0].chan_handler.message_fetch_counter.store(0, Ordering::Release);
+					thread_peer.process_events();
+					std::thread::sleep(Duration::from_micros(1));
+				}
+			})
+		} } }
+
+		let thread_a = spawn_thread!();
+		let thread_b = spawn_thread!();
+		let thread_c = spawn_thread!();
+
+		let start_time = Instant::now();
+		while start_time.elapsed() < Duration::from_millis(100) {
+			let val = cfg[0].chan_handler.message_fetch_counter.load(Ordering::Acquire);
+			assert!(val <= 2);
+			std::thread::yield_now(); // Winblowz seemingly doesn't ever interrupt threads?!
+		}
+
+		exit_flag.store(true, Ordering::Release);
+		thread_a.join().unwrap();
+		thread_b.join().unwrap();
+		thread_c.join().unwrap();
+		assert!(cfg[0].chan_handler.message_fetch_counter.load(Ordering::Acquire) >= 1);
+	}
 }
@@ -362,6 +362,7 @@ pub struct TestChannelMessageHandler {
 	pub pending_events: Mutex<Vec<events::MessageSendEvent>>,
 	expected_recv_msgs: Mutex<Option<Vec<wire::Message<()>>>>,
 	connected_peers: Mutex<HashSet<PublicKey>>,
+	pub message_fetch_counter: AtomicUsize,
 }
 
 impl TestChannelMessageHandler {
@@ -370,6 +371,7 @@ impl TestChannelMessageHandler {
 			pending_events: Mutex::new(Vec::new()),
 			expected_recv_msgs: Mutex::new(None),
 			connected_peers: Mutex::new(HashSet::new()),
+			message_fetch_counter: AtomicUsize::new(0),
 		}
 	}
 
@@ -520,6 +522,7 @@ impl msgs::ChannelMessageHandler for TestChannelMessageHandler {
 
 impl events::MessageSendEventsProvider for TestChannelMessageHandler {
 	fn get_and_clear_pending_msg_events(&self) -> Vec<events::MessageSendEvent> {
+		self.message_fetch_counter.fetch_add(1, Ordering::AcqRel);
 		let mut pending_events = self.pending_events.lock().unwrap();
 		let mut ret = Vec::new();
 		mem::swap(&mut ret, &mut *pending_events);