gossip: Fix a race condition between release_peer and fail_peer

There was a race condition that would trigger an assertion failure and
crash the daemon if a call to release_peer was interleaved with a
fail_peer. The release_peer was making the peer non-local, which then
caused the assertion in fail_peer to fail. Now we just handle 3 cases:
not found, local, and non-local.

Signed-off-by: Christian Decker <decker.christian@gmail.com>
This commit is contained in:
Christian Decker 2017-08-14 22:44:44 +02:00 committed by Rusty Russell
parent 9ad8365589
commit 6dbd99ddc6

View File

@ -442,9 +442,11 @@ static struct io_plan *release_peer(struct io_conn *conn, struct daemon *daemon,
"%s", tal_hex(trc, msg));
peer = find_peer(daemon, unique_id);
if (!peer) {
if (!peer || !peer->local) {
/* This can happen with a reconnect vs connect race.
* See gossip_peer_released in master daemon. */
* See gossip_peer_released in master daemon. It may
* also happen if we asked to release just before
* failing the peer. */
daemon_conn_send(&daemon->master,
take(towire_gossipctl_release_peer_replyfail(msg)));
} else {
@ -470,11 +472,13 @@ static struct io_plan *fail_peer(struct io_conn *conn, struct daemon *daemon,
peer = find_peer(daemon, unique_id);
if (!peer)
status_trace("Unknown fail_peer %"PRIu64, unique_id);
else {
assert(peer->local);
else if (peer->local) {
status_trace("fail_peer %"PRIu64, unique_id);
/* This owns the peer, so we can free it */
io_close(peer->conn);
} else {
status_trace("Could not fail_peer %"PRIu64", it's not local",
unique_id);
}
return daemon_conn_read_next(conn, &daemon->master);