From 293cf3c2b21940f4425f4f6e664b0c6f03cd44ff Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 23 Mar 2022 09:31:36 +1030 Subject: [PATCH] connect: delay return until all subds ready. We had some flakes because we returned from `connect`, but we hadn't started subds yet. Signed-off-by: Rusty Russell --- doc/lightning-connect.7.md | 4 +++ lightningd/peer_control.c | 57 ++++++++++++++++++++------------------ 2 files changed, 34 insertions(+), 27 deletions(-) diff --git a/doc/lightning-connect.7.md b/doc/lightning-connect.7.md index e596f7a76..c2e9851f7 100644 --- a/doc/lightning-connect.7.md +++ b/doc/lightning-connect.7.md @@ -36,6 +36,10 @@ Connecting to a node is just the first step in opening a channel with another node. Once the peer is connected a channel can be opened with lightning-fundchannel(7). +If there are active channels with the peer, **connect** returns once +all the subdaemons are in place to handle the channels, not just once +it's connected. + RETURN VALUE ------------ diff --git a/lightningd/peer_control.c b/lightningd/peer_control.c index 022c0cf8c..b1968cfa2 100644 --- a/lightningd/peer_control.c +++ b/lightningd/peer_control.c @@ -997,23 +997,7 @@ static void peer_connected_hook_final(struct peer_connected_hook_payload *payloa } #endif - switch (channel->state) { - case CLOSED: - /* Channel should not have been loaded */ - abort(); - case ONCHAIN: - case FUNDING_SPEND_SEEN: - case CLOSINGD_COMPLETE: - case AWAITING_UNILATERAL: - /* We don't send anything here; if they talk about - * this channel they'll get an error. */ - continue; - case DUALOPEND_OPEN_INIT: - case DUALOPEND_AWAITING_LOCKIN: - case CHANNELD_AWAITING_LOCKIN: - case CHANNELD_NORMAL: - case CHANNELD_SHUTTING_DOWN: - case CLOSINGD_SIGEXCHANGE: + if (channel_active(channel)) { log_debug(channel->log, "Peer has reconnected, state %s: telling connectd to make active", channel_state_name(channel)); @@ -1021,11 +1005,7 @@ static void peer_connected_hook_final(struct peer_connected_hook_payload *payloa subd_send_msg(ld->connectd, take(towire_connectd_peer_make_active(NULL, &peer->id, &channel->cid))); - continue; } - - /* Oops, channel->state is corrupted? */ - abort(); } return; @@ -1149,6 +1129,20 @@ REGISTER_PLUGIN_HOOK(peer_connected, peer_connected_serialize, struct peer_connected_hook_payload *); +/* Returns true if we're still waiting for subds for active channels */ +static bool peer_subds_pending(const struct peer *peer) +{ + struct channel *channel; + + list_for_each(&peer->channels, channel, list) { + if (!channel_active(channel)) + continue; + if (!channel->owner) + return true; + } + return false; +} + /* Connectd tells us a peer has connected: it never hands us duplicates, since * it holds them until we say peer_disconnected. */ void peer_connected(struct lightningd *ld, const u8 *msg) @@ -1184,8 +1178,11 @@ void peer_connected(struct lightningd *ld, const u8 *msg) tal_steal(peer, hook_payload); hook_payload->peer = peer; - /* Complete any outstanding connect commands. */ - connect_succeeded(ld, peer, hook_payload->incoming, &hook_payload->addr); + /* Complete any outstanding connect commands: as a hack, we delay here if we + * are going to make them active (so when connect returns, the channels are ready). + * So we also wake these up if the connection dies before that! */ + if (!peer_subds_pending(peer)) + connect_succeeded(ld, peer, hook_payload->incoming, &hook_payload->addr); /* Can't be opening, since we wouldn't have sent peer_disconnected. */ assert(!peer->uncommitted_channel); @@ -1217,7 +1214,6 @@ void peer_active(struct lightningd *ld, const u8 *msg, int fd) u8 *error; struct peer_fd *peer_fd = new_peer_fd(tmpctx, fd); - /* FIXME: Use msgtype to determine what to do! */ if (!fromwire_connectd_peer_active(msg, msg, &id, &msgtype, &channel_id)) fatal("Connectd gave bad CONNECTD_PEER_ACTIVE message %s", tal_hex(msg, msg)); @@ -1261,13 +1257,13 @@ void peer_active(struct lightningd *ld, const u8 *msg, int fd) && channel->open_attempt->open_msg) { if (peer_start_dualopend(peer, peer_fd, channel)) subd_send_msg(channel->owner, channel->open_attempt->open_msg); - return; + goto subd_setup_done; } /* Fall through. */ case DUALOPEND_AWAITING_LOCKIN: assert(!channel->owner); peer_restart_dualopend(peer, peer_fd, channel); - return; + goto subd_setup_done; case CHANNELD_AWAITING_LOCKIN: case CHANNELD_NORMAL: case CHANNELD_SHUTTING_DOWN: @@ -1277,7 +1273,7 @@ void peer_active(struct lightningd *ld, const u8 *msg, int fd) peer_fd, NULL, true, NULL); - return; + goto subd_setup_done; } abort(); } @@ -1377,6 +1373,13 @@ send_error: subd_send_msg(ld->connectd, take(towire_connectd_peer_final_msg(NULL, &peer->id, error))); + return; + +subd_setup_done: + /* We deferred connect_succeeded to here, so subd would be ready once + * `connect` returns. */ + if (!peer_subds_pending(peer)) + connect_succeeded(ld, peer, peer->connected_incoming, &peer->addr); } struct disconnect_command {