lightningd: drive all reconnections out of disconnections.

The only places which should call try_reconnect now are the "connect"
command, and the disconnect path when it decides there's still an
active channel.

This introduces one subtlety: if we disconnect when there's no active
channel, but then the subd makes one, we have to catch that case!

This temporarily reverts "slow" reconnections to fast ones: see next
patch.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Author: Rusty Russell, 2022-07-18 21:42:28 +09:30
Committed by: neil saitug
Parent: a3c4908f4a
Commit: 02e169fd27
12 changed files with 98 additions and 64 deletions
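The catch described in the commit message is the heart of the change, and it appears twice below, once per wallet_commit_channel() variant. A minimal sketch, condensed from those hunks (all names are from the diff; the wallet and wire plumbing between the two halves is elided):

	/* Snapshot, taken before the wallet commit: was any channel active? */
	bool any_active = peer_any_active_channel(channel->peer, NULL);

	/* ... the new channel is committed to the wallet here ... */

	/* We may have disconnected and decided not to reconnect because no
	 * channel was active; the commit above just made one active. */
	if (!any_active && channel->peer->connected == PEER_DISCONNECTED) {
		try_reconnect(channel->peer, channel->peer, 1,
			      &channel->peer->addr);
	}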

View file

@@ -929,9 +929,9 @@ void channel_set_billboard(struct channel *channel, bool perm, const char *str)
 	}
 }
 
-static void err_and_reconnect(struct channel *channel,
-			      const char *why,
-			      u32 seconds_before_reconnect)
+static void channel_err(struct channel *channel,
+			const char *why,
+			u32 seconds_before_reconnect /* FIXME: use this! */)
 {
 	log_info(channel->log, "Peer transient failure in %s: %s",
 		 channel_state_name(channel), why);
@@ -946,29 +946,23 @@ static void err_and_reconnect(struct channel *channel,
 #endif
 	channel_set_owner(channel, NULL);
 
-	/* Their address only useful if we connected to them */
-	try_reconnect(channel, channel->peer, seconds_before_reconnect,
-		      channel->peer->connected_incoming
-		      ? NULL
-		      : &channel->peer->addr);
 }
 
-void channel_fail_reconnect_later(struct channel *channel, const char *fmt, ...)
+void channel_fail_transient_delayreconnect(struct channel *channel, const char *fmt, ...)
 {
 	va_list ap;
 
 	va_start(ap, fmt);
-	err_and_reconnect(channel, tal_vfmt(tmpctx, fmt, ap), 60);
+	channel_err(channel, tal_vfmt(tmpctx, fmt, ap), 60);
 	va_end(ap);
 }
 
-void channel_fail_reconnect(struct channel *channel, const char *fmt, ...)
+void channel_fail_transient(struct channel *channel, const char *fmt, ...)
 {
 	va_list ap;
 
 	va_start(ap, fmt);
-	err_and_reconnect(channel, tal_vfmt(tmpctx, fmt, ap), 1);
+	channel_err(channel, tal_vfmt(tmpctx, fmt, ap), 1);
 	va_end(ap);
 }
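Note the FIXME above: channel_err() currently ignores seconds_before_reconnect, so for now both wrappers behave identically; that is the temporary reversion of "slow" reconnections to fast ones mentioned in the commit message. A hedged sketch of the intended distinction, using only names from this diff:

	/* Callers pick the retry cadence; channel_err() is meant to honour
	 * it (1s vs 60s) once the FIXME is resolved by the next patch. */
	channel_fail_transient(channel, "Disconnected");           /* fast: 1 second */
	channel_fail_transient_delayreconnect(channel, "Warning"); /* slow: 60 seconds */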

View file

@@ -374,11 +374,11 @@ const char *channel_state_str(enum channel_state state);
 void channel_set_owner(struct channel *channel, struct subd *owner);
 
 /* Channel has failed, but can try again. */
-void channel_fail_reconnect(struct channel *channel,
+void channel_fail_transient(struct channel *channel,
 			    const char *fmt, ...) PRINTF_FMT(2,3);
 /* Channel has failed, but can try again after a minute. */
-void channel_fail_reconnect_later(struct channel *channel,
-				  const char *fmt,...) PRINTF_FMT(2,3);
+void channel_fail_transient_delayreconnect(struct channel *channel,
+					   const char *fmt,...) PRINTF_FMT(2,3);
 
 /* Channel has failed, give up on it. */
 void channel_fail_permanent(struct channel *channel,

View file

@@ -319,7 +319,7 @@ static void peer_got_shutdown(struct channel *channel, const u8 *msg)
 								&channel->peer->id,
 								channel->peer->connectd_counter,
 								warning)));
-		channel_fail_reconnect(channel, "Bad shutdown scriptpubkey %s",
+		channel_fail_transient(channel, "Bad shutdown scriptpubkey %s",
 				       tal_hex(tmpctx, scriptpubkey));
 		return;
 	}
@@ -638,8 +638,10 @@ bool peer_start_channeld(struct channel *channel,
 	if (!channel->owner) {
 		log_broken(channel->log, "Could not subdaemon channel: %s",
 			   strerror(errno));
-		channel_fail_reconnect_later(channel,
-					     "Failed to subdaemon channel");
+		/* Disconnect it. */
+		subd_send_msg(ld->connectd,
+			      take(towire_connectd_discard_peer(NULL, &channel->peer->id,
+								channel->peer->connectd_counter)));
 		return false;
 	}

View file

@@ -17,6 +17,7 @@
 #include <common/timeout.h>
 #include <common/type_to_string.h>
 #include <common/utils.h>
+#include <connectd/connectd_wiregen.h>
 #include <errno.h>
 #include <gossipd/gossipd_wiregen.h>
 #include <hsmd/capabilities.h>
@@ -377,8 +378,10 @@ void peer_start_closingd(struct channel *channel, struct peer_fd *peer_fd)
 	if (!channel->owner) {
 		log_broken(channel->log, "Could not subdaemon closing: %s",
 			   strerror(errno));
-		channel_fail_reconnect_later(channel,
-					     "Failed to subdaemon closing");
+		/* Disconnect it. */
+		subd_send_msg(ld->connectd,
+			      take(towire_connectd_discard_peer(NULL, &channel->peer->id,
+								channel->peer->connectd_counter)));
 		return;
 	}

View file

@@ -21,6 +21,7 @@
 #include <lightningd/channel.h>
 #include <lightningd/channel_control.h>
 #include <lightningd/closing_control.h>
+#include <lightningd/connect_control.h>
 #include <lightningd/dual_open_control.h>
 #include <lightningd/gossip_control.h>
 #include <lightningd/hsm_control.h>
@@ -46,14 +47,11 @@ static void channel_disconnect(struct channel *channel,
 	log_(channel->log, level, NULL, false, "%s", desc);
 
 	channel_cleanup_commands(channel, desc);
 
-	if (!reconnect)
-		channel_set_owner(channel, NULL);
-	else
-		channel_fail_reconnect(channel, "%s: %s",
-				       channel->owner ?
-				       channel->owner->name :
-				       "dualopend-dead",
-				       desc);
+	channel_fail_transient(channel, "%s: %s",
+			       channel->owner ?
+			       channel->owner->name :
+			       "dualopend-dead",
+			       desc);
 }
 
 void channel_unsaved_close_conn(struct channel *channel, const char *why)
@@ -1179,6 +1177,7 @@ wallet_commit_channel(struct lightningd *ld,
 {
 	struct amount_msat our_msat, lease_fee_msat;
 	struct channel_inflight *inflight;
+	bool any_active = peer_any_active_channel(channel->peer, NULL);
 
 	if (!amount_sat_to_msat(&our_msat, our_funding)) {
 		log_broken(channel->log, "Unable to convert funds");
@@ -1292,6 +1291,14 @@ wallet_commit_channel(struct lightningd *ld,
 					channel->push);
 	wallet_inflight_add(ld->wallet, inflight);
 
+	/* We might have disconnected and decided we didn't need to
+	 * reconnect because no channels are active.  But the subd
+	 * just made it active! */
+	if (!any_active && channel->peer->connected == PEER_DISCONNECTED) {
+		try_reconnect(channel->peer, channel->peer, 1,
+			      &channel->peer->addr);
+	}
+
 	return inflight;
 }
@@ -1348,13 +1355,13 @@ static void handle_peer_wants_to_close(struct subd *dualopend,
 					  "Bad shutdown scriptpubkey %s",
 					  tal_hex(tmpctx, scriptpubkey));
 
-		/* Get connectd to send warning, and then allow reconnect. */
+		/* Get connectd to send warning, and kill subd. */
 		subd_send_msg(ld->connectd,
 			      take(towire_connectd_peer_final_msg(NULL,
 								  &channel->peer->id,
 								  channel->peer->connectd_counter,
 								  warning)));
-		channel_fail_reconnect(channel, "Bad shutdown scriptpubkey %s",
+		channel_fail_transient(channel, "Bad shutdown scriptpubkey %s",
 				       tal_hex(tmpctx, scriptpubkey));
 		return;
 	}
@@ -3408,8 +3415,10 @@ bool peer_restart_dualopend(struct peer *peer,
 	if (!channel->owner) {
 		log_broken(channel->log, "Could not subdaemon channel: %s",
 			   strerror(errno));
-		channel_fail_reconnect_later(channel,
-					     "Failed to subdaemon channel");
+		/* Disconnect it. */
+		subd_send_msg(peer->ld->connectd,
+			      take(towire_connectd_discard_peer(NULL, &channel->peer->id,
+								channel->peer->connectd_counter)));
 		return false;
 	}
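All three "Could not subdaemon" hunks (peer_start_channeld, peer_start_closingd, and peer_restart_dualopend above) now share one shape: on subdaemon startup failure, lightningd no longer schedules a reconnect itself; it asks connectd to drop the peer and leaves any reconnect decision to the disconnect path. A condensed sketch, names from the diff:

	/* Can't start the subdaemon: disconnect the peer, and let the
	 * disconnect handler decide whether an active channel still
	 * justifies a reconnect. */
	subd_send_msg(ld->connectd,
		      take(towire_connectd_discard_peer(NULL, &channel->peer->id,
							channel->peer->connectd_counter)));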

View file

@@ -19,6 +19,7 @@
 #include <lightningd/chaintopology.h>
 #include <lightningd/channel.h>
 #include <lightningd/channel_control.h>
+#include <lightningd/connect_control.h>
 #include <lightningd/hsm_control.h>
 #include <lightningd/notification.h>
 #include <lightningd/opening_common.h>
@@ -100,6 +101,7 @@ wallet_commit_channel(struct lightningd *ld,
 	u32 lease_start_blockheight = 0; /* No leases on v1 */
 	struct short_channel_id *alias_local;
 	struct timeabs timestamp;
+	bool any_active = peer_any_active_channel(uc->peer, NULL);
 
 	/* We cannot both be the fundee *and* have a `fundchannel_start`
 	 * command running!
@@ -233,6 +235,15 @@ wallet_commit_channel(struct lightningd *ld,
 				 channel->state_change_cause,
 				 "new channel opened");
 
+	/* We might have disconnected and decided we didn't need to
+	 * reconnect because no channels are active.  But the subd
+	 * just made it active! */
+	if (!any_active && channel->peer->connected == PEER_DISCONNECTED) {
+		try_reconnect(channel->peer, channel->peer, 1,
+			      &channel->peer->addr);
+	}
+
 	return channel;
 }

View file

@@ -166,7 +166,7 @@ static void peer_channels_cleanup(struct lightningd *ld,
 		c = channels[i];
 		if (channel_active(c)) {
 			channel_cleanup_commands(c, "Disconnected");
-			channel_fail_reconnect(c, "Disconnected");
+			channel_fail_transient(c, "Disconnected");
 		} else if (channel_unsaved(c)) {
 			channel_unsaved_close_conn(c, "Disconnected");
 		}
@@ -357,7 +357,7 @@ void channel_errmsg(struct channel *channel,
 	/* No peer_fd means a subd crash or disconnection. */
 	if (!peer_fd) {
 		/* If the channel is unsaved, we forget it */
-		channel_fail_reconnect(channel, "%s: %s",
+		channel_fail_transient(channel, "%s: %s",
 				       channel->owner->name, desc);
 		return;
 	}
@@ -371,8 +371,8 @@ void channel_errmsg(struct channel *channel,
 	 * and we would close the channel on them. We now support warnings
 	 * for this case. */
 	if (warning) {
-		channel_fail_reconnect_later(channel, "%s WARNING: %s",
-					     channel->owner->name, desc);
+		channel_fail_transient_delayreconnect(channel, "%s WARNING: %s",
+						      channel->owner->name, desc);
 		return;
 	}
@@ -1731,9 +1731,21 @@ static enum watch_result funding_depth_cb(struct lightningd *ld,
 	} else if (!short_channel_id_eq(channel->scid, &scid) &&
 		   !is_stub_scid(channel->scid)) {
-		/* This normally restarts channeld, initialized with updated scid
+		/* Send warning: that will make connectd disconnect, and then we'll
+		 * try to reconnect. */
+		u8 *warning = towire_warningfmt(tmpctx, &channel->cid,
+						"short_channel_id changed to %s (was %s)",
+						short_channel_id_to_str(tmpctx, &scid),
+						short_channel_id_to_str(tmpctx, channel->scid));
+		if (channel->peer->connected != PEER_DISCONNECTED)
+			subd_send_msg(ld->connectd,
+				      take(towire_connectd_peer_final_msg(NULL,
+									  &channel->peer->id,
+									  channel->peer->connectd_counter,
+									  warning)));
+
+		/* When we restart channeld, it will be initialized with updated scid
 		 * and also adds it (at least our halve_chan) to rtable. */
-		channel_fail_reconnect(channel,
+		channel_fail_transient_delayreconnect(channel,
 				       "short_channel_id changed to %s (was %s)",
 				       short_channel_id_to_str(tmpctx, &scid),
 				       short_channel_id_to_str(tmpctx, channel->scid));

View file

@@ -68,14 +68,14 @@ void channel_fail_permanent(struct channel *channel UNNEEDED,
 			    const char *fmt UNNEEDED,
 			    ...)
 { fprintf(stderr, "channel_fail_permanent called!\n"); abort(); }
-/* Generated stub for channel_fail_reconnect */
-void channel_fail_reconnect(struct channel *channel UNNEEDED,
+/* Generated stub for channel_fail_transient */
+void channel_fail_transient(struct channel *channel UNNEEDED,
 			    const char *fmt UNNEEDED, ...)
-{ fprintf(stderr, "channel_fail_reconnect called!\n"); abort(); }
-/* Generated stub for channel_fail_reconnect_later */
-void channel_fail_reconnect_later(struct channel *channel UNNEEDED,
-				  const char *fmt UNNEEDED,...)
-{ fprintf(stderr, "channel_fail_reconnect_later called!\n"); abort(); }
+{ fprintf(stderr, "channel_fail_transient called!\n"); abort(); }
+/* Generated stub for channel_fail_transient_delayreconnect */
+void channel_fail_transient_delayreconnect(struct channel *channel UNNEEDED,
+					   const char *fmt UNNEEDED,...)
+{ fprintf(stderr, "channel_fail_transient_delayreconnect called!\n"); abort(); }
 /* Generated stub for channel_has_htlc_in */
 struct htlc_in *channel_has_htlc_in(struct channel *channel UNNEEDED)
 { fprintf(stderr, "channel_has_htlc_in called!\n"); abort(); }

View file

@@ -3667,8 +3667,7 @@ We send an HTLC, and peer unilaterally closes: do we close upstream?
    with pytest.raises(RpcError, match=r'WIRE_TEMPORARY_CHANNEL_FAILURE \(reply from remote\)'):
        l1.rpc.waitsendpay(ph2, timeout=TIMEOUT)
 
-    # l3 closes unilaterally.
-    wait_for(lambda: only_one(l3.rpc.listpeers(l2.info['id'])['peers'])['connected'] is False)
+    # Make close unilaterally.
     l3.rpc.close(l2.info['id'], 1)
 
     l3.daemon.wait_for_log('sendrawtransaction')

View file

@@ -451,7 +451,8 @@ def test_disconnect_opener(node_factory):
         l1.rpc.connect(l2.info['id'], 'localhost', l2.port)
         with pytest.raises(RpcError):
             l1.rpc.fundchannel(l2.info['id'], 25000)
-        assert l1.rpc.getpeer(l2.info['id']) is None
+        # First peer vanishes, but later it just disconnects
+        wait_for(lambda: all([p['connected'] is False for p in l1.rpc.listpeers()['peers']]))
 
     # This one will succeed.
     l1.rpc.connect(l2.info['id'], 'localhost', l2.port)
@@ -495,7 +496,8 @@ def test_disconnect_fundee(node_factory):
         l1.rpc.connect(l2.info['id'], 'localhost', l2.port)
         with pytest.raises(RpcError):
             l1.rpc.fundchannel(l2.info['id'], 25000)
-        assert l1.rpc.getpeer(l2.info['id']) is None
+        # First peer vanishes, but later it just disconnects
+        wait_for(lambda: all([p['connected'] is False for p in l1.rpc.listpeers()['peers']]))
 
     # This one will succeed.
     l1.rpc.connect(l2.info['id'], 'localhost', l2.port)
@@ -541,8 +543,8 @@ def test_disconnect_fundee_v2(node_factory):
         l1.rpc.fundchannel(l2.info['id'], 25000)
 
     # Should still only have one peer!
-    assert len(l1.rpc.listpeers()) == 1
-    assert len(l2.rpc.listpeers()) == 1
+    assert len(l1.rpc.listpeers()['peers']) == 1
+    assert len(l2.rpc.listpeers()['peers']) == 1
 
 
 @pytest.mark.developer
@@ -564,8 +566,8 @@ def test_disconnect_half_signed(node_factory):
         l1.rpc.fundchannel(l2.info['id'], 25000)
 
     # Peer remembers, opener doesn't.
-    assert l1.rpc.getpeer(l2.info['id']) is None
-    assert l2.rpc.getpeer(l1.info['id'])['id'] == l1.info['id']
+    wait_for(lambda: l1.rpc.listpeers(l2.info['id'])['peers'] == [])
+    assert len(only_one(l2.rpc.listpeers(l1.info['id'])['peers'])['channels']) == 1
 
 
 @pytest.mark.developer
@@ -3606,7 +3608,8 @@ def test_upgrade_statickey_onchaind(node_factory, executor, bitcoind):
     l2.wait_for_onchaind_broadcast('OUR_PENALTY_TX',
                                    'THEIR_REVOKED_UNILATERAL/DELAYED_CHEAT_OUTPUT_TO_THEM')
     bitcoind.generate_block(100)
-    wait_for(lambda: len(l2.rpc.listpeers()['peers']) == 0)
+    # This works even if they disconnect and listpeers() is empty:
+    wait_for(lambda: all([p['channels'] == [] for p in l2.rpc.listpeers()['peers']]))
 
     # TEST 2: Cheat from post-upgrade.
     node_factory.join_nodes([l1, l2])
@@ -3630,7 +3633,8 @@ def test_upgrade_statickey_onchaind(node_factory, executor, bitcoind):
     l2.wait_for_onchaind_broadcast('OUR_PENALTY_TX',
                                    'THEIR_REVOKED_UNILATERAL/DELAYED_CHEAT_OUTPUT_TO_THEM')
     bitcoind.generate_block(100)
-    wait_for(lambda: len(l2.rpc.listpeers()['peers']) == 0)
+    # This works even if they disconnect and listpeers() is empty:
+    wait_for(lambda: all([p['channels'] == [] for p in l2.rpc.listpeers()['peers']]))
 
     # TEST 3: Unilateral close from pre-upgrade
     node_factory.join_nodes([l1, l2])
@@ -3658,7 +3662,8 @@ def test_upgrade_statickey_onchaind(node_factory, executor, bitcoind):
     bitcoind.generate_block(5)
 
     bitcoind.generate_block(100, wait_for_mempool=1)
-    wait_for(lambda: len(l2.rpc.listpeers()['peers']) == 0)
+    # This works even if they disconnect and listpeers() is empty:
+    wait_for(lambda: all([p['channels'] == [] for p in l2.rpc.listpeers()['peers']]))
 
     # TEST 4: Unilateral close from post-upgrade
     node_factory.join_nodes([l1, l2])
@@ -3683,7 +3688,8 @@ def test_upgrade_statickey_onchaind(node_factory, executor, bitcoind):
     bitcoind.generate_block(5)
 
     bitcoind.generate_block(100, wait_for_mempool=1)
-    wait_for(lambda: len(l2.rpc.listpeers()['peers']) == 0)
+    # This works even if they disconnect and listpeers() is empty:
+    wait_for(lambda: all([p['channels'] == [] for p in l2.rpc.listpeers()['peers']]))
 
 
 @unittest.skipIf(not EXPERIMENTAL_FEATURES, "upgrade protocol not available")
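The test updates above all track the same behavioural change: a disconnected peer can now linger in listpeers until its channels are gone, or be forgotten entirely, so the assertions poll rather than assume. A hedged sketch of that polling pattern (the helper name is ours, not from the diff):

    def no_channels_left(node, peer_id=None):
        # all() over an empty list is True, so this also covers the case
        # where the peer entry has been forgotten entirely.
        peers = node.rpc.listpeers(peer_id)['peers']
        return all(p['channels'] == [] for p in peers)

    # Usage, mirroring the updated tests: wait_for(lambda: no_channels_left(l2))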

View file

@@ -1095,7 +1095,8 @@ def test_funding_reorg_private(node_factory, bitcoind):
     opts = {'funding-confirms': 2, 'rescan': 10, 'may_reconnect': True,
             'allow_bad_gossip': True,
             # gossipd send lightning update for original channel.
-            'allow_broken_log': True}
+            'allow_broken_log': True,
+            'allow_warning': True}
     l1, l2 = node_factory.line_graph(2, fundchannel=False, opts=opts)
     l1.fundwallet(10000000)
     sync_blockheight(bitcoind, [l1])    # height 102
@@ -1138,7 +1139,8 @@ def test_funding_reorg_remote_lags(node_factory, bitcoind):
     """Nodes may disagree about short_channel_id before channel announcement
     """
     # may_reconnect so channeld will restart; bad gossip can happen due to reorg
-    opts = {'funding-confirms': 1, 'may_reconnect': True, 'allow_bad_gossip': True}
+    opts = {'funding-confirms': 1, 'may_reconnect': True, 'allow_bad_gossip': True,
+            'allow_warning': True}
     l1, l2 = node_factory.line_graph(2, fundchannel=False, opts=opts)
     l1.fundwallet(10000000)
     sync_blockheight(bitcoind, [l1])    # height 102

View file

@@ -129,10 +129,6 @@ def test_multifunding_v2_best_effort(node_factory, bitcoind):
         for node in node_list:
             node.daemon.wait_for_log(r'to CLOSINGD_COMPLETE')
 
-        # Make sure disconnections are complete
-        if not failed_sign:
-            wait_for(lambda: all([c['connected'] is False for c in l1.rpc.listpeers()['peers']]))
-
     # With 2 down, it will fail to fund channel
     l2.stop()
     l3.stop()