lightningd: drive all reconnections out of disconnections.

The only places which should call try_reconnect now are the "connect"
command, and the disconnect path when it decides there's still an
active channel.

This introduces one subtlety: if we disconnect when there's no active
channel, but then the subd makes one, we have to catch that case!

This temporarily reverts "slow" reconnections to fast ones: see next
patch.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Author: Rusty Russell, 2022-07-18 21:42:28 +09:30
Committed by: neil saitug
Parent: a3c4908f4a
Commit: 02e169fd27
12 changed files with 98 additions and 64 deletions
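The catch described in the commit message is the heart of the change, and it appears twice below, once per wallet_commit_channel() variant. A minimal sketch, condensed from those hunks (all names are from the diff; the wallet and wire plumbing between the two halves is elided):

	/* Snapshot, taken before the wallet commit: was any channel active? */
	bool any_active = peer_any_active_channel(channel->peer, NULL);

	/* ... the new channel is committed to the wallet here ... */

	/* We may have disconnected and decided not to reconnect because no
	 * channel was active; the commit above just made one active. */
	if (!any_active && channel->peer->connected == PEER_DISCONNECTED) {
		try_reconnect(channel->peer, channel->peer, 1,
			      &channel->peer->addr);
	}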

View file

@@ -929,9 +929,9 @@ void channel_set_billboard(struct channel *channel, bool perm, const char *str)
 	}
 }
 
-static void err_and_reconnect(struct channel *channel,
-			      const char *why,
-			      u32 seconds_before_reconnect)
+static void channel_err(struct channel *channel,
+			const char *why,
+			u32 seconds_before_reconnect /* FIXME: use this! */)
 {
 	log_info(channel->log, "Peer transient failure in %s: %s",
 		 channel_state_name(channel), why);
@@ -946,29 +946,23 @@ static void err_and_reconnect(struct channel *channel,
 #endif
 	channel_set_owner(channel, NULL);
 
-	/* Their address only useful if we connected to them */
-	try_reconnect(channel, channel->peer, seconds_before_reconnect,
-		      channel->peer->connected_incoming
-		      ? NULL
-		      : &channel->peer->addr);
 }
 
-void channel_fail_reconnect_later(struct channel *channel, const char *fmt, ...)
+void channel_fail_transient_delayreconnect(struct channel *channel, const char *fmt, ...)
 {
 	va_list ap;
 
 	va_start(ap, fmt);
-	err_and_reconnect(channel, tal_vfmt(tmpctx, fmt, ap), 60);
+	channel_err(channel, tal_vfmt(tmpctx, fmt, ap), 60);
 	va_end(ap);
 }
 
-void channel_fail_reconnect(struct channel *channel, const char *fmt, ...)
+void channel_fail_transient(struct channel *channel, const char *fmt, ...)
 {
 	va_list ap;
 
 	va_start(ap, fmt);
-	err_and_reconnect(channel, tal_vfmt(tmpctx, fmt, ap), 1);
+	channel_err(channel, tal_vfmt(tmpctx, fmt, ap), 1);
 	va_end(ap);
 }
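Note the FIXME above: channel_err() currently ignores seconds_before_reconnect, so for now both wrappers behave identically; that is the temporary reversion of "slow" reconnections to fast ones mentioned in the commit message. A hedged sketch of the intended distinction, using only names from this diff:

	/* Callers pick the retry cadence; channel_err() is meant to honour
	 * it (1s vs 60s) once the FIXME is resolved by the next patch. */
	channel_fail_transient(channel, "Disconnected");           /* fast: 1 second */
	channel_fail_transient_delayreconnect(channel, "Warning"); /* slow: 60 seconds */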

View file

@@ -374,11 +374,11 @@ const char *channel_state_str(enum channel_state state);
 void channel_set_owner(struct channel *channel, struct subd *owner);
 
 /* Channel has failed, but can try again. */
-void channel_fail_reconnect(struct channel *channel,
+void channel_fail_transient(struct channel *channel,
 			    const char *fmt, ...) PRINTF_FMT(2,3);
 /* Channel has failed, but can try again after a minute. */
-void channel_fail_reconnect_later(struct channel *channel,
-				  const char *fmt,...) PRINTF_FMT(2,3);
+void channel_fail_transient_delayreconnect(struct channel *channel,
+					   const char *fmt,...) PRINTF_FMT(2,3);
 
 /* Channel has failed, give up on it. */
 void channel_fail_permanent(struct channel *channel,

View file

@@ -319,7 +319,7 @@ static void peer_got_shutdown(struct channel *channel, const u8 *msg)
 								&channel->peer->id,
 								channel->peer->connectd_counter,
 								warning)));
-		channel_fail_reconnect(channel, "Bad shutdown scriptpubkey %s",
+		channel_fail_transient(channel, "Bad shutdown scriptpubkey %s",
 				       tal_hex(tmpctx, scriptpubkey));
 		return;
 	}
@@ -638,8 +638,10 @@ bool peer_start_channeld(struct channel *channel,
 	if (!channel->owner) {
 		log_broken(channel->log, "Could not subdaemon channel: %s",
 			   strerror(errno));
-		channel_fail_reconnect_later(channel,
-					     "Failed to subdaemon channel");
+		/* Disconnect it. */
+		subd_send_msg(ld->connectd,
+			      take(towire_connectd_discard_peer(NULL, &channel->peer->id,
+								channel->peer->connectd_counter)));
 		return false;
 	}

View file

@@ -17,6 +17,7 @@
 #include <common/timeout.h>
 #include <common/type_to_string.h>
 #include <common/utils.h>
+#include <connectd/connectd_wiregen.h>
 #include <errno.h>
 #include <gossipd/gossipd_wiregen.h>
 #include <hsmd/capabilities.h>
@@ -377,8 +378,10 @@ void peer_start_closingd(struct channel *channel, struct peer_fd *peer_fd)
 	if (!channel->owner) {
 		log_broken(channel->log, "Could not subdaemon closing: %s",
 			   strerror(errno));
-		channel_fail_reconnect_later(channel,
-					     "Failed to subdaemon closing");
+		/* Disconnect it. */
+		subd_send_msg(ld->connectd,
+			      take(towire_connectd_discard_peer(NULL, &channel->peer->id,
+								channel->peer->connectd_counter)));
 		return;
 	}

View file

@@ -21,6 +21,7 @@
 #include <lightningd/channel.h>
 #include <lightningd/channel_control.h>
 #include <lightningd/closing_control.h>
+#include <lightningd/connect_control.h>
 #include <lightningd/dual_open_control.h>
 #include <lightningd/gossip_control.h>
 #include <lightningd/hsm_control.h>
@@ -46,14 +47,11 @@ static void channel_disconnect(struct channel *channel,
 	log_(channel->log, level, NULL, false, "%s", desc);
 
 	channel_cleanup_commands(channel, desc);
 
-	if (!reconnect)
-		channel_set_owner(channel, NULL);
-	else
-		channel_fail_reconnect(channel, "%s: %s",
-				       channel->owner ?
-				       channel->owner->name :
-				       "dualopend-dead",
-				       desc);
+	channel_fail_transient(channel, "%s: %s",
+			       channel->owner ?
+			       channel->owner->name :
+			       "dualopend-dead",
+			       desc);
 }
 
 void channel_unsaved_close_conn(struct channel *channel, const char *why)
@@ -1179,6 +1177,7 @@ wallet_commit_channel(struct lightningd *ld,
 {
 	struct amount_msat our_msat, lease_fee_msat;
 	struct channel_inflight *inflight;
+	bool any_active = peer_any_active_channel(channel->peer, NULL);
 
 	if (!amount_sat_to_msat(&our_msat, our_funding)) {
 		log_broken(channel->log, "Unable to convert funds");
@@ -1292,6 +1291,14 @@ wallet_commit_channel(struct lightningd *ld,
 					channel->push);
 	wallet_inflight_add(ld->wallet, inflight);
 
+	/* We might have disconnected and decided we didn't need to
+	 * reconnect because no channels are active.  But the subd
+	 * just made it active! */
+	if (!any_active && channel->peer->connected == PEER_DISCONNECTED) {
+		try_reconnect(channel->peer, channel->peer, 1,
+			      &channel->peer->addr);
+	}
+
 	return inflight;
 }
@@ -1348,13 +1355,13 @@ static void handle_peer_wants_to_close(struct subd *dualopend,
 					  "Bad shutdown scriptpubkey %s",
 					  tal_hex(tmpctx, scriptpubkey));
 
-		/* Get connectd to send warning, and then allow reconnect. */
+		/* Get connectd to send warning, and kill subd. */
 		subd_send_msg(ld->connectd,
 			      take(towire_connectd_peer_final_msg(NULL,
 								  &channel->peer->id,
 								  channel->peer->connectd_counter,
 								  warning)));
-		channel_fail_reconnect(channel, "Bad shutdown scriptpubkey %s",
+		channel_fail_transient(channel, "Bad shutdown scriptpubkey %s",
 				       tal_hex(tmpctx, scriptpubkey));
 		return;
 	}
@@ -3408,8 +3415,10 @@ bool peer_restart_dualopend(struct peer *peer,
 	if (!channel->owner) {
 		log_broken(channel->log, "Could not subdaemon channel: %s",
 			   strerror(errno));
-		channel_fail_reconnect_later(channel,
-					     "Failed to subdaemon channel");
+		/* Disconnect it. */
+		subd_send_msg(peer->ld->connectd,
+			      take(towire_connectd_discard_peer(NULL, &channel->peer->id,
+								channel->peer->connectd_counter)));
 		return false;
 	}
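All three "Could not subdaemon" hunks (peer_start_channeld, peer_start_closingd, and peer_restart_dualopend above) now share one shape: on subdaemon startup failure, lightningd no longer schedules a reconnect itself; it asks connectd to drop the peer and leaves any reconnect decision to the disconnect path. A condensed sketch, names from the diff:

	/* Can't start the subdaemon: disconnect the peer, and let the
	 * disconnect handler decide whether an active channel still
	 * justifies a reconnect. */
	subd_send_msg(ld->connectd,
		      take(towire_connectd_discard_peer(NULL, &channel->peer->id,
							channel->peer->connectd_counter)));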

View file

@@ -19,6 +19,7 @@
 #include <lightningd/chaintopology.h>
 #include <lightningd/channel.h>
 #include <lightningd/channel_control.h>
+#include <lightningd/connect_control.h>
 #include <lightningd/hsm_control.h>
 #include <lightningd/notification.h>
 #include <lightningd/opening_common.h>
@@ -100,6 +101,7 @@ wallet_commit_channel(struct lightningd *ld,
 	u32 lease_start_blockheight = 0; /* No leases on v1 */
 	struct short_channel_id *alias_local;
 	struct timeabs timestamp;
+	bool any_active = peer_any_active_channel(uc->peer, NULL);
 
 	/* We cannot both be the fundee *and* have a `fundchannel_start`
 	 * command running!
@@ -233,6 +235,15 @@ wallet_commit_channel(struct lightningd *ld,
 				 channel->state_change_cause,
 				 "new channel opened");
 
+	/* We might have disconnected and decided we didn't need to
+	 * reconnect because no channels are active.  But the subd
+	 * just made it active! */
+	if (!any_active && channel->peer->connected == PEER_DISCONNECTED) {
+		try_reconnect(channel->peer, channel->peer, 1,
+			      &channel->peer->addr);
+	}
+
 	return channel;
 }

View file

@@ -166,7 +166,7 @@ static void peer_channels_cleanup(struct lightningd *ld,
 		c = channels[i];
 		if (channel_active(c)) {
 			channel_cleanup_commands(c, "Disconnected");
-			channel_fail_reconnect(c, "Disconnected");
+			channel_fail_transient(c, "Disconnected");
 		} else if (channel_unsaved(c)) {
 			channel_unsaved_close_conn(c, "Disconnected");
 		}
@@ -357,7 +357,7 @@ void channel_errmsg(struct channel *channel,
 	/* No peer_fd means a subd crash or disconnection. */
 	if (!peer_fd) {
 		/* If the channel is unsaved, we forget it */
-		channel_fail_reconnect(channel, "%s: %s",
+		channel_fail_transient(channel, "%s: %s",
 				       channel->owner->name, desc);
 		return;
 	}
@@ -371,8 +371,8 @@ void channel_errmsg(struct channel *channel,
 	 * and we would close the channel on them. We now support warnings
 	 * for this case. */
 	if (warning) {
-		channel_fail_reconnect_later(channel, "%s WARNING: %s",
-					     channel->owner->name, desc);
+		channel_fail_transient_delayreconnect(channel, "%s WARNING: %s",
+						      channel->owner->name, desc);
 		return;
 	}
@@ -1731,9 +1731,21 @@ static enum watch_result funding_depth_cb(struct lightningd *ld,
 	} else if (!short_channel_id_eq(channel->scid, &scid) &&
 		   !is_stub_scid(channel->scid)) {
-		/* This normally restarts channeld, initialized with updated scid
+		/* Send warning: that will make connectd disconnect, and then we'll
+		 * try to reconnect. */
+		u8 *warning = towire_warningfmt(tmpctx, &channel->cid,
+						"short_channel_id changed to %s (was %s)",
+						short_channel_id_to_str(tmpctx, &scid),
+						short_channel_id_to_str(tmpctx, channel->scid));
+		if (channel->peer->connected != PEER_DISCONNECTED)
+			subd_send_msg(ld->connectd,
+				      take(towire_connectd_peer_final_msg(NULL,
+									  &channel->peer->id,
+									  channel->peer->connectd_counter,
+									  warning)));
+
+		/* When we restart channeld, it will be initialized with updated scid
 		 * and also adds it (at least our halve_chan) to rtable. */
-		channel_fail_reconnect(channel,
+		channel_fail_transient_delayreconnect(channel,
 				       "short_channel_id changed to %s (was %s)",
 				       short_channel_id_to_str(tmpctx, &scid),
 				       short_channel_id_to_str(tmpctx, channel->scid));

View file

@@ -68,14 +68,14 @@ void channel_fail_permanent(struct channel *channel UNNEEDED,
 			    const char *fmt UNNEEDED,
 			    ...)
 { fprintf(stderr, "channel_fail_permanent called!\n"); abort(); }
-/* Generated stub for channel_fail_reconnect */
-void channel_fail_reconnect(struct channel *channel UNNEEDED,
+/* Generated stub for channel_fail_transient */
+void channel_fail_transient(struct channel *channel UNNEEDED,
 			    const char *fmt UNNEEDED, ...)
-{ fprintf(stderr, "channel_fail_reconnect called!\n"); abort(); }
-/* Generated stub for channel_fail_reconnect_later */
-void channel_fail_reconnect_later(struct channel *channel UNNEEDED,
-				  const char *fmt UNNEEDED,...)
-{ fprintf(stderr, "channel_fail_reconnect_later called!\n"); abort(); }
+{ fprintf(stderr, "channel_fail_transient called!\n"); abort(); }
+/* Generated stub for channel_fail_transient_delayreconnect */
+void channel_fail_transient_delayreconnect(struct channel *channel UNNEEDED,
+					   const char *fmt UNNEEDED,...)
+{ fprintf(stderr, "channel_fail_transient_delayreconnect called!\n"); abort(); }
 /* Generated stub for channel_has_htlc_in */
 struct htlc_in *channel_has_htlc_in(struct channel *channel UNNEEDED)
 { fprintf(stderr, "channel_has_htlc_in called!\n"); abort(); }

View file

@@ -3667,8 +3667,7 @@ We send an HTLC, and peer unilaterally closes: do we close upstream?
    with pytest.raises(RpcError, match=r'WIRE_TEMPORARY_CHANNEL_FAILURE \(reply from remote\)'):
        l1.rpc.waitsendpay(ph2, timeout=TIMEOUT)
 
-    # l3 closes unilaterally.
-    wait_for(lambda: only_one(l3.rpc.listpeers(l2.info['id'])['peers'])['connected'] is False)
+    # Make close unilaterally.
     l3.rpc.close(l2.info['id'], 1)
 
     l3.daemon.wait_for_log('sendrawtransaction')

View file

@@ -451,7 +451,8 @@ def test_disconnect_opener(node_factory):
         l1.rpc.connect(l2.info['id'], 'localhost', l2.port)
         with pytest.raises(RpcError):
             l1.rpc.fundchannel(l2.info['id'], 25000)
-        assert l1.rpc.getpeer(l2.info['id']) is None
+        # First peer vanishes, but later it just disconnects
+        wait_for(lambda: all([p['connected'] is False for p in l1.rpc.listpeers()['peers']]))
 
     # This one will succeed.
     l1.rpc.connect(l2.info['id'], 'localhost', l2.port)
@@ -495,7 +496,8 @@ def test_disconnect_fundee(node_factory):
         l1.rpc.connect(l2.info['id'], 'localhost', l2.port)
         with pytest.raises(RpcError):
             l1.rpc.fundchannel(l2.info['id'], 25000)
-        assert l1.rpc.getpeer(l2.info['id']) is None
+        # First peer vanishes, but later it just disconnects
+        wait_for(lambda: all([p['connected'] is False for p in l1.rpc.listpeers()['peers']]))
 
     # This one will succeed.
     l1.rpc.connect(l2.info['id'], 'localhost', l2.port)
@@ -541,8 +543,8 @@ def test_disconnect_fundee_v2(node_factory):
         l1.rpc.fundchannel(l2.info['id'], 25000)
 
     # Should still only have one peer!
-    assert len(l1.rpc.listpeers()) == 1
-    assert len(l2.rpc.listpeers()) == 1
+    assert len(l1.rpc.listpeers()['peers']) == 1
+    assert len(l2.rpc.listpeers()['peers']) == 1
 
 
 @pytest.mark.developer
@@ -564,8 +566,8 @@ def test_disconnect_half_signed(node_factory):
         l1.rpc.fundchannel(l2.info['id'], 25000)
 
     # Peer remembers, opener doesn't.
-    assert l1.rpc.getpeer(l2.info['id']) is None
-    assert l2.rpc.getpeer(l1.info['id'])['id'] == l1.info['id']
+    wait_for(lambda: l1.rpc.listpeers(l2.info['id'])['peers'] == [])
+    assert len(only_one(l2.rpc.listpeers(l1.info['id'])['peers'])['channels']) == 1
 
 
 @pytest.mark.developer
@@ -3606,7 +3608,8 @@ def test_upgrade_statickey_onchaind(node_factory, executor, bitcoind):
     l2.wait_for_onchaind_broadcast('OUR_PENALTY_TX',
                                    'THEIR_REVOKED_UNILATERAL/DELAYED_CHEAT_OUTPUT_TO_THEM')
     bitcoind.generate_block(100)
-    wait_for(lambda: len(l2.rpc.listpeers()['peers']) == 0)
+    # This works even if they disconnect and listpeers() is empty:
+    wait_for(lambda: all([p['channels'] == [] for p in l2.rpc.listpeers()['peers']]))
 
     # TEST 2: Cheat from post-upgrade.
     node_factory.join_nodes([l1, l2])
@@ -3630,7 +3633,8 @@ def test_upgrade_statickey_onchaind(node_factory, executor, bitcoind):
     l2.wait_for_onchaind_broadcast('OUR_PENALTY_TX',
                                    'THEIR_REVOKED_UNILATERAL/DELAYED_CHEAT_OUTPUT_TO_THEM')
     bitcoind.generate_block(100)
-    wait_for(lambda: len(l2.rpc.listpeers()['peers']) == 0)
+    # This works even if they disconnect and listpeers() is empty:
+    wait_for(lambda: all([p['channels'] == [] for p in l2.rpc.listpeers()['peers']]))
 
     # TEST 3: Unilateral close from pre-upgrade
     node_factory.join_nodes([l1, l2])
@@ -3658,7 +3662,8 @@ def test_upgrade_statickey_onchaind(node_factory, executor, bitcoind):
     bitcoind.generate_block(5)
 
     bitcoind.generate_block(100, wait_for_mempool=1)
-    wait_for(lambda: len(l2.rpc.listpeers()['peers']) == 0)
+    # This works even if they disconnect and listpeers() is empty:
+    wait_for(lambda: all([p['channels'] == [] for p in l2.rpc.listpeers()['peers']]))
 
     # TEST 4: Unilateral close from post-upgrade
     node_factory.join_nodes([l1, l2])
@@ -3683,7 +3688,8 @@ def test_upgrade_statickey_onchaind(node_factory, executor, bitcoind):
     bitcoind.generate_block(5)
 
     bitcoind.generate_block(100, wait_for_mempool=1)
-    wait_for(lambda: len(l2.rpc.listpeers()['peers']) == 0)
+    # This works even if they disconnect and listpeers() is empty:
+    wait_for(lambda: all([p['channels'] == [] for p in l2.rpc.listpeers()['peers']]))
 
 
 @unittest.skipIf(not EXPERIMENTAL_FEATURES, "upgrade protocol not available")
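The test updates above all track the same behavioural change: a disconnected peer can now linger in listpeers until its channels are gone, or be forgotten entirely, so the assertions poll rather than assume. A hedged sketch of that polling pattern (the helper name is ours, not from the diff):

    def no_channels_left(node, peer_id=None):
        # all() over an empty list is True, so this also covers the case
        # where the peer entry has been forgotten entirely.
        peers = node.rpc.listpeers(peer_id)['peers']
        return all(p['channels'] == [] for p in peers)

    # Usage, mirroring the updated tests: wait_for(lambda: no_channels_left(l2))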

View file

@@ -1095,7 +1095,8 @@ def test_funding_reorg_private(node_factory, bitcoind):
     opts = {'funding-confirms': 2, 'rescan': 10, 'may_reconnect': True,
             'allow_bad_gossip': True,
             # gossipd send lightning update for original channel.
-            'allow_broken_log': True}
+            'allow_broken_log': True,
+            'allow_warning': True}
     l1, l2 = node_factory.line_graph(2, fundchannel=False, opts=opts)
     l1.fundwallet(10000000)
     sync_blockheight(bitcoind, [l1])    # height 102
@@ -1138,7 +1139,8 @@ def test_funding_reorg_remote_lags(node_factory, bitcoind):
     """Nodes may disagree about short_channel_id before channel announcement
     """
     # may_reconnect so channeld will restart; bad gossip can happen due to reorg
-    opts = {'funding-confirms': 1, 'may_reconnect': True, 'allow_bad_gossip': True}
+    opts = {'funding-confirms': 1, 'may_reconnect': True, 'allow_bad_gossip': True,
+            'allow_warning': True}
     l1, l2 = node_factory.line_graph(2, fundchannel=False, opts=opts)
     l1.fundwallet(10000000)
     sync_blockheight(bitcoind, [l1])    # height 102

View file

@@ -129,10 +129,6 @@ def test_multifunding_v2_best_effort(node_factory, bitcoind):
         for node in node_list:
             node.daemon.wait_for_log(r'to CLOSINGD_COMPLETE')
 
-        # Make sure disconnections are complete
-        if not failed_sign:
-            wait_for(lambda: all([c['connected'] is False for c in l1.rpc.listpeers()['peers']]))
-
     # With 2 down, it will fail to fund channel
     l2.stop()
     l3.stop()