lightningd: drive all reconnections out of disconnections.
The only places which should call try_reconnect now are the "connect"
command, and the disconnect path when it decides there's still an active
channel.

This introduces one subtlety: if we disconnect when there's no active
channel, but then the subd makes one, we have to catch that case!

This temporarily reverts "slow" reconnections to fast ones: see next patch.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
parent a3c4908f4a
commit 02e169fd27

12 changed files with 98 additions and 64 deletions
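
The subtlety above is handled with a snapshot-then-recheck pattern, added in both wallet_commit_channel variants in the diff below. Here is a condensed sketch: peer_any_active_channel, try_reconnect, and PEER_DISCONNECTED are taken from the hunks themselves, while the surrounding shape is illustrative only.

	/* Sketch, condensed from the wallet_commit_channel hunks below. */
	bool any_active = peer_any_active_channel(channel->peer, NULL);

	/* ... the subd commits the channel, making it active ... */

	/* We might have disconnected and decided we didn't need to
	 * reconnect because no channels were active; the subd just made
	 * one active, so schedule the reconnect ourselves (1 second
	 * delay, to the peer's known address). */
	if (!any_active && channel->peer->connected == PEER_DISCONNECTED) {
		try_reconnect(channel->peer, channel->peer, 1,
			      &channel->peer->addr);
	}

Because any_active is captured before the subd runs, the recheck afterwards catches exactly the window in which the disconnect path saw no active channel and skipped its reconnect.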
@@ -929,9 +929,9 @@ void channel_set_billboard(struct channel *channel, bool perm, const char *str)
 	}
 }
 
-static void err_and_reconnect(struct channel *channel,
-			      const char *why,
-			      u32 seconds_before_reconnect)
+static void channel_err(struct channel *channel,
+			const char *why,
+			u32 seconds_before_reconnect /* FIXME: use this! */)
 {
 	log_info(channel->log, "Peer transient failure in %s: %s",
 		 channel_state_name(channel), why);
@@ -946,29 +946,23 @@ static void err_and_reconnect(struct channel *channel,
 #endif
 
 	channel_set_owner(channel, NULL);
-
-	/* Their address only useful if we connected to them */
-	try_reconnect(channel, channel->peer, seconds_before_reconnect,
-		      channel->peer->connected_incoming
-		      ? NULL
-		      : &channel->peer->addr);
 }
 
-void channel_fail_reconnect_later(struct channel *channel, const char *fmt, ...)
+void channel_fail_transient_delayreconnect(struct channel *channel, const char *fmt, ...)
 {
 	va_list ap;
 
 	va_start(ap, fmt);
-	err_and_reconnect(channel, tal_vfmt(tmpctx, fmt, ap), 60);
+	channel_err(channel, tal_vfmt(tmpctx, fmt, ap), 60);
 	va_end(ap);
 }
 
-void channel_fail_reconnect(struct channel *channel, const char *fmt, ...)
+void channel_fail_transient(struct channel *channel, const char *fmt, ...)
 {
 	va_list ap;
 
 	va_start(ap, fmt);
-	err_and_reconnect(channel, tal_vfmt(tmpctx, fmt, ap), 1);
+	channel_err(channel, tal_vfmt(tmpctx, fmt, ap), 1);
 	va_end(ap);
 }
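Note that channel_err above no longer calls try_reconnect at all: per the commit message, the disconnect path now owns that decision. That path is not part of this diff, so the following is only a hypothetical sketch of its shape, reusing peer_any_active_channel, try_reconnect, and connected_incoming from the hunks in this commit.

	/* Hypothetical sketch, NOT from this diff: on disconnect, only
	 * reconnect if some channel is still active.  The address hint is
	 * only useful if we originally connected to them. */
	if (peer_any_active_channel(peer, NULL))
		try_reconnect(peer, peer, 1,
			      peer->connected_incoming
			      ? NULL
			      : &peer->addr);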
@@ -374,11 +374,11 @@ const char *channel_state_str(enum channel_state state);
 void channel_set_owner(struct channel *channel, struct subd *owner);
 
 /* Channel has failed, but can try again. */
-void channel_fail_reconnect(struct channel *channel,
+void channel_fail_transient(struct channel *channel,
 			    const char *fmt, ...) PRINTF_FMT(2,3);
 /* Channel has failed, but can try again after a minute. */
-void channel_fail_reconnect_later(struct channel *channel,
-				  const char *fmt,...) PRINTF_FMT(2,3);
+void channel_fail_transient_delayreconnect(struct channel *channel,
+					   const char *fmt,...) PRINTF_FMT(2,3);
 
 /* Channel has failed, give up on it. */
 void channel_fail_permanent(struct channel *channel,
@@ -319,7 +319,7 @@ static void peer_got_shutdown(struct channel *channel, const u8 *msg)
 							&channel->peer->id,
 							channel->peer->connectd_counter,
 							warning)));
-		channel_fail_reconnect(channel, "Bad shutdown scriptpubkey %s",
+		channel_fail_transient(channel, "Bad shutdown scriptpubkey %s",
 				       tal_hex(tmpctx, scriptpubkey));
 		return;
 	}
@@ -638,8 +638,10 @@ bool peer_start_channeld(struct channel *channel,
 	if (!channel->owner) {
 		log_broken(channel->log, "Could not subdaemon channel: %s",
 			   strerror(errno));
-		channel_fail_reconnect_later(channel,
-					     "Failed to subdaemon channel");
+		/* Disconnect it. */
+		subd_send_msg(ld->connectd,
+			      take(towire_connectd_discard_peer(NULL, &channel->peer->id,
+								channel->peer->connectd_counter)));
 		return false;
 	}
@@ -17,6 +17,7 @@
 #include <common/timeout.h>
 #include <common/type_to_string.h>
 #include <common/utils.h>
+#include <connectd/connectd_wiregen.h>
 #include <errno.h>
 #include <gossipd/gossipd_wiregen.h>
 #include <hsmd/capabilities.h>
@@ -377,8 +378,10 @@ void peer_start_closingd(struct channel *channel, struct peer_fd *peer_fd)
 	if (!channel->owner) {
 		log_broken(channel->log, "Could not subdaemon closing: %s",
 			   strerror(errno));
-		channel_fail_reconnect_later(channel,
-					     "Failed to subdaemon closing");
+		/* Disconnect it. */
+		subd_send_msg(ld->connectd,
+			      take(towire_connectd_discard_peer(NULL, &channel->peer->id,
+								channel->peer->connectd_counter)));
 		return;
 	}
@@ -21,6 +21,7 @@
 #include <lightningd/channel.h>
 #include <lightningd/channel_control.h>
 #include <lightningd/closing_control.h>
+#include <lightningd/connect_control.h>
 #include <lightningd/dual_open_control.h>
 #include <lightningd/gossip_control.h>
 #include <lightningd/hsm_control.h>
@@ -46,14 +47,11 @@ static void channel_disconnect(struct channel *channel,
 	log_(channel->log, level, NULL, false, "%s", desc);
 	channel_cleanup_commands(channel, desc);
 
-	if (!reconnect)
-		channel_set_owner(channel, NULL);
-	else
-		channel_fail_reconnect(channel, "%s: %s",
-				       channel->owner ?
-				       channel->owner->name :
-				       "dualopend-dead",
-				       desc);
+	channel_fail_transient(channel, "%s: %s",
+			       channel->owner ?
+			       channel->owner->name :
+			       "dualopend-dead",
+			       desc);
 }
 
 void channel_unsaved_close_conn(struct channel *channel, const char *why)
@@ -1179,6 +1177,7 @@ wallet_commit_channel(struct lightningd *ld,
 {
 	struct amount_msat our_msat, lease_fee_msat;
 	struct channel_inflight *inflight;
+	bool any_active = peer_any_active_channel(channel->peer, NULL);
 
 	if (!amount_sat_to_msat(&our_msat, our_funding)) {
 		log_broken(channel->log, "Unable to convert funds");
@@ -1292,6 +1291,14 @@ wallet_commit_channel(struct lightningd *ld,
 				  channel->push);
 	wallet_inflight_add(ld->wallet, inflight);
 
+	/* We might have disconnected and decided we didn't need to
+	 * reconnect because no channels are active.  But the subd
+	 * just made it active! */
+	if (!any_active && channel->peer->connected == PEER_DISCONNECTED) {
+		try_reconnect(channel->peer, channel->peer, 1,
+			      &channel->peer->addr);
+	}
+
 	return inflight;
 }
@@ -1348,13 +1355,13 @@ static void handle_peer_wants_to_close(struct subd *dualopend,
 					      "Bad shutdown scriptpubkey %s",
 					      tal_hex(tmpctx, scriptpubkey));
 
-		/* Get connectd to send warning, and then allow reconnect. */
+		/* Get connectd to send warning, and kill subd. */
 		subd_send_msg(ld->connectd,
 			      take(towire_connectd_peer_final_msg(NULL,
 								  &channel->peer->id,
 								  channel->peer->connectd_counter,
 								  warning)));
-		channel_fail_reconnect(channel, "Bad shutdown scriptpubkey %s",
+		channel_fail_transient(channel, "Bad shutdown scriptpubkey %s",
 				       tal_hex(tmpctx, scriptpubkey));
 		return;
 	}
@@ -3408,8 +3415,10 @@ bool peer_restart_dualopend(struct peer *peer,
 	if (!channel->owner) {
 		log_broken(channel->log, "Could not subdaemon channel: %s",
 			   strerror(errno));
-		channel_fail_reconnect_later(channel,
-					     "Failed to subdaemon channel");
+		/* Disconnect it. */
+		subd_send_msg(peer->ld->connectd,
+			      take(towire_connectd_discard_peer(NULL, &channel->peer->id,
+								channel->peer->connectd_counter)));
 		return false;
 	}
@@ -19,6 +19,7 @@
 #include <lightningd/chaintopology.h>
 #include <lightningd/channel.h>
 #include <lightningd/channel_control.h>
+#include <lightningd/connect_control.h>
 #include <lightningd/hsm_control.h>
 #include <lightningd/notification.h>
 #include <lightningd/opening_common.h>
@@ -100,6 +101,7 @@ wallet_commit_channel(struct lightningd *ld,
 	u32 lease_start_blockheight = 0; /* No leases on v1 */
 	struct short_channel_id *alias_local;
 	struct timeabs timestamp;
+	bool any_active = peer_any_active_channel(uc->peer, NULL);
 
 	/* We cannot both be the fundee *and* have a `fundchannel_start`
 	 * command running!
@@ -233,6 +235,15 @@ wallet_commit_channel(struct lightningd *ld,
 				 channel->state_change_cause,
 				 "new channel opened");
 
+
+	/* We might have disconnected and decided we didn't need to
+	 * reconnect because no channels are active.  But the subd
+	 * just made it active! */
+	if (!any_active && channel->peer->connected == PEER_DISCONNECTED) {
+		try_reconnect(channel->peer, channel->peer, 1,
+			      &channel->peer->addr);
+	}
+
 	return channel;
 }
@@ -166,7 +166,7 @@ static void peer_channels_cleanup(struct lightningd *ld,
 		c = channels[i];
 		if (channel_active(c)) {
 			channel_cleanup_commands(c, "Disconnected");
-			channel_fail_reconnect(c, "Disconnected");
+			channel_fail_transient(c, "Disconnected");
 		} else if (channel_unsaved(c)) {
 			channel_unsaved_close_conn(c, "Disconnected");
 		}
@@ -357,7 +357,7 @@ void channel_errmsg(struct channel *channel,
 	/* No peer_fd means a subd crash or disconnection. */
 	if (!peer_fd) {
 		/* If the channel is unsaved, we forget it */
-		channel_fail_reconnect(channel, "%s: %s",
+		channel_fail_transient(channel, "%s: %s",
 				       channel->owner->name, desc);
 		return;
 	}
@@ -371,8 +371,8 @@ void channel_errmsg(struct channel *channel,
 	 * and we would close the channel on them. We now support warnings
 	 * for this case. */
 	if (warning) {
-		channel_fail_reconnect_later(channel, "%s WARNING: %s",
-					     channel->owner->name, desc);
+		channel_fail_transient_delayreconnect(channel, "%s WARNING: %s",
+						      channel->owner->name, desc);
 		return;
 	}
@@ -1731,9 +1731,21 @@ static enum watch_result funding_depth_cb(struct lightningd *ld,
 
 	} else if (!short_channel_id_eq(channel->scid, &scid) &&
 		   !is_stub_scid(channel->scid)) {
-		/* This normally restarts channeld, initialized with updated scid
+		/* Send warning: that will make connectd disconnect, and then we'll
+		 * try to reconnect. */
+		u8 *warning = towire_warningfmt(tmpctx, &channel->cid,
+						"short_channel_id changed to %s (was %s)",
+						short_channel_id_to_str(tmpctx, &scid),
+						short_channel_id_to_str(tmpctx, channel->scid));
+		if (channel->peer->connected != PEER_DISCONNECTED)
+			subd_send_msg(ld->connectd,
+				      take(towire_connectd_peer_final_msg(NULL,
+									  &channel->peer->id,
+									  channel->peer->connectd_counter,
+									  warning)));
+		/* When we restart channeld, it will be initialized with updated scid
 		 * and also adds it (at least our halve_chan) to rtable. */
-		channel_fail_reconnect(channel,
+		channel_fail_transient_delayreconnect(channel,
 				       "short_channel_id changed to %s (was %s)",
 				       short_channel_id_to_str(tmpctx, &scid),
 				       short_channel_id_to_str(tmpctx, channel->scid));
@@ -68,14 +68,14 @@ void channel_fail_permanent(struct channel *channel UNNEEDED,
 			    const char *fmt UNNEEDED,
 			    ...)
 { fprintf(stderr, "channel_fail_permanent called!\n"); abort(); }
-/* Generated stub for channel_fail_reconnect */
-void channel_fail_reconnect(struct channel *channel UNNEEDED,
+/* Generated stub for channel_fail_transient */
+void channel_fail_transient(struct channel *channel UNNEEDED,
 			    const char *fmt UNNEEDED, ...)
-{ fprintf(stderr, "channel_fail_reconnect called!\n"); abort(); }
-/* Generated stub for channel_fail_reconnect_later */
-void channel_fail_reconnect_later(struct channel *channel UNNEEDED,
-				  const char *fmt UNNEEDED,...)
-{ fprintf(stderr, "channel_fail_reconnect_later called!\n"); abort(); }
+{ fprintf(stderr, "channel_fail_transient called!\n"); abort(); }
+/* Generated stub for channel_fail_transient_delayreconnect */
+void channel_fail_transient_delayreconnect(struct channel *channel UNNEEDED,
+					   const char *fmt UNNEEDED,...)
+{ fprintf(stderr, "channel_fail_transient_delayreconnect called!\n"); abort(); }
 /* Generated stub for channel_has_htlc_in */
 struct htlc_in *channel_has_htlc_in(struct channel *channel UNNEEDED)
 { fprintf(stderr, "channel_has_htlc_in called!\n"); abort(); }
@@ -3667,8 +3667,7 @@ We send an HTLC, and peer unilaterally closes: do we close upstream?
     with pytest.raises(RpcError, match=r'WIRE_TEMPORARY_CHANNEL_FAILURE \(reply from remote\)'):
         l1.rpc.waitsendpay(ph2, timeout=TIMEOUT)
 
-    # l3 closes unilaterally.
-    wait_for(lambda: only_one(l3.rpc.listpeers(l2.info['id'])['peers'])['connected'] is False)
+    # Make close unilaterally.
     l3.rpc.close(l2.info['id'], 1)
 
     l3.daemon.wait_for_log('sendrawtransaction')
@@ -451,7 +451,8 @@ def test_disconnect_opener(node_factory):
     l1.rpc.connect(l2.info['id'], 'localhost', l2.port)
     with pytest.raises(RpcError):
         l1.rpc.fundchannel(l2.info['id'], 25000)
-    assert l1.rpc.getpeer(l2.info['id']) is None
+    # First peer vanishes, but later it just disconnects
+    wait_for(lambda: all([p['connected'] is False for p in l1.rpc.listpeers()['peers']]))
 
     # This one will succeed.
     l1.rpc.connect(l2.info['id'], 'localhost', l2.port)
@@ -495,7 +496,8 @@ def test_disconnect_fundee(node_factory):
     l1.rpc.connect(l2.info['id'], 'localhost', l2.port)
     with pytest.raises(RpcError):
         l1.rpc.fundchannel(l2.info['id'], 25000)
-    assert l1.rpc.getpeer(l2.info['id']) is None
+    # First peer vanishes, but later it just disconnects
+    wait_for(lambda: all([p['connected'] is False for p in l1.rpc.listpeers()['peers']]))
 
     # This one will succeed.
     l1.rpc.connect(l2.info['id'], 'localhost', l2.port)
@@ -541,8 +543,8 @@ def test_disconnect_fundee_v2(node_factory):
         l1.rpc.fundchannel(l2.info['id'], 25000)
 
     # Should still only have one peer!
-    assert len(l1.rpc.listpeers()) == 1
-    assert len(l2.rpc.listpeers()) == 1
+    assert len(l1.rpc.listpeers()['peers']) == 1
+    assert len(l2.rpc.listpeers()['peers']) == 1
 
 
 @pytest.mark.developer
@@ -564,8 +566,8 @@ def test_disconnect_half_signed(node_factory):
         l1.rpc.fundchannel(l2.info['id'], 25000)
 
     # Peer remembers, opener doesn't.
-    assert l1.rpc.getpeer(l2.info['id']) is None
-    assert l2.rpc.getpeer(l1.info['id'])['id'] == l1.info['id']
+    wait_for(lambda: l1.rpc.listpeers(l2.info['id'])['peers'] == [])
+    assert len(only_one(l2.rpc.listpeers(l1.info['id'])['peers'])['channels']) == 1
 
 
 @pytest.mark.developer
@@ -3606,7 +3608,8 @@ def test_upgrade_statickey_onchaind(node_factory, executor, bitcoind):
     l2.wait_for_onchaind_broadcast('OUR_PENALTY_TX',
                                    'THEIR_REVOKED_UNILATERAL/DELAYED_CHEAT_OUTPUT_TO_THEM')
     bitcoind.generate_block(100)
-    wait_for(lambda: len(l2.rpc.listpeers()['peers']) == 0)
+    # This works even if they disconnect and listpeers() is empty:
+    wait_for(lambda: all([p['channels'] == [] for p in l2.rpc.listpeers()['peers']]))
 
     # TEST 2: Cheat from post-upgrade.
     node_factory.join_nodes([l1, l2])
@@ -3630,7 +3633,8 @@ def test_upgrade_statickey_onchaind(node_factory, executor, bitcoind):
     l2.wait_for_onchaind_broadcast('OUR_PENALTY_TX',
                                    'THEIR_REVOKED_UNILATERAL/DELAYED_CHEAT_OUTPUT_TO_THEM')
     bitcoind.generate_block(100)
-    wait_for(lambda: len(l2.rpc.listpeers()['peers']) == 0)
+    # This works even if they disconnect and listpeers() is empty:
+    wait_for(lambda: all([p['channels'] == [] for p in l2.rpc.listpeers()['peers']]))
 
     # TEST 3: Unilateral close from pre-upgrade
     node_factory.join_nodes([l1, l2])
@@ -3658,7 +3662,8 @@ def test_upgrade_statickey_onchaind(node_factory, executor, bitcoind):
     bitcoind.generate_block(5)
     bitcoind.generate_block(100, wait_for_mempool=1)
 
-    wait_for(lambda: len(l2.rpc.listpeers()['peers']) == 0)
+    # This works even if they disconnect and listpeers() is empty:
+    wait_for(lambda: all([p['channels'] == [] for p in l2.rpc.listpeers()['peers']]))
 
     # TEST 4: Unilateral close from post-upgrade
    node_factory.join_nodes([l1, l2])
@@ -3683,7 +3688,8 @@ def test_upgrade_statickey_onchaind(node_factory, executor, bitcoind):
     bitcoind.generate_block(5)
     bitcoind.generate_block(100, wait_for_mempool=1)
 
-    wait_for(lambda: len(l2.rpc.listpeers()['peers']) == 0)
+    # This works even if they disconnect and listpeers() is empty:
+    wait_for(lambda: all([p['channels'] == [] for p in l2.rpc.listpeers()['peers']]))
 
 
 @unittest.skipIf(not EXPERIMENTAL_FEATURES, "upgrade protocol not available")
@@ -1095,7 +1095,8 @@ def test_funding_reorg_private(node_factory, bitcoind):
     opts = {'funding-confirms': 2, 'rescan': 10, 'may_reconnect': True,
             'allow_bad_gossip': True,
             # gossipd send lightning update for original channel.
-            'allow_broken_log': True}
+            'allow_broken_log': True,
+            'allow_warning': True}
     l1, l2 = node_factory.line_graph(2, fundchannel=False, opts=opts)
     l1.fundwallet(10000000)
     sync_blockheight(bitcoind, [l1])  # height 102
@@ -1138,7 +1139,8 @@ def test_funding_reorg_remote_lags(node_factory, bitcoind):
     """Nodes may disagree about short_channel_id before channel announcement
     """
     # may_reconnect so channeld will restart; bad gossip can happen due to reorg
-    opts = {'funding-confirms': 1, 'may_reconnect': True, 'allow_bad_gossip': True}
+    opts = {'funding-confirms': 1, 'may_reconnect': True, 'allow_bad_gossip': True,
+            'allow_warning': True}
     l1, l2 = node_factory.line_graph(2, fundchannel=False, opts=opts)
     l1.fundwallet(10000000)
     sync_blockheight(bitcoind, [l1])  # height 102
@@ -129,10 +129,6 @@ def test_multifunding_v2_best_effort(node_factory, bitcoind):
     for node in node_list:
         node.daemon.wait_for_log(r'to CLOSINGD_COMPLETE')
 
-    # Make sure disconnections are complete
-    if not failed_sign:
-        wait_for(lambda: all([c['connected'] is False for c in l1.rpc.listpeers()['peers']]))
-
     # With 2 down, it will fail to fund channel
     l2.stop()
     l3.stop()