connectd: control connect backoff from lightningd.

We used to tell connectd to remember our connect delay, and hand it
back (increased if necessary).

Instead, simply record when we last tried to connect.  If it was less
than 10 minutes ago, double delay (up to 5 minutes max), otherwise
reset delay to 1 second.

This covers all scenarios: whether we reconnect then immediately
disconnect, or never successfully connect, it doesn't matter.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Fixes: #5453
This commit is contained in:
Rusty Russell 2022-07-28 11:00:36 +09:30
parent 9cad7d6a6a
commit 22ff007d64
13 changed files with 54 additions and 77 deletions

View File

@ -56,13 +56,6 @@
#define HSM_FD 3
#define GOSSIPCTL_FD 4
/*~ In C convention, constants are UPPERCASE macros. Not everything needs to
* be a constant, but it soothes the programmer's conscience to encapsulate
* arbitrary decisions like these in one place. */
#define MAX_CONNECT_ATTEMPTS 10
#define INITIAL_WAIT_SECONDS 1
#define MAX_WAIT_SECONDS 300
/* Peers we're trying to reach: we iterate through addrs until we succeed
* or fail. */
struct connecting {
@ -88,9 +81,6 @@ struct connecting {
/* Accumulated errors */
char *errors;
/* How many seconds did we wait this time? */
u32 seconds_waited;
};
/*~ C programs should generally be written bottom-to-top, with the root
@ -623,15 +613,13 @@ struct io_plan *connection_out(struct io_conn *conn, struct connecting *connect)
*/
static void connect_failed(struct daemon *daemon,
const struct node_id *id,
u32 seconds_waited,
const struct wireaddr_internal *addrhint,
errcode_t errcode,
const char *errfmt, ...)
PRINTF_FMT(6,7);
PRINTF_FMT(5,6);
static void connect_failed(struct daemon *daemon,
const struct node_id *id,
u32 seconds_waited,
const struct wireaddr_internal *addrhint,
errcode_t errcode,
const char *errfmt, ...)
@ -639,27 +627,18 @@ static void connect_failed(struct daemon *daemon,
u8 *msg;
va_list ap;
char *errmsg;
u32 wait_seconds;
va_start(ap, errfmt);
errmsg = tal_vfmt(tmpctx, errfmt, ap);
va_end(ap);
/* Wait twice as long to reconnect, between min and max. */
wait_seconds = seconds_waited * 2;
if (wait_seconds > MAX_WAIT_SECONDS)
wait_seconds = MAX_WAIT_SECONDS;
if (wait_seconds < INITIAL_WAIT_SECONDS)
wait_seconds = INITIAL_WAIT_SECONDS;
status_peer_debug(id, "Failed connected out: %s", errmsg);
/* lightningd may have a connect command waiting to know what
* happened. We leave it to lightningd to decide if it wants to try
* again, with the wait_seconds as a hint of how long before
* asking. */
* again. */
msg = towire_connectd_connect_failed(NULL, id, errcode, errmsg,
wait_seconds, addrhint);
addrhint);
daemon_conn_send(daemon->master, take(msg));
}
@ -801,7 +780,6 @@ static void try_connect_one_addr(struct connecting *connect)
/* Out of addresses? */
if (connect->addrnum == tal_count(connect->addrs)) {
connect_failed(connect->daemon, &connect->id,
connect->seconds_waited,
connect->addrhint, CONNECT_ALL_ADDRESSES_FAILED,
"All addresses failed: %s",
connect->errors);
@ -1748,7 +1726,6 @@ static void add_gossip_addrs(struct wireaddr_internal **addrs,
* caller so it's marginal. */
static void try_connect_peer(struct daemon *daemon,
const struct node_id *id,
u32 seconds_waited,
struct wireaddr *gossip_addrs,
struct wireaddr_internal *addrhint STEALS)
{
@ -1805,7 +1782,7 @@ static void try_connect_peer(struct daemon *daemon,
/* Still no address? Fail immediately. Lightningd can still choose
* to retry; an address may get gossiped or appear on the DNS seed. */
if (tal_count(addrs) == 0) {
connect_failed(daemon, id, seconds_waited, addrhint,
connect_failed(daemon, id, addrhint,
CONNECT_NO_KNOWN_ADDRESS,
"Unable to connect, no address known for peer");
return;
@ -1822,7 +1799,6 @@ static void try_connect_peer(struct daemon *daemon,
* errors which occur. We miss it in a few places; would be nice to
* fix! */
connect->connstate = "Connection establishment";
connect->seconds_waited = seconds_waited;
connect->addrhint = tal_steal(connect, addrhint);
connect->errors = tal_strdup(connect, "");
connect->conn = NULL;
@ -1837,16 +1813,14 @@ static void try_connect_peer(struct daemon *daemon,
static void connect_to_peer(struct daemon *daemon, const u8 *msg)
{
struct node_id id;
u32 seconds_waited;
struct wireaddr_internal *addrhint;
struct wireaddr *addrs;
if (!fromwire_connectd_connect_to_peer(tmpctx, msg,
&id, &seconds_waited,
&addrs, &addrhint))
&id, &addrs, &addrhint))
master_badmsg(WIRE_CONNECTD_CONNECT_TO_PEER, msg);
try_connect_peer(daemon, &id, seconds_waited, addrs, addrhint);
try_connect_peer(daemon, &id, addrs, addrhint);
}
/* lightningd tells us a peer should be disconnected. */

View File

@ -47,7 +47,6 @@ msgdata,connectd_activate_reply,failmsg,?wirestring,
# Master -> connectd: connect to a peer.
msgtype,connectd_connect_to_peer,2001
msgdata,connectd_connect_to_peer,id,node_id,
msgdata,connectd_connect_to_peer,seconds_waited,u32,
msgdata,connectd_connect_to_peer,len,u32,
msgdata,connectd_connect_to_peer,addrs,wireaddr,len
msgdata,connectd_connect_to_peer,addrhint,?wireaddr_internal,
@ -57,7 +56,6 @@ msgtype,connectd_connect_failed,2020
msgdata,connectd_connect_failed,id,node_id,
msgdata,connectd_connect_failed,failcode,errcode_t,
msgdata,connectd_connect_failed,failreason,wirestring,
msgdata,connectd_connect_failed,seconds_to_delay,u32,
msgdata,connectd_connect_failed,addrhint,?wireaddr_internal,
# Connectd -> master: we got a peer.

1 #include <bitcoin/block.h>
47 msgdata,connectd_connect_to_peer,addrs,wireaddr,len msgdata,connectd_connect_to_peer,addrhint,?wireaddr_internal,
48 msgdata,connectd_connect_to_peer,addrhint,?wireaddr_internal, # Connectd->master: connect failed.
49 # Connectd->master: connect failed. msgtype,connectd_connect_failed,2020
msgtype,connectd_connect_failed,2020
50 msgdata,connectd_connect_failed,id,node_id,
51 msgdata,connectd_connect_failed,failcode,errcode_t,
52 msgdata,connectd_connect_failed,failreason,wirestring,
56 msgtype,connectd_peer_connected,2002 msgdata,connectd_peer_connected,id,node_id,
57 msgdata,connectd_peer_connected,id,node_id, msgdata,connectd_peer_connected,counter,u64,
58 msgdata,connectd_peer_connected,counter,u64, msgdata,connectd_peer_connected,addr,wireaddr_internal,
msgdata,connectd_peer_connected,addr,wireaddr_internal,
59 msgdata,connectd_peer_connected,remote_addr,?wireaddr,
60 msgdata,connectd_peer_connected,incoming,bool,
61 msgdata,connectd_peer_connected,flen,u16,

View File

@ -929,9 +929,7 @@ void channel_set_billboard(struct channel *channel, bool perm, const char *str)
}
}
static void channel_err(struct channel *channel,
const char *why,
bool delay_reconnect)
static void channel_err(struct channel *channel, const char *why)
{
log_info(channel->log, "Peer transient failure in %s: %s",
channel_state_name(channel), why);
@ -944,8 +942,6 @@ static void channel_err(struct channel *channel,
return;
}
#endif
channel->peer->delay_reconnect = delay_reconnect;
channel_set_owner(channel, NULL);
}
@ -954,7 +950,7 @@ void channel_fail_transient_delayreconnect(struct channel *channel, const char *
va_list ap;
va_start(ap, fmt);
channel_err(channel, tal_vfmt(tmpctx, fmt, ap), true);
channel_err(channel, tal_vfmt(tmpctx, fmt, ap));
va_end(ap);
}
@ -963,7 +959,7 @@ void channel_fail_transient(struct channel *channel, const char *fmt, ...)
va_list ap;
va_start(ap, fmt);
channel_err(channel, tal_vfmt(tmpctx, fmt, ap), false);
channel_err(channel, tal_vfmt(tmpctx, fmt, ap));
va_end(ap);
}

View File

@ -247,7 +247,6 @@ AUTODATA(json_command, &connect_command);
struct delayed_reconnect {
struct lightningd *ld;
struct node_id id;
u32 seconds_delayed;
struct wireaddr_internal *addrhint;
};
@ -265,7 +264,6 @@ static void gossipd_got_addrs(struct subd *subd,
connectmsg = towire_connectd_connect_to_peer(NULL,
&d->id,
d->seconds_delayed,
addrs,
d->addrhint);
subd_send_msg(d->ld->connectd, take(connectmsg));
@ -280,7 +278,6 @@ static void do_connect(struct delayed_reconnect *d)
subd_req(d, d->ld->gossip, take(msg), -1, 0, gossipd_got_addrs, d);
}
/* peer may be NULL here */
static void try_connect(const tal_t *ctx,
struct lightningd *ld,
const struct node_id *id,
@ -293,7 +290,6 @@ static void try_connect(const tal_t *ctx,
d = tal(ctx, struct delayed_reconnect);
d->ld = ld;
d->id = *id;
d->seconds_delayed = seconds_delay;
d->addrhint = tal_dup_or_null(d, struct wireaddr_internal, addrhint);
if (!seconds_delay) {
@ -316,6 +312,7 @@ static void try_connect(const tal_t *ctx,
"in %u seconds",
seconds_delay));
}
peer->last_connect_attempt = time_now();
}
/* We fuzz the timer by up to 1 second, to avoid getting into
@ -326,18 +323,34 @@ static void try_connect(const tal_t *ctx,
do_connect, d));
}
/*~ In C convention, constants are UPPERCASE macros. Not everything needs to
* be a constant, but it soothes the programmer's conscience to encapsulate
* arbitrary decisions like these in one place. */
#define INITIAL_WAIT_SECONDS 1
#define MAX_WAIT_SECONDS 300
void try_reconnect(const tal_t *ctx,
struct peer *peer,
u32 seconds_delay,
const struct wireaddr_internal *addrhint)
{
if (!peer->ld->reconnect)
return;
/* Did we last attempt to connect recently? Enter backoff mode. */
if (time_less(time_between(time_now(), peer->last_connect_attempt),
time_from_sec(MAX_WAIT_SECONDS * 2))) {
u32 max = DEV_FAST_RECONNECT(peer->ld->dev_fast_reconnect,
3, MAX_WAIT_SECONDS);
peer->reconnect_delay *= 2;
if (peer->reconnect_delay > max)
peer->reconnect_delay = max;
} else
peer->reconnect_delay = INITIAL_WAIT_SECONDS;
try_connect(ctx,
peer->ld,
&peer->id,
seconds_delay,
peer->reconnect_delay,
addrhint);
}
@ -346,7 +359,6 @@ static void connect_failed(struct lightningd *ld,
const struct node_id *id,
errcode_t errcode,
const char *errmsg,
const u32 *seconds_to_delay,
const struct wireaddr_internal *addrhint)
{
struct peer *peer;
@ -361,17 +373,10 @@ static void connect_failed(struct lightningd *ld,
/* If we have an active channel, then reconnect. */
peer = peer_by_id(ld, id);
if (peer && peer_any_active_channel(peer, NULL)) {
u32 delay;
if (seconds_to_delay)
delay = *seconds_to_delay;
else if (peer->delay_reconnect)
delay = DEV_FAST_RECONNECT(ld->dev_fast_reconnect, 3, 60);
else
delay = 1;
log_peer_debug(ld->log, id, "Reconnecting in %u seconds", delay);
try_reconnect(peer, peer, delay, addrhint);
try_reconnect(peer, peer, addrhint);
} else
log_peer_debug(ld->log, id, "Not reconnecting: %s", peer ? "no active channel" : "no channels");
log_peer_debug(ld->log, id, "Not reconnecting: %s",
peer ? "no active channel" : "no channels");
}
void connect_failed_disconnect(struct lightningd *ld,
@ -379,7 +384,7 @@ void connect_failed_disconnect(struct lightningd *ld,
const struct wireaddr_internal *addrhint)
{
connect_failed(ld, id, CONNECT_DISCONNECTED_DURING,
"disconnected during connection", NULL, addrhint);
"disconnected during connection", addrhint);
}
static void handle_connect_failed(struct lightningd *ld, const u8 *msg)
@ -387,15 +392,14 @@ static void handle_connect_failed(struct lightningd *ld, const u8 *msg)
struct node_id id;
errcode_t errcode;
char *errmsg;
u32 seconds_to_delay;
struct wireaddr_internal *addrhint;
if (!fromwire_connectd_connect_failed(tmpctx, msg, &id, &errcode, &errmsg,
&seconds_to_delay, &addrhint))
&addrhint))
fatal("Connect gave bad CONNECTD_CONNECT_FAILED message %s",
tal_hex(msg, msg));
connect_failed(ld, &id, errcode, errmsg, &seconds_to_delay, addrhint);
connect_failed(ld, &id, errcode, errmsg, addrhint);
}
void connect_succeeded(struct lightningd *ld, const struct peer *peer,

View File

@ -20,7 +20,6 @@ void connectd_activate(struct lightningd *ld);
void try_reconnect(const tal_t *ctx,
struct peer *peer,
u32 seconds_delay,
const struct wireaddr_internal *addrhint);
void connect_succeeded(struct lightningd *ld, const struct peer *peer,
bool incoming,

View File

@ -1296,7 +1296,7 @@ wallet_commit_channel(struct lightningd *ld,
* reconnect because no channels are active. But the subd
* just made it active! */
if (!any_active && channel->peer->connected == PEER_DISCONNECTED) {
try_reconnect(channel->peer, channel->peer, 1,
try_reconnect(channel->peer, channel->peer,
&channel->peer->addr);
}

View File

@ -240,7 +240,7 @@ wallet_commit_channel(struct lightningd *ld,
* reconnect because no channels are active. But the subd
* just made it active! */
if (!any_active && channel->peer->connected == PEER_DISCONNECTED) {
try_reconnect(channel->peer, channel->peer, 1,
try_reconnect(channel->peer, channel->peer,
&channel->peer->addr);
}

View File

@ -695,7 +695,7 @@ static void dev_register_opts(struct lightningd *ld)
"Disable automatic reconnect-attempts by this node, but accept incoming");
opt_register_noarg("--dev-fast-reconnect", opt_set_bool,
&ld->dev_fast_reconnect,
"Make default reconnect delay 3 (not 60) seconds");
"Make max default reconnect delay 3 (not 300) seconds");
opt_register_noarg("--dev-fail-on-subdaemon-fail", opt_set_bool,
&ld->dev_subdaemon_fail, opt_hidden);

View File

@ -100,7 +100,8 @@ struct peer *new_peer(struct lightningd *ld, u64 dbid,
list_head_init(&peer->channels);
peer->direction = node_id_idx(&peer->ld->id, &peer->id);
peer->connected = PEER_DISCONNECTED;
peer->delay_reconnect = false;
peer->last_connect_attempt.ts.tv_sec
= peer->last_connect_attempt.ts.tv_nsec = 0;
#if DEVELOPER
peer->ignore_htlcs = false;
#endif
@ -1330,7 +1331,6 @@ void peer_connected(struct lightningd *ld, const u8 *msg)
/* We mark peer in "connecting" state until hooks have passed. */
assert(peer->connected == PEER_DISCONNECTED);
peer->connected = PEER_CONNECTING;
peer->delay_reconnect = false;
/* Update peer address and direction */
peer->addr = hook_payload->addr;
@ -2025,9 +2025,14 @@ static void setup_peer(struct peer *peer, u32 delay)
}
/* Make sure connectd knows to try reconnecting. */
if (connect)
try_reconnect(peer, peer, delay, &peer->addr);
if (connect) {
/* To delay, make it seem like we just connected. */
if (delay > 0) {
peer->reconnect_delay = delay;
peer->last_connect_attempt = time_now();
}
try_reconnect(peer, peer, &peer->addr);
}
}
void setup_peers(struct lightningd *ld)

View File

@ -30,8 +30,10 @@ struct peer {
/* Connection counter from connectd. */
u64 connectd_counter;
/* Did we fail badly last time? Don't reconnect too fast. */
bool delay_reconnect;
/* Last reconnect: if it's recent, we delay by reconnect_delay,
* doubling each time. */
struct timeabs last_connect_attempt;
u32 reconnect_delay;
/* Our channels */
struct list_head channels;

View File

@ -764,7 +764,6 @@ u8 *towire_warningfmt(const tal_t *ctx UNNEEDED,
/* Generated stub for try_reconnect */
void try_reconnect(const tal_t *ctx UNNEEDED,
struct peer *peer UNNEEDED,
u32 seconds_delay UNNEEDED,
const struct wireaddr_internal *addrhint UNNEEDED)
{ fprintf(stderr, "try_reconnect called!\n"); abort(); }
/* Generated stub for version */

View File

@ -3242,11 +3242,12 @@ def test_feerate_spam(node_factory, chainparams):
l1.daemon.wait_for_log('peer_out WIRE_UPDATE_FEE', timeout=5)
@pytest.mark.developer("need dev-feerate")
@pytest.mark.developer("need dev-feerate, dev-fast-reconnect")
def test_feerate_stress(node_factory, executor):
# Third node makes HTLC traffic less predictable.
l1, l2, l3 = node_factory.line_graph(3, opts={'commit-time': 100,
'may_reconnect': True})
'may_reconnect': True,
'dev-fast-reconnect': None})
l1.pay(l2, 10**9 // 2)
scid12 = l1.get_channel_scid(l2)

View File

@ -839,7 +839,6 @@ u8 *towire_warningfmt(const tal_t *ctx UNNEEDED,
/* Generated stub for try_reconnect */
void try_reconnect(const tal_t *ctx UNNEEDED,
struct peer *peer UNNEEDED,
u32 seconds_delay UNNEEDED,
const struct wireaddr_internal *addrhint UNNEEDED)
{ fprintf(stderr, "try_reconnect called!\n"); abort(); }
/* Generated stub for watch_txid */