mirror of
https://github.com/ElementsProject/lightning.git
synced 2024-11-19 01:43:36 +01:00
connectd: control connect backoff from lightningd.
We used to tell connectd to remember our connect delay, and hand it back (increased if necessary). Instead, simply record when we last tried to connect. If it was less than 10 minutes ago, double delay (up to 5 minutes max), otherwise reset delay to 1 second. This covers all scenarios: whether we reconnect then immediately disconnect, or never successfully connect, it doesn't matter. Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> Fixes: #5453
This commit is contained in:
parent
9cad7d6a6a
commit
22ff007d64
@ -56,13 +56,6 @@
|
||||
#define HSM_FD 3
|
||||
#define GOSSIPCTL_FD 4
|
||||
|
||||
/*~ In C convention, constants are UPPERCASE macros. Not everything needs to
|
||||
* be a constant, but it soothes the programmer's conscience to encapsulate
|
||||
* arbitrary decisions like these in one place. */
|
||||
#define MAX_CONNECT_ATTEMPTS 10
|
||||
#define INITIAL_WAIT_SECONDS 1
|
||||
#define MAX_WAIT_SECONDS 300
|
||||
|
||||
/* Peers we're trying to reach: we iterate through addrs until we succeed
|
||||
* or fail. */
|
||||
struct connecting {
|
||||
@ -88,9 +81,6 @@ struct connecting {
|
||||
|
||||
/* Accumulated errors */
|
||||
char *errors;
|
||||
|
||||
/* How many seconds did we wait this time? */
|
||||
u32 seconds_waited;
|
||||
};
|
||||
|
||||
/*~ C programs should generally be written bottom-to-top, with the root
|
||||
@ -623,15 +613,13 @@ struct io_plan *connection_out(struct io_conn *conn, struct connecting *connect)
|
||||
*/
|
||||
static void connect_failed(struct daemon *daemon,
|
||||
const struct node_id *id,
|
||||
u32 seconds_waited,
|
||||
const struct wireaddr_internal *addrhint,
|
||||
errcode_t errcode,
|
||||
const char *errfmt, ...)
|
||||
PRINTF_FMT(6,7);
|
||||
PRINTF_FMT(5,6);
|
||||
|
||||
static void connect_failed(struct daemon *daemon,
|
||||
const struct node_id *id,
|
||||
u32 seconds_waited,
|
||||
const struct wireaddr_internal *addrhint,
|
||||
errcode_t errcode,
|
||||
const char *errfmt, ...)
|
||||
@ -639,27 +627,18 @@ static void connect_failed(struct daemon *daemon,
|
||||
u8 *msg;
|
||||
va_list ap;
|
||||
char *errmsg;
|
||||
u32 wait_seconds;
|
||||
|
||||
va_start(ap, errfmt);
|
||||
errmsg = tal_vfmt(tmpctx, errfmt, ap);
|
||||
va_end(ap);
|
||||
|
||||
/* Wait twice as long to reconnect, between min and max. */
|
||||
wait_seconds = seconds_waited * 2;
|
||||
if (wait_seconds > MAX_WAIT_SECONDS)
|
||||
wait_seconds = MAX_WAIT_SECONDS;
|
||||
if (wait_seconds < INITIAL_WAIT_SECONDS)
|
||||
wait_seconds = INITIAL_WAIT_SECONDS;
|
||||
|
||||
status_peer_debug(id, "Failed connected out: %s", errmsg);
|
||||
|
||||
/* lightningd may have a connect command waiting to know what
|
||||
* happened. We leave it to lightningd to decide if it wants to try
|
||||
* again, with the wait_seconds as a hint of how long before
|
||||
* asking. */
|
||||
* again. */
|
||||
msg = towire_connectd_connect_failed(NULL, id, errcode, errmsg,
|
||||
wait_seconds, addrhint);
|
||||
addrhint);
|
||||
daemon_conn_send(daemon->master, take(msg));
|
||||
}
|
||||
|
||||
@ -801,7 +780,6 @@ static void try_connect_one_addr(struct connecting *connect)
|
||||
/* Out of addresses? */
|
||||
if (connect->addrnum == tal_count(connect->addrs)) {
|
||||
connect_failed(connect->daemon, &connect->id,
|
||||
connect->seconds_waited,
|
||||
connect->addrhint, CONNECT_ALL_ADDRESSES_FAILED,
|
||||
"All addresses failed: %s",
|
||||
connect->errors);
|
||||
@ -1748,7 +1726,6 @@ static void add_gossip_addrs(struct wireaddr_internal **addrs,
|
||||
* caller so it's marginal. */
|
||||
static void try_connect_peer(struct daemon *daemon,
|
||||
const struct node_id *id,
|
||||
u32 seconds_waited,
|
||||
struct wireaddr *gossip_addrs,
|
||||
struct wireaddr_internal *addrhint STEALS)
|
||||
{
|
||||
@ -1805,7 +1782,7 @@ static void try_connect_peer(struct daemon *daemon,
|
||||
/* Still no address? Fail immediately. Lightningd can still choose
|
||||
* to retry; an address may get gossiped or appear on the DNS seed. */
|
||||
if (tal_count(addrs) == 0) {
|
||||
connect_failed(daemon, id, seconds_waited, addrhint,
|
||||
connect_failed(daemon, id, addrhint,
|
||||
CONNECT_NO_KNOWN_ADDRESS,
|
||||
"Unable to connect, no address known for peer");
|
||||
return;
|
||||
@ -1822,7 +1799,6 @@ static void try_connect_peer(struct daemon *daemon,
|
||||
* errors which occur. We miss it in a few places; would be nice to
|
||||
* fix! */
|
||||
connect->connstate = "Connection establishment";
|
||||
connect->seconds_waited = seconds_waited;
|
||||
connect->addrhint = tal_steal(connect, addrhint);
|
||||
connect->errors = tal_strdup(connect, "");
|
||||
connect->conn = NULL;
|
||||
@ -1837,16 +1813,14 @@ static void try_connect_peer(struct daemon *daemon,
|
||||
static void connect_to_peer(struct daemon *daemon, const u8 *msg)
|
||||
{
|
||||
struct node_id id;
|
||||
u32 seconds_waited;
|
||||
struct wireaddr_internal *addrhint;
|
||||
struct wireaddr *addrs;
|
||||
|
||||
if (!fromwire_connectd_connect_to_peer(tmpctx, msg,
|
||||
&id, &seconds_waited,
|
||||
&addrs, &addrhint))
|
||||
&id, &addrs, &addrhint))
|
||||
master_badmsg(WIRE_CONNECTD_CONNECT_TO_PEER, msg);
|
||||
|
||||
try_connect_peer(daemon, &id, seconds_waited, addrs, addrhint);
|
||||
try_connect_peer(daemon, &id, addrs, addrhint);
|
||||
}
|
||||
|
||||
/* lightningd tells us a peer should be disconnected. */
|
||||
|
@ -47,7 +47,6 @@ msgdata,connectd_activate_reply,failmsg,?wirestring,
|
||||
# Master -> connectd: connect to a peer.
|
||||
msgtype,connectd_connect_to_peer,2001
|
||||
msgdata,connectd_connect_to_peer,id,node_id,
|
||||
msgdata,connectd_connect_to_peer,seconds_waited,u32,
|
||||
msgdata,connectd_connect_to_peer,len,u32,
|
||||
msgdata,connectd_connect_to_peer,addrs,wireaddr,len
|
||||
msgdata,connectd_connect_to_peer,addrhint,?wireaddr_internal,
|
||||
@ -57,7 +56,6 @@ msgtype,connectd_connect_failed,2020
|
||||
msgdata,connectd_connect_failed,id,node_id,
|
||||
msgdata,connectd_connect_failed,failcode,errcode_t,
|
||||
msgdata,connectd_connect_failed,failreason,wirestring,
|
||||
msgdata,connectd_connect_failed,seconds_to_delay,u32,
|
||||
msgdata,connectd_connect_failed,addrhint,?wireaddr_internal,
|
||||
|
||||
# Connectd -> master: we got a peer.
|
||||
|
|
@ -929,9 +929,7 @@ void channel_set_billboard(struct channel *channel, bool perm, const char *str)
|
||||
}
|
||||
}
|
||||
|
||||
static void channel_err(struct channel *channel,
|
||||
const char *why,
|
||||
bool delay_reconnect)
|
||||
static void channel_err(struct channel *channel, const char *why)
|
||||
{
|
||||
log_info(channel->log, "Peer transient failure in %s: %s",
|
||||
channel_state_name(channel), why);
|
||||
@ -944,8 +942,6 @@ static void channel_err(struct channel *channel,
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
channel->peer->delay_reconnect = delay_reconnect;
|
||||
|
||||
channel_set_owner(channel, NULL);
|
||||
}
|
||||
|
||||
@ -954,7 +950,7 @@ void channel_fail_transient_delayreconnect(struct channel *channel, const char *
|
||||
va_list ap;
|
||||
|
||||
va_start(ap, fmt);
|
||||
channel_err(channel, tal_vfmt(tmpctx, fmt, ap), true);
|
||||
channel_err(channel, tal_vfmt(tmpctx, fmt, ap));
|
||||
va_end(ap);
|
||||
}
|
||||
|
||||
@ -963,7 +959,7 @@ void channel_fail_transient(struct channel *channel, const char *fmt, ...)
|
||||
va_list ap;
|
||||
|
||||
va_start(ap, fmt);
|
||||
channel_err(channel, tal_vfmt(tmpctx, fmt, ap), false);
|
||||
channel_err(channel, tal_vfmt(tmpctx, fmt, ap));
|
||||
va_end(ap);
|
||||
}
|
||||
|
||||
|
@ -247,7 +247,6 @@ AUTODATA(json_command, &connect_command);
|
||||
struct delayed_reconnect {
|
||||
struct lightningd *ld;
|
||||
struct node_id id;
|
||||
u32 seconds_delayed;
|
||||
struct wireaddr_internal *addrhint;
|
||||
};
|
||||
|
||||
@ -265,7 +264,6 @@ static void gossipd_got_addrs(struct subd *subd,
|
||||
|
||||
connectmsg = towire_connectd_connect_to_peer(NULL,
|
||||
&d->id,
|
||||
d->seconds_delayed,
|
||||
addrs,
|
||||
d->addrhint);
|
||||
subd_send_msg(d->ld->connectd, take(connectmsg));
|
||||
@ -280,7 +278,6 @@ static void do_connect(struct delayed_reconnect *d)
|
||||
subd_req(d, d->ld->gossip, take(msg), -1, 0, gossipd_got_addrs, d);
|
||||
}
|
||||
|
||||
/* peer may be NULL here */
|
||||
static void try_connect(const tal_t *ctx,
|
||||
struct lightningd *ld,
|
||||
const struct node_id *id,
|
||||
@ -293,7 +290,6 @@ static void try_connect(const tal_t *ctx,
|
||||
d = tal(ctx, struct delayed_reconnect);
|
||||
d->ld = ld;
|
||||
d->id = *id;
|
||||
d->seconds_delayed = seconds_delay;
|
||||
d->addrhint = tal_dup_or_null(d, struct wireaddr_internal, addrhint);
|
||||
|
||||
if (!seconds_delay) {
|
||||
@ -316,6 +312,7 @@ static void try_connect(const tal_t *ctx,
|
||||
"in %u seconds",
|
||||
seconds_delay));
|
||||
}
|
||||
peer->last_connect_attempt = time_now();
|
||||
}
|
||||
|
||||
/* We fuzz the timer by up to 1 second, to avoid getting into
|
||||
@ -326,18 +323,34 @@ static void try_connect(const tal_t *ctx,
|
||||
do_connect, d));
|
||||
}
|
||||
|
||||
/*~ In C convention, constants are UPPERCASE macros. Not everything needs to
|
||||
* be a constant, but it soothes the programmer's conscience to encapsulate
|
||||
* arbitrary decisions like these in one place. */
|
||||
#define INITIAL_WAIT_SECONDS 1
|
||||
#define MAX_WAIT_SECONDS 300
|
||||
|
||||
void try_reconnect(const tal_t *ctx,
|
||||
struct peer *peer,
|
||||
u32 seconds_delay,
|
||||
const struct wireaddr_internal *addrhint)
|
||||
{
|
||||
if (!peer->ld->reconnect)
|
||||
return;
|
||||
|
||||
/* Did we last attempt to connect recently? Enter backoff mode. */
|
||||
if (time_less(time_between(time_now(), peer->last_connect_attempt),
|
||||
time_from_sec(MAX_WAIT_SECONDS * 2))) {
|
||||
u32 max = DEV_FAST_RECONNECT(peer->ld->dev_fast_reconnect,
|
||||
3, MAX_WAIT_SECONDS);
|
||||
peer->reconnect_delay *= 2;
|
||||
if (peer->reconnect_delay > max)
|
||||
peer->reconnect_delay = max;
|
||||
} else
|
||||
peer->reconnect_delay = INITIAL_WAIT_SECONDS;
|
||||
|
||||
try_connect(ctx,
|
||||
peer->ld,
|
||||
&peer->id,
|
||||
seconds_delay,
|
||||
peer->reconnect_delay,
|
||||
addrhint);
|
||||
}
|
||||
|
||||
@ -346,7 +359,6 @@ static void connect_failed(struct lightningd *ld,
|
||||
const struct node_id *id,
|
||||
errcode_t errcode,
|
||||
const char *errmsg,
|
||||
const u32 *seconds_to_delay,
|
||||
const struct wireaddr_internal *addrhint)
|
||||
{
|
||||
struct peer *peer;
|
||||
@ -361,17 +373,10 @@ static void connect_failed(struct lightningd *ld,
|
||||
/* If we have an active channel, then reconnect. */
|
||||
peer = peer_by_id(ld, id);
|
||||
if (peer && peer_any_active_channel(peer, NULL)) {
|
||||
u32 delay;
|
||||
if (seconds_to_delay)
|
||||
delay = *seconds_to_delay;
|
||||
else if (peer->delay_reconnect)
|
||||
delay = DEV_FAST_RECONNECT(ld->dev_fast_reconnect, 3, 60);
|
||||
else
|
||||
delay = 1;
|
||||
log_peer_debug(ld->log, id, "Reconnecting in %u seconds", delay);
|
||||
try_reconnect(peer, peer, delay, addrhint);
|
||||
try_reconnect(peer, peer, addrhint);
|
||||
} else
|
||||
log_peer_debug(ld->log, id, "Not reconnecting: %s", peer ? "no active channel" : "no channels");
|
||||
log_peer_debug(ld->log, id, "Not reconnecting: %s",
|
||||
peer ? "no active channel" : "no channels");
|
||||
}
|
||||
|
||||
void connect_failed_disconnect(struct lightningd *ld,
|
||||
@ -379,7 +384,7 @@ void connect_failed_disconnect(struct lightningd *ld,
|
||||
const struct wireaddr_internal *addrhint)
|
||||
{
|
||||
connect_failed(ld, id, CONNECT_DISCONNECTED_DURING,
|
||||
"disconnected during connection", NULL, addrhint);
|
||||
"disconnected during connection", addrhint);
|
||||
}
|
||||
|
||||
static void handle_connect_failed(struct lightningd *ld, const u8 *msg)
|
||||
@ -387,15 +392,14 @@ static void handle_connect_failed(struct lightningd *ld, const u8 *msg)
|
||||
struct node_id id;
|
||||
errcode_t errcode;
|
||||
char *errmsg;
|
||||
u32 seconds_to_delay;
|
||||
struct wireaddr_internal *addrhint;
|
||||
|
||||
if (!fromwire_connectd_connect_failed(tmpctx, msg, &id, &errcode, &errmsg,
|
||||
&seconds_to_delay, &addrhint))
|
||||
&addrhint))
|
||||
fatal("Connect gave bad CONNECTD_CONNECT_FAILED message %s",
|
||||
tal_hex(msg, msg));
|
||||
|
||||
connect_failed(ld, &id, errcode, errmsg, &seconds_to_delay, addrhint);
|
||||
connect_failed(ld, &id, errcode, errmsg, addrhint);
|
||||
}
|
||||
|
||||
void connect_succeeded(struct lightningd *ld, const struct peer *peer,
|
||||
|
@ -20,7 +20,6 @@ void connectd_activate(struct lightningd *ld);
|
||||
|
||||
void try_reconnect(const tal_t *ctx,
|
||||
struct peer *peer,
|
||||
u32 seconds_delay,
|
||||
const struct wireaddr_internal *addrhint);
|
||||
void connect_succeeded(struct lightningd *ld, const struct peer *peer,
|
||||
bool incoming,
|
||||
|
@ -1296,7 +1296,7 @@ wallet_commit_channel(struct lightningd *ld,
|
||||
* reconnect because no channels are active. But the subd
|
||||
* just made it active! */
|
||||
if (!any_active && channel->peer->connected == PEER_DISCONNECTED) {
|
||||
try_reconnect(channel->peer, channel->peer, 1,
|
||||
try_reconnect(channel->peer, channel->peer,
|
||||
&channel->peer->addr);
|
||||
}
|
||||
|
||||
|
@ -240,7 +240,7 @@ wallet_commit_channel(struct lightningd *ld,
|
||||
* reconnect because no channels are active. But the subd
|
||||
* just made it active! */
|
||||
if (!any_active && channel->peer->connected == PEER_DISCONNECTED) {
|
||||
try_reconnect(channel->peer, channel->peer, 1,
|
||||
try_reconnect(channel->peer, channel->peer,
|
||||
&channel->peer->addr);
|
||||
}
|
||||
|
||||
|
@ -695,7 +695,7 @@ static void dev_register_opts(struct lightningd *ld)
|
||||
"Disable automatic reconnect-attempts by this node, but accept incoming");
|
||||
opt_register_noarg("--dev-fast-reconnect", opt_set_bool,
|
||||
&ld->dev_fast_reconnect,
|
||||
"Make default reconnect delay 3 (not 60) seconds");
|
||||
"Make max default reconnect delay 3 (not 300) seconds");
|
||||
|
||||
opt_register_noarg("--dev-fail-on-subdaemon-fail", opt_set_bool,
|
||||
&ld->dev_subdaemon_fail, opt_hidden);
|
||||
|
@ -100,7 +100,8 @@ struct peer *new_peer(struct lightningd *ld, u64 dbid,
|
||||
list_head_init(&peer->channels);
|
||||
peer->direction = node_id_idx(&peer->ld->id, &peer->id);
|
||||
peer->connected = PEER_DISCONNECTED;
|
||||
peer->delay_reconnect = false;
|
||||
peer->last_connect_attempt.ts.tv_sec
|
||||
= peer->last_connect_attempt.ts.tv_nsec = 0;
|
||||
#if DEVELOPER
|
||||
peer->ignore_htlcs = false;
|
||||
#endif
|
||||
@ -1330,7 +1331,6 @@ void peer_connected(struct lightningd *ld, const u8 *msg)
|
||||
/* We mark peer in "connecting" state until hooks have passed. */
|
||||
assert(peer->connected == PEER_DISCONNECTED);
|
||||
peer->connected = PEER_CONNECTING;
|
||||
peer->delay_reconnect = false;
|
||||
|
||||
/* Update peer address and direction */
|
||||
peer->addr = hook_payload->addr;
|
||||
@ -2025,9 +2025,14 @@ static void setup_peer(struct peer *peer, u32 delay)
|
||||
}
|
||||
|
||||
/* Make sure connectd knows to try reconnecting. */
|
||||
if (connect)
|
||||
try_reconnect(peer, peer, delay, &peer->addr);
|
||||
|
||||
if (connect) {
|
||||
/* To delay, make it seem like we just connected. */
|
||||
if (delay > 0) {
|
||||
peer->reconnect_delay = delay;
|
||||
peer->last_connect_attempt = time_now();
|
||||
}
|
||||
try_reconnect(peer, peer, &peer->addr);
|
||||
}
|
||||
}
|
||||
|
||||
void setup_peers(struct lightningd *ld)
|
||||
|
@ -30,8 +30,10 @@ struct peer {
|
||||
/* Connection counter from connectd. */
|
||||
u64 connectd_counter;
|
||||
|
||||
/* Did we fail badly last time? Don't reconnect too fast. */
|
||||
bool delay_reconnect;
|
||||
/* Last reconnect: if it's recent, we delay by reconnect_delay,
|
||||
* doubling each time. */
|
||||
struct timeabs last_connect_attempt;
|
||||
u32 reconnect_delay;
|
||||
|
||||
/* Our channels */
|
||||
struct list_head channels;
|
||||
|
@ -764,7 +764,6 @@ u8 *towire_warningfmt(const tal_t *ctx UNNEEDED,
|
||||
/* Generated stub for try_reconnect */
|
||||
void try_reconnect(const tal_t *ctx UNNEEDED,
|
||||
struct peer *peer UNNEEDED,
|
||||
u32 seconds_delay UNNEEDED,
|
||||
const struct wireaddr_internal *addrhint UNNEEDED)
|
||||
{ fprintf(stderr, "try_reconnect called!\n"); abort(); }
|
||||
/* Generated stub for version */
|
||||
|
@ -3242,11 +3242,12 @@ def test_feerate_spam(node_factory, chainparams):
|
||||
l1.daemon.wait_for_log('peer_out WIRE_UPDATE_FEE', timeout=5)
|
||||
|
||||
|
||||
@pytest.mark.developer("need dev-feerate")
|
||||
@pytest.mark.developer("need dev-feerate, dev-fast-reconnect")
|
||||
def test_feerate_stress(node_factory, executor):
|
||||
# Third node makes HTLC traffic less predictable.
|
||||
l1, l2, l3 = node_factory.line_graph(3, opts={'commit-time': 100,
|
||||
'may_reconnect': True})
|
||||
'may_reconnect': True,
|
||||
'dev-fast-reconnect': None})
|
||||
|
||||
l1.pay(l2, 10**9 // 2)
|
||||
scid12 = l1.get_channel_scid(l2)
|
||||
|
@ -839,7 +839,6 @@ u8 *towire_warningfmt(const tal_t *ctx UNNEEDED,
|
||||
/* Generated stub for try_reconnect */
|
||||
void try_reconnect(const tal_t *ctx UNNEEDED,
|
||||
struct peer *peer UNNEEDED,
|
||||
u32 seconds_delay UNNEEDED,
|
||||
const struct wireaddr_internal *addrhint UNNEEDED)
|
||||
{ fprintf(stderr, "try_reconnect called!\n"); abort(); }
|
||||
/* Generated stub for watch_txid */
|
||||
|
Loading…
Reference in New Issue
Block a user