From 2255dd4dda5357993b568fc719ce02eb10b69576 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 1 Aug 2019 01:10:11 +0000 Subject: [PATCH] lightningd: avoid thundering herd on restart. The reason lnd was sending a sync error was that we were taking more than 30 seconds to send the channel_reestablish after connect. That's understandable on my test node under valgrind, but shouldn't happen normally. However, it seems it has happened at least once (see https://github.com/ElementsProject/lightning/issues/2847): space out startup so it's less likely to happen. Suggested-by: @cfromknecht Signed-off-by: Rusty Russell --- CHANGELOG.md | 1 + lightningd/peer_control.c | 29 ++++++++++++++++++++++------- 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 728634a4c..2a9806fe6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - plugins: a new boolean field is added to the `init`'s `configuration`, `startup`. It allows a plugin to know if it has been started on `lightningd` startup. - plugins: a new boolean field can be added to a plugin manifest, `dynamic`. It allows a plugin to tell if it can be started or stopped "on-the-fly". - lightningd: check bitcoind version when setup topology and confirm the version not older than v0.15.0. +- startup: space out reconnections on startup if we have more than 5 peers. ### Deprecated diff --git a/lightningd/peer_control.c b/lightningd/peer_control.c index d301aed0b..bfb3476bd 100644 --- a/lightningd/peer_control.c +++ b/lightningd/peer_control.c @@ -1303,7 +1303,7 @@ static const struct json_command close_command = { }; AUTODATA(json_command, &close_command); -static void activate_peer(struct peer *peer) +static void activate_peer(struct peer *peer, u32 delay) { u8 *msg; struct channel *channel; @@ -1313,10 +1313,21 @@ static void activate_peer(struct peer *peer) * knows to try reconnecting. 
*/ channel = peer_active_channel(peer); if (channel && ld->reconnect) { - msg = towire_connectctl_connect_to_peer(NULL, &peer->id, 0, - &peer->addr); - subd_send_msg(ld->connectd, take(msg)); - channel_set_billboard(channel, false, "Attempting to reconnect"); + if (delay > 0) { + channel_set_billboard(channel, false, + tal_fmt(tmpctx, + "Will attempt reconnect " + "in %u seconds", + delay)); + delay_then_reconnect(channel, delay, &peer->addr); + } else { + msg = towire_connectctl_connect_to_peer(NULL, + &peer->id, 0, + &peer->addr); + subd_send_msg(ld->connectd, take(msg)); + channel_set_billboard(channel, false, + "Attempting to reconnect"); + } } list_for_each(&peer->channels, channel, list) { @@ -1328,9 +1339,13 @@ static void activate_peer(struct peer *peer) void activate_peers(struct lightningd *ld) { struct peer *p; + /* Avoid thundering herd: after first five, delay by 1 second. */ + int delay = -5; - list_for_each(&ld->peers, p, list) - activate_peer(p); + list_for_each(&ld->peers, p, list) { + activate_peer(p, delay > 0 ? delay : 0); + delay++; + } } /* Pull peers, channels and HTLCs from db, and wire them up. */