From e1c48ebda1ae88d0a38f5cae8d3394420e1465cc Mon Sep 17 00:00:00 2001 From: Pierre-Marie Padiou Date: Wed, 11 Dec 2019 14:16:48 +0100 Subject: [PATCH] Better randomization of reconnection delay (#1250) Randomization is necessary, otherwise if two peers attempt to reconnect to each other in a synchronized fashion, they will enter in a disconnect-reconnect loop. We already had randomization for the initial reconnection attempt, but further reconnection attempts were using a deterministic schedule following an exponential backoff curve. Fixes #1238. --- eclair-core/src/main/scala/fr/acinq/eclair/io/Peer.scala | 9 +++++++-- .../src/test/scala/fr/acinq/eclair/io/PeerSpec.scala | 7 ++++--- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/eclair-core/src/main/scala/fr/acinq/eclair/io/Peer.scala b/eclair-core/src/main/scala/fr/acinq/eclair/io/Peer.scala index 7f16e55de..c721d7e00 100644 --- a/eclair-core/src/main/scala/fr/acinq/eclair/io/Peer.scala +++ b/eclair-core/src/main/scala/fr/acinq/eclair/io/Peer.scala @@ -534,7 +534,7 @@ class Peer(val nodeParams: NodeParams, remoteNodeId: PublicKey, authenticator: A */ onTransition { case INSTANTIATING -> DISCONNECTED => () - case _ -> DISCONNECTED if nodeParams.autoReconnect => setTimer(RECONNECT_TIMER, Reconnect, Random.nextInt(nodeParams.initialRandomReconnectDelay.toMillis.toInt).millis, repeat = false) // we add some randomization to not have peers reconnect to each other exactly at the same time + case _ -> DISCONNECTED if nodeParams.autoReconnect => setTimer(RECONNECT_TIMER, Reconnect, randomizeDelay(nodeParams.initialRandomReconnectDelay), repeat = false) // we add some randomization to not have peers reconnect to each other exactly at the same time case DISCONNECTED -> _ if nodeParams.autoReconnect => cancelTimer(RECONNECT_TIMER) } @@ -615,7 +615,7 @@ object Peer { def channels: Map[_ <: ChannelId, ActorRef] // will be overridden by Map[FinalChannelId, ActorRef] or Map[ChannelId, ActorRef] } case class Nothing() extends Data { override def address_opt = None; override def channels = Map.empty } - case class DisconnectedData(address_opt: Option[InetSocketAddress], channels: Map[FinalChannelId, ActorRef], nextReconnectionDelay: FiniteDuration = 10 seconds) extends Data + case class DisconnectedData(address_opt: Option[InetSocketAddress], channels: Map[FinalChannelId, ActorRef], nextReconnectionDelay: FiniteDuration = randomizeDelay(10 seconds)) extends Data case class InitializingData(address_opt: Option[InetSocketAddress], transport: ActorRef, channels: Map[FinalChannelId, ActorRef], origin_opt: Option[ActorRef], localInit: wire.Init) extends Data case class ConnectedData(address_opt: Option[InetSocketAddress], transport: ActorRef, localInit: wire.Init, remoteInit: wire.Init, channels: Map[ChannelId, ActorRef], rebroadcastDelay: FiniteDuration, gossipTimestampFilter: Option[GossipTimestampFilter] = None, behavior: Behavior = Behavior(), expectedPong_opt: Option[ExpectedPong] = None) extends Data case class ExpectedPong(ping: Ping, timestamp: Long = Platform.currentTime) @@ -708,6 +708,11 @@ object Peer { def hostAndPort2InetSocketAddress(hostAndPort: HostAndPort): InetSocketAddress = new InetSocketAddress(hostAndPort.getHost, hostAndPort.getPort) + /** + * This helps preventing peers reconnection loops due to synchronization of reconnection attempts. + */ + def randomizeDelay(initialRandomReconnectDelay: FiniteDuration): FiniteDuration = Random.nextInt(initialRandomReconnectDelay.toMillis.toInt).millis + /** * Exponential backoff retry with a finite max */ diff --git a/eclair-core/src/test/scala/fr/acinq/eclair/io/PeerSpec.scala b/eclair-core/src/test/scala/fr/acinq/eclair/io/PeerSpec.scala index d889d6c9d..261b82162 100644 --- a/eclair-core/src/test/scala/fr/acinq/eclair/io/PeerSpec.scala +++ b/eclair-core/src/test/scala/fr/acinq/eclair/io/PeerSpec.scala @@ -198,11 +198,12 @@ class PeerSpec extends TestkitBaseClass with StateTestsHelperMethods { connect(remoteNodeId, authenticator, watcher, router, relayer, connection, transport, peer, channels = Set(ChannelCodecsSpec.normal)) probe.send(transport.ref, PoisonPill) awaitCond(peer.stateName === DISCONNECTED) - assert(peer.stateData.asInstanceOf[DisconnectedData].nextReconnectionDelay === (10 seconds)) + val initialReconnectDelay = peer.stateData.asInstanceOf[DisconnectedData].nextReconnectionDelay + assert(initialReconnectDelay <= (10 seconds)) probe.send(peer, Reconnect) - assert(peer.stateData.asInstanceOf[DisconnectedData].nextReconnectionDelay === (20 seconds)) + assert(peer.stateData.asInstanceOf[DisconnectedData].nextReconnectionDelay === (initialReconnectDelay * 2)) probe.send(peer, Reconnect) - assert(peer.stateData.asInstanceOf[DisconnectedData].nextReconnectionDelay === (40 seconds)) + assert(peer.stateData.asInstanceOf[DisconnectedData].nextReconnectionDelay === (initialReconnectDelay * 4)) } test("disconnect if incompatible features") { f =>