diff --git a/docs/release-notes/eclair-vnext.md b/docs/release-notes/eclair-vnext.md index 8dcdec093..6b9a0b317 100644 --- a/docs/release-notes/eclair-vnext.md +++ b/docs/release-notes/eclair-vnext.md @@ -4,6 +4,12 @@ ## Major changes +### Alternate strategy to avoid mass force-close of channels in certain cases + +The default strategy, when an unhandled exception or internal error happens, is to locally force-close the channel. Not only is there a delay before the channel balance gets refunded, but if the exception was due to some misconfiguration or bug in eclair that affects all channels, we risk force-closing all channels. + +This is why an alternative behavior is to simply log an error and stop the node. Note that if you don't closely monitor your node, there is a risk that your peers take advantage of the downtime to try and cheat by publishing a revoked commitment. Additionally, while there is no known way of triggering an internal error in eclair from the outside, there may very well be a bug that allows just that, which could be used as a way to remotely stop the node (with the default behavior, it would "only" cause a local force-close of the channel). + ### Separate log for important notifications Eclair added a new log file (`notifications.log`) for important notifications that require an action from the node operator. 
diff --git a/eclair-core/src/main/resources/reference.conf b/eclair-core/src/main/resources/reference.conf index 6184114cd..3ee73e2c7 100644 --- a/eclair-core/src/main/resources/reference.conf +++ b/eclair-core/src/main/resources/reference.conf @@ -95,6 +95,16 @@ eclair { max-block-processing-delay = 30 seconds // we add a random delay before processing blocks, capped at this value, to prevent herd effect max-tx-publish-retry-delay = 60 seconds // we add a random delay before retrying failed transaction publication + // The default strategy, when we encounter an unhandled exception or internal error, is to locally force-close the + // channel. Not only is there a delay before the channel balance gets refunded, but if the exception was due to some + // misconfiguration or bug in eclair that affects all channels, we risk force-closing all channels. + // This is why an alternative behavior is to simply log an error and stop the node. Note that if you don't closely + // monitor your node, there is a risk that your peers take advantage of the downtime to try and cheat by publishing a + // revoked commitment. Additionally, while there is no known way of triggering an internal error in eclair from the + // outside, there may very well be a bug that allows just that, which could be used as a way to remotely stop the node + // (with the default behavior, it would "only" cause a local force-close of the channel). 
+ unhandled-exception-strategy = "local-close" // local-close or stop + relay { fees { // Fees for public channels diff --git a/eclair-core/src/main/scala/fr/acinq/eclair/NodeParams.scala b/eclair-core/src/main/scala/fr/acinq/eclair/NodeParams.scala index 37fff15cc..0de805f9c 100644 --- a/eclair-core/src/main/scala/fr/acinq/eclair/NodeParams.scala +++ b/eclair-core/src/main/scala/fr/acinq/eclair/NodeParams.scala @@ -22,6 +22,7 @@ import fr.acinq.bitcoin.{Block, ByteVector32, Crypto, Satoshi} import fr.acinq.eclair.Setup.Seeds import fr.acinq.eclair.blockchain.fee._ import fr.acinq.eclair.channel.Channel +import fr.acinq.eclair.channel.Channel.UnhandledExceptionStrategy import fr.acinq.eclair.crypto.Noise.KeyPair import fr.acinq.eclair.crypto.keymanager.{ChannelKeyManager, NodeKeyManager} import fr.acinq.eclair.db._ @@ -76,6 +77,7 @@ case class NodeParams(nodeKeyManager: NodeKeyManager, relayParams: RelayParams, reserveToFundingRatio: Double, maxReserveToFundingRatio: Double, + unhandledExceptionStrategy: UnhandledExceptionStrategy, db: Databases, revocationTimeout: FiniteDuration, autoReconnect: Boolean, @@ -357,6 +359,11 @@ object NodeParams extends Logging { PathFindingExperimentConf(experiments.toMap) } + val unhandledExceptionStrategy = config.getString("unhandled-exception-strategy") match { + case "local-close" => UnhandledExceptionStrategy.LocalClose + case "stop" => UnhandledExceptionStrategy.Stop + } + val routerSyncEncodingType = config.getString("router.sync.encoding-type") match { case "uncompressed" => EncodingType.UNCOMPRESSED case "zlib" => EncodingType.COMPRESSED_ZLIB @@ -423,6 +430,7 @@ object NodeParams extends Logging { ), reserveToFundingRatio = config.getDouble("reserve-to-funding-ratio"), maxReserveToFundingRatio = config.getDouble("max-reserve-to-funding-ratio"), + unhandledExceptionStrategy = unhandledExceptionStrategy, db = database, revocationTimeout = FiniteDuration(config.getDuration("revocation-timeout").getSeconds, TimeUnit.SECONDS), 
autoReconnect = config.getBoolean("auto-reconnect"), diff --git a/eclair-core/src/main/scala/fr/acinq/eclair/channel/Channel.scala b/eclair-core/src/main/scala/fr/acinq/eclair/channel/Channel.scala index a52a11219..c25dae5c1 100644 --- a/eclair-core/src/main/scala/fr/acinq/eclair/channel/Channel.scala +++ b/eclair-core/src/main/scala/fr/acinq/eclair/channel/Channel.scala @@ -127,6 +127,17 @@ object Channel { /** We don't immediately process [[CurrentBlockCount]] to avoid herd effects */ case class ProcessCurrentBlockCount(c: CurrentBlockCount) + // @formatter:off + /** What do we do if we have a local unhandled exception. */ + sealed trait UnhandledExceptionStrategy + object UnhandledExceptionStrategy { + /** Locally force-close the channel by publishing our current commitment. This may be the best choice for smaller loosely administered nodes.*/ + case object LocalClose extends UnhandledExceptionStrategy + /** Just log an error and stop the node. May be better for larger nodes, to prevent unwanted mass force-close.*/ + case object Stop extends UnhandledExceptionStrategy + } + // @formatter:on + } class Channel(val nodeParams: NodeParams, val wallet: OnChainChannelFunder, remoteNodeId: PublicKey, blockchain: typed.ActorRef[ZmqWatcher.Command], relayer: ActorRef, txPublisherFactory: Channel.TxPublisherFactory, origin_opt: Option[ActorRef] = None)(implicit ec: ExecutionContext = ExecutionContext.Implicits.global) extends FSM[ChannelState, ChannelData] with FSMDiagnosticActorLogging[ChannelState, ChannelData] { @@ -1669,6 +1680,7 @@ class Channel(val nodeParams: NodeParams, val wallet: OnChainChannelFunder, remo case syncSuccess: SyncResult.Success => var sendQueue = Queue.empty[LightningMessage] // normal case, our data is up-to-date + if (channelReestablish.nextLocalCommitmentNumber == 1 && d.commitments.localCommit.index == 0) { // If next_local_commitment_number is 1 in both the channel_reestablish it sent and received, then the node MUST retransmit funding_locked, otherwise it MUST NOT
log.debug("re-sending fundingLocked") @@ -2288,7 +2300,8 @@ class Channel(val nodeParams: NodeParams, val wallet: OnChainChannelFunder, remo private def handleLocalError(cause: Throwable, d: ChannelData, msg: Option[Any]) = { cause match { - case _: ForcedLocalCommit => log.warning(s"force-closing channel at user request") + case _: ForcedLocalCommit => + log.warning(s"force-closing channel at user request") case _ if msg.exists(_.isInstanceOf[OpenChannel]) || msg.exists(_.isInstanceOf[AcceptChannel]) => // invalid remote channel parameters are logged as warning log.warning(s"${cause.getMessage} while processing msg=${msg.getOrElse("n/a").getClass.getSimpleName} in state=$stateName") @@ -2308,7 +2321,31 @@ class Channel(val nodeParams: NodeParams, val wallet: OnChainChannelFunder, remo log.info(s"we have a valid closing tx, publishing it instead of our commitment: closingTxId=${bestUnpublishedClosingTx.tx.txid}") // if we were in the process of closing and already received a closing sig from the counterparty, it's always better to use that handleMutualClose(bestUnpublishedClosingTx, Left(negotiating)) - case dd: HasCommitments => spendLocalCurrent(dd) sending error // otherwise we use our current commitment + case dd: HasCommitments => + cause match { + case _: ChannelException => + // known channel exception: we force close using our current commitment + spendLocalCurrent(dd) sending error + case _ => + // unhandled exception: we apply the configured strategy + nodeParams.unhandledExceptionStrategy match { + case UnhandledExceptionStrategy.LocalClose => + spendLocalCurrent(dd) sending error + case UnhandledExceptionStrategy.Stop => + log.error("unhandled exception: standard procedure would be to force-close the channel, but eclair has been configured to halt instead.") + NotificationsLogger.logFatalError( + s"""stopping node as configured strategy to unhandled exceptions for nodeId=$remoteNodeId channelId=${d.channelId} + | + |Eclair has been configured to shut 
down when an unhandled exception happens, instead of requesting a + |force-close from the peer. This gives the operator a chance of avoiding an unnecessary mass force-close + |of channels that may be caused by a bug in Eclair, or issues like running out of disk space, etc. + | + |You should get in touch with Eclair developers and provide logs of your node for analysis. + |""".stripMargin, cause) + System.exit(1) + stop(FSM.Shutdown) + } + } case _ => goto(CLOSED) sending error // when there is no commitment yet, we just send an error to our peer and go to CLOSED state } } diff --git a/eclair-core/src/main/scala/fr/acinq/eclair/channel/Helpers.scala b/eclair-core/src/main/scala/fr/acinq/eclair/channel/Helpers.scala index 497bbd767..fbbea6eb7 100644 --- a/eclair-core/src/main/scala/fr/acinq/eclair/channel/Helpers.scala +++ b/eclair-core/src/main/scala/fr/acinq/eclair/channel/Helpers.scala @@ -324,7 +324,7 @@ object Helpers { case class LocalLateUnproven(ourRemoteCommitmentNumber: Long, theirLocalCommitmentNumber: Long) extends Failure case class RemoteLying(ourLocalCommitmentNumber: Long, theirRemoteCommitmentNumber: Long, invalidPerCommitmentSecret: PrivateKey) extends Failure case object RemoteLate extends Failure - } + } // @formatter:on /** diff --git a/eclair-core/src/test/scala/fr/acinq/eclair/TestConstants.scala b/eclair-core/src/test/scala/fr/acinq/eclair/TestConstants.scala index daf8ab41d..832959837 100644 --- a/eclair-core/src/test/scala/fr/acinq/eclair/TestConstants.scala +++ b/eclair-core/src/test/scala/fr/acinq/eclair/TestConstants.scala @@ -20,6 +20,7 @@ import fr.acinq.bitcoin.{Block, ByteVector32, Satoshi, SatoshiLong, Script} import fr.acinq.eclair.FeatureSupport.{Mandatory, Optional} import fr.acinq.eclair.Features._ import fr.acinq.eclair.blockchain.fee._ +import fr.acinq.eclair.channel.Channel.UnhandledExceptionStrategy import fr.acinq.eclair.channel.LocalParams import fr.acinq.eclair.crypto.keymanager.{LocalChannelKeyManager, 
LocalNodeKeyManager} import fr.acinq.eclair.io.{Peer, PeerConnection} @@ -143,6 +144,7 @@ object TestConstants { feeProportionalMillionths = 30)), reserveToFundingRatio = 0.01, // note: not used (overridden below) maxReserveToFundingRatio = 0.05, + unhandledExceptionStrategy = UnhandledExceptionStrategy.LocalClose, db = TestDatabases.inMemoryDb(), revocationTimeout = 20 seconds, autoReconnect = false, @@ -269,6 +271,7 @@ object TestConstants { feeProportionalMillionths = 30)), reserveToFundingRatio = 0.01, // note: not used (overridden below) maxReserveToFundingRatio = 0.05, + unhandledExceptionStrategy = UnhandledExceptionStrategy.LocalClose, db = TestDatabases.inMemoryDb(), revocationTimeout = 20 seconds, autoReconnect = false,