1
0
Fork 0
mirror of https://github.com/ACINQ/eclair.git synced 2025-02-23 14:40:34 +01:00

Alternate strategy for unhandled exceptions (#2037)

This PRs adds an alternate strategy to handle unhandled exceptions. The goal is to avoid unnecessary mass force-close, but it is reserved to advanced users who closely monitor the node.

Available strategies are:
- local force close of the channel (default)
- log an error message and stop the node

Default settings maintain the same behavior as before.
This commit is contained in:
Pierre-Marie Padiou 2021-10-27 18:49:11 +02:00 committed by GitHub
parent 2c0c24e1e1
commit 4f458d356c
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 67 additions and 3 deletions

View file

@ -4,6 +4,12 @@
## Major changes
### Alternate strategy to avoid mass force-close of channels in certain cases
The default strategy, when an unhandled exception or internal error happens, is to locally force-close the channel. Not only is there a delay before the channel balance gets refunded, but if the exception was due to some misconfiguration or bug in eclair that affects all channels, we risk force-closing all channels.
This is why an alternative behavior is to simply log an error and stop the node. Note that if you don't closely monitor your node, there is a risk that your peers take advantage of the downtime to try and cheat by publishing a revoked commitment. Additionally, while there is no known way of triggering an internal error in eclair from the outside, there may very well be a bug that allows just that, which could be used as a way to remotely stop the node (with the default behavior, it would "only" cause a local force-close of the channel).
### Separate log for important notifications
Eclair added a new log file (`notifications.log`) for important notifications that require an action from the node operator.

View file

@ -95,6 +95,16 @@ eclair {
max-block-processing-delay = 30 seconds // we add a random delay before processing blocks, capped at this value, to prevent herd effect
max-tx-publish-retry-delay = 60 seconds // we add a random delay before retrying failed transaction publication
// The default strategy, when we encounter an unhandled exception or internal error, is to locally force-close the
// channel. Not only is there a delay before the channel balance gets refunded, but if the exception was due to some
// misconfiguration or bug in eclair that affects all channels, we risk force-closing all channels.
// This is why an alternative behavior is to simply log an error and stop the node. Note that if you don't closely
// monitor your node, there is a risk that your peers take advantage of the downtime to try and cheat by publishing a
// revoked commitment. Additionally, while there is no known way of triggering an internal error in eclair from the
// outside, there may very well be a bug that allows just that, which could be used as a way to remotely stop the node
// (with the default behavior, it would "only" cause a local force-close of the channel).
unhandled-exception-strategy = "local-close" // local-close or stop
relay {
fees {
// Fees for public channels

View file

@ -22,6 +22,7 @@ import fr.acinq.bitcoin.{Block, ByteVector32, Crypto, Satoshi}
import fr.acinq.eclair.Setup.Seeds
import fr.acinq.eclair.blockchain.fee._
import fr.acinq.eclair.channel.Channel
import fr.acinq.eclair.channel.Channel.UnhandledExceptionStrategy
import fr.acinq.eclair.crypto.Noise.KeyPair
import fr.acinq.eclair.crypto.keymanager.{ChannelKeyManager, NodeKeyManager}
import fr.acinq.eclair.db._
@ -76,6 +77,7 @@ case class NodeParams(nodeKeyManager: NodeKeyManager,
relayParams: RelayParams,
reserveToFundingRatio: Double,
maxReserveToFundingRatio: Double,
unhandledExceptionStrategy: UnhandledExceptionStrategy,
db: Databases,
revocationTimeout: FiniteDuration,
autoReconnect: Boolean,
@ -357,6 +359,11 @@ object NodeParams extends Logging {
PathFindingExperimentConf(experiments.toMap)
}
val unhandledExceptionStrategy = config.getString("unhandled-exception-strategy") match {
case "local-close" => UnhandledExceptionStrategy.LocalClose
case "stop" => UnhandledExceptionStrategy.Stop
}
val routerSyncEncodingType = config.getString("router.sync.encoding-type") match {
case "uncompressed" => EncodingType.UNCOMPRESSED
case "zlib" => EncodingType.COMPRESSED_ZLIB
@ -423,6 +430,7 @@ object NodeParams extends Logging {
),
reserveToFundingRatio = config.getDouble("reserve-to-funding-ratio"),
maxReserveToFundingRatio = config.getDouble("max-reserve-to-funding-ratio"),
unhandledExceptionStrategy = unhandledExceptionStrategy,
db = database,
revocationTimeout = FiniteDuration(config.getDuration("revocation-timeout").getSeconds, TimeUnit.SECONDS),
autoReconnect = config.getBoolean("auto-reconnect"),

View file

@ -127,6 +127,17 @@ object Channel {
/** We don't immediately process [[CurrentBlockCount]] to avoid herd effects */
case class ProcessCurrentBlockCount(c: CurrentBlockCount)
// @formatter:off
/** What do we do if we have a local unhandled exception. */
sealed trait UnhandledExceptionStrategy
object UnhandledExceptionStrategy {
/** Ask our counterparty to close the channel. This may be the best choice for smaller loosely administered nodes.*/
case object LocalClose extends UnhandledExceptionStrategy
/** Just log an error and stop the node. May be better for larger nodes, to prevent unwanted mass force-close.*/
case object Stop extends UnhandledExceptionStrategy
}
// @formatter:on
}
class Channel(val nodeParams: NodeParams, val wallet: OnChainChannelFunder, remoteNodeId: PublicKey, blockchain: typed.ActorRef[ZmqWatcher.Command], relayer: ActorRef, txPublisherFactory: Channel.TxPublisherFactory, origin_opt: Option[ActorRef] = None)(implicit ec: ExecutionContext = ExecutionContext.Implicits.global) extends FSM[ChannelState, ChannelData] with FSMDiagnosticActorLogging[ChannelState, ChannelData] {
@ -1669,6 +1680,7 @@ class Channel(val nodeParams: NodeParams, val wallet: OnChainChannelFunder, remo
case syncSuccess: SyncResult.Success =>
var sendQueue = Queue.empty[LightningMessage]
// normal case, our data is up-to-date
if (channelReestablish.nextLocalCommitmentNumber == 1 && d.commitments.localCommit.index == 0) {
// If next_local_commitment_number is 1 in both the channel_reestablish it sent and received, then the node MUST retransmit funding_locked, otherwise it MUST NOT
log.debug("re-sending fundingLocked")
@ -2288,7 +2300,8 @@ class Channel(val nodeParams: NodeParams, val wallet: OnChainChannelFunder, remo
private def handleLocalError(cause: Throwable, d: ChannelData, msg: Option[Any]) = {
cause match {
case _: ForcedLocalCommit => log.warning(s"force-closing channel at user request")
case _: ForcedLocalCommit =>
log.warning(s"force-closing channel at user request")
case _ if msg.exists(_.isInstanceOf[OpenChannel]) || msg.exists(_.isInstanceOf[AcceptChannel]) =>
// invalid remote channel parameters are logged as warning
log.warning(s"${cause.getMessage} while processing msg=${msg.getOrElse("n/a").getClass.getSimpleName} in state=$stateName")
@ -2308,7 +2321,31 @@ class Channel(val nodeParams: NodeParams, val wallet: OnChainChannelFunder, remo
log.info(s"we have a valid closing tx, publishing it instead of our commitment: closingTxId=${bestUnpublishedClosingTx.tx.txid}")
// if we were in the process of closing and already received a closing sig from the counterparty, it's always better to use that
handleMutualClose(bestUnpublishedClosingTx, Left(negotiating))
case dd: HasCommitments => spendLocalCurrent(dd) sending error // otherwise we use our current commitment
case dd: HasCommitments =>
cause match {
case _: ChannelException =>
// known channel exception: we force close using our current commitment
spendLocalCurrent(dd) sending error
case _ =>
// unhandled exception: we apply the configured strategy
nodeParams.unhandledExceptionStrategy match {
case UnhandledExceptionStrategy.LocalClose =>
spendLocalCurrent(dd) sending error
case UnhandledExceptionStrategy.Stop =>
log.error("unhandled exception: standard procedure would be to force-close the channel, but eclair has been configured to halt instead.")
NotificationsLogger.logFatalError(
s"""stopping node as configured strategy to unhandled exceptions for nodeId=$remoteNodeId channelId=${d.channelId}
|
|Eclair has been configured to shut down when an unhandled exception happens, instead of requesting a
|force-close from the peer. This gives the operator a chance of avoiding an unnecessary mass force-close
|of channels that may be caused by a bug in Eclair, or issues like running out of disk space, etc.
|
|You should get in touch with Eclair developers and provide logs of your node for analysis.
|""".stripMargin, cause)
System.exit(1)
stop(FSM.Shutdown)
}
}
case _ => goto(CLOSED) sending error // when there is no commitment yet, we just send an error to our peer and go to CLOSED state
}
}

View file

@ -324,7 +324,7 @@ object Helpers {
case class LocalLateUnproven(ourRemoteCommitmentNumber: Long, theirLocalCommitmentNumber: Long) extends Failure
case class RemoteLying(ourLocalCommitmentNumber: Long, theirRemoteCommitmentNumber: Long, invalidPerCommitmentSecret: PrivateKey) extends Failure
case object RemoteLate extends Failure
}
}
// @formatter:on
/**

View file

@ -20,6 +20,7 @@ import fr.acinq.bitcoin.{Block, ByteVector32, Satoshi, SatoshiLong, Script}
import fr.acinq.eclair.FeatureSupport.{Mandatory, Optional}
import fr.acinq.eclair.Features._
import fr.acinq.eclair.blockchain.fee._
import fr.acinq.eclair.channel.Channel.UnhandledExceptionStrategy
import fr.acinq.eclair.channel.LocalParams
import fr.acinq.eclair.crypto.keymanager.{LocalChannelKeyManager, LocalNodeKeyManager}
import fr.acinq.eclair.io.{Peer, PeerConnection}
@ -143,6 +144,7 @@ object TestConstants {
feeProportionalMillionths = 30)),
reserveToFundingRatio = 0.01, // note: not used (overridden below)
maxReserveToFundingRatio = 0.05,
unhandledExceptionStrategy = UnhandledExceptionStrategy.LocalClose,
db = TestDatabases.inMemoryDb(),
revocationTimeout = 20 seconds,
autoReconnect = false,
@ -269,6 +271,7 @@ object TestConstants {
feeProportionalMillionths = 30)),
reserveToFundingRatio = 0.01, // note: not used (overridden below)
maxReserveToFundingRatio = 0.05,
unhandledExceptionStrategy = UnhandledExceptionStrategy.LocalClose,
db = TestDatabases.inMemoryDb(),
revocationTimeout = 20 seconds,
autoReconnect = false,