mirror of
https://github.com/ACINQ/eclair.git
synced 2025-02-23 14:40:34 +01:00
Alternate strategy for unhandled exceptions (#2037)
This PR adds an alternate strategy for handling unhandled exceptions. The goal is to avoid an unnecessary mass force-close, but it is reserved for advanced users who closely monitor their node. Available strategies are: - local force-close of the channel (default) - log an error message and stop the node. The default settings maintain the same behavior as before.
This commit is contained in:
parent
2c0c24e1e1
commit
4f458d356c
6 changed files with 67 additions and 3 deletions
|
@ -4,6 +4,12 @@
|
|||
|
||||
## Major changes
|
||||
|
||||
### Alternate strategy to avoid mass force-close of channels in certain cases
|
||||
|
||||
The default strategy, when an unhandled exception or internal error happens, is to locally force-close the channel. Not only is there a delay before the channel balance gets refunded, but if the exception was due to some misconfiguration or bug in eclair that affects all channels, we risk force-closing all channels.
|
||||
|
||||
This is why an alternative behavior is to simply log an error and stop the node. Note that if you don't closely monitor your node, there is a risk that your peers take advantage of the downtime to try and cheat by publishing a revoked commitment. Additionally, while there is no known way of triggering an internal error in eclair from the outside, there may very well be a bug that allows just that, which could be used as a way to remotely stop the node (with the default behavior, it would "only" cause a local force-close of the channel).
|
||||
|
||||
### Separate log for important notifications
|
||||
|
||||
Eclair added a new log file (`notifications.log`) for important notifications that require an action from the node operator.
|
||||
|
|
|
@ -95,6 +95,16 @@ eclair {
|
|||
max-block-processing-delay = 30 seconds // we add a random delay before processing blocks, capped at this value, to prevent herd effect
|
||||
max-tx-publish-retry-delay = 60 seconds // we add a random delay before retrying failed transaction publication
|
||||
|
||||
// The default strategy, when we encounter an unhandled exception or internal error, is to locally force-close the
|
||||
// channel. Not only is there a delay before the channel balance gets refunded, but if the exception was due to some
|
||||
// misconfiguration or bug in eclair that affects all channels, we risk force-closing all channels.
|
||||
// This is why an alternative behavior is to simply log an error and stop the node. Note that if you don't closely
|
||||
// monitor your node, there is a risk that your peers take advantage of the downtime to try and cheat by publishing a
|
||||
// revoked commitment. Additionally, while there is no known way of triggering an internal error in eclair from the
|
||||
// outside, there may very well be a bug that allows just that, which could be used as a way to remotely stop the node
|
||||
// (with the default behavior, it would "only" cause a local force-close of the channel).
|
||||
unhandled-exception-strategy = "local-close" // local-close or stop
|
||||
|
||||
relay {
|
||||
fees {
|
||||
// Fees for public channels
|
||||
|
|
|
@ -22,6 +22,7 @@ import fr.acinq.bitcoin.{Block, ByteVector32, Crypto, Satoshi}
|
|||
import fr.acinq.eclair.Setup.Seeds
|
||||
import fr.acinq.eclair.blockchain.fee._
|
||||
import fr.acinq.eclair.channel.Channel
|
||||
import fr.acinq.eclair.channel.Channel.UnhandledExceptionStrategy
|
||||
import fr.acinq.eclair.crypto.Noise.KeyPair
|
||||
import fr.acinq.eclair.crypto.keymanager.{ChannelKeyManager, NodeKeyManager}
|
||||
import fr.acinq.eclair.db._
|
||||
|
@ -76,6 +77,7 @@ case class NodeParams(nodeKeyManager: NodeKeyManager,
|
|||
relayParams: RelayParams,
|
||||
reserveToFundingRatio: Double,
|
||||
maxReserveToFundingRatio: Double,
|
||||
unhandledExceptionStrategy: UnhandledExceptionStrategy,
|
||||
db: Databases,
|
||||
revocationTimeout: FiniteDuration,
|
||||
autoReconnect: Boolean,
|
||||
|
@ -357,6 +359,11 @@ object NodeParams extends Logging {
|
|||
PathFindingExperimentConf(experiments.toMap)
|
||||
}
|
||||
|
||||
val unhandledExceptionStrategy = config.getString("unhandled-exception-strategy") match {
|
||||
case "local-close" => UnhandledExceptionStrategy.LocalClose
|
||||
case "stop" => UnhandledExceptionStrategy.Stop
|
||||
}
|
||||
|
||||
val routerSyncEncodingType = config.getString("router.sync.encoding-type") match {
|
||||
case "uncompressed" => EncodingType.UNCOMPRESSED
|
||||
case "zlib" => EncodingType.COMPRESSED_ZLIB
|
||||
|
@ -423,6 +430,7 @@ object NodeParams extends Logging {
|
|||
),
|
||||
reserveToFundingRatio = config.getDouble("reserve-to-funding-ratio"),
|
||||
maxReserveToFundingRatio = config.getDouble("max-reserve-to-funding-ratio"),
|
||||
unhandledExceptionStrategy = unhandledExceptionStrategy,
|
||||
db = database,
|
||||
revocationTimeout = FiniteDuration(config.getDuration("revocation-timeout").getSeconds, TimeUnit.SECONDS),
|
||||
autoReconnect = config.getBoolean("auto-reconnect"),
|
||||
|
|
|
@ -127,6 +127,17 @@ object Channel {
|
|||
/** We don't immediately process [[CurrentBlockCount]] to avoid herd effects */
|
||||
case class ProcessCurrentBlockCount(c: CurrentBlockCount)
|
||||
|
||||
// @formatter:off
/** Strategy to apply when we encounter an unhandled exception or internal error. */
sealed trait UnhandledExceptionStrategy
object UnhandledExceptionStrategy {
  /**
   * Unilaterally force-close the channel by publishing our current commitment (an error is also sent to the peer).
   * This may be the best choice for smaller, loosely administered nodes.
   * Note: this is a *local* close — we do not ask our counterparty to close the channel.
   */
  case object LocalClose extends UnhandledExceptionStrategy
  /**
   * Just log an error and stop the node. May be better for larger nodes, to prevent an unwanted mass force-close,
   * but requires the node operator to closely monitor the node (peers could otherwise publish revoked commitments
   * during the downtime).
   */
  case object Stop extends UnhandledExceptionStrategy
}
// @formatter:on
|
||||
|
||||
}
|
||||
|
||||
class Channel(val nodeParams: NodeParams, val wallet: OnChainChannelFunder, remoteNodeId: PublicKey, blockchain: typed.ActorRef[ZmqWatcher.Command], relayer: ActorRef, txPublisherFactory: Channel.TxPublisherFactory, origin_opt: Option[ActorRef] = None)(implicit ec: ExecutionContext = ExecutionContext.Implicits.global) extends FSM[ChannelState, ChannelData] with FSMDiagnosticActorLogging[ChannelState, ChannelData] {
|
||||
|
@ -1669,6 +1680,7 @@ class Channel(val nodeParams: NodeParams, val wallet: OnChainChannelFunder, remo
|
|||
case syncSuccess: SyncResult.Success =>
|
||||
var sendQueue = Queue.empty[LightningMessage]
|
||||
// normal case, our data is up-to-date
|
||||
|
||||
if (channelReestablish.nextLocalCommitmentNumber == 1 && d.commitments.localCommit.index == 0) {
|
||||
// If next_local_commitment_number is 1 in both the channel_reestablish it sent and received, then the node MUST retransmit funding_locked, otherwise it MUST NOT
|
||||
log.debug("re-sending fundingLocked")
|
||||
|
@ -2288,7 +2300,8 @@ class Channel(val nodeParams: NodeParams, val wallet: OnChainChannelFunder, remo
|
|||
|
||||
private def handleLocalError(cause: Throwable, d: ChannelData, msg: Option[Any]) = {
|
||||
cause match {
|
||||
case _: ForcedLocalCommit => log.warning(s"force-closing channel at user request")
|
||||
case _: ForcedLocalCommit =>
|
||||
log.warning(s"force-closing channel at user request")
|
||||
case _ if msg.exists(_.isInstanceOf[OpenChannel]) || msg.exists(_.isInstanceOf[AcceptChannel]) =>
|
||||
// invalid remote channel parameters are logged as warning
|
||||
log.warning(s"${cause.getMessage} while processing msg=${msg.getOrElse("n/a").getClass.getSimpleName} in state=$stateName")
|
||||
|
@ -2308,7 +2321,31 @@ class Channel(val nodeParams: NodeParams, val wallet: OnChainChannelFunder, remo
|
|||
log.info(s"we have a valid closing tx, publishing it instead of our commitment: closingTxId=${bestUnpublishedClosingTx.tx.txid}")
|
||||
// if we were in the process of closing and already received a closing sig from the counterparty, it's always better to use that
|
||||
handleMutualClose(bestUnpublishedClosingTx, Left(negotiating))
|
||||
case dd: HasCommitments => spendLocalCurrent(dd) sending error // otherwise we use our current commitment
|
||||
case dd: HasCommitments =>
|
||||
cause match {
|
||||
case _: ChannelException =>
|
||||
// known channel exception: we force close using our current commitment
|
||||
spendLocalCurrent(dd) sending error
|
||||
case _ =>
|
||||
// unhandled exception: we apply the configured strategy
|
||||
nodeParams.unhandledExceptionStrategy match {
|
||||
case UnhandledExceptionStrategy.LocalClose =>
|
||||
spendLocalCurrent(dd) sending error
|
||||
case UnhandledExceptionStrategy.Stop =>
|
||||
log.error("unhandled exception: standard procedure would be to force-close the channel, but eclair has been configured to halt instead.")
|
||||
NotificationsLogger.logFatalError(
|
||||
s"""stopping node as configured strategy to unhandled exceptions for nodeId=$remoteNodeId channelId=${d.channelId}
|
||||
|
|
||||
|Eclair has been configured to shut down when an unhandled exception happens, instead of requesting a
|
||||
|force-close from the peer. This gives the operator a chance of avoiding an unnecessary mass force-close
|
||||
|of channels that may be caused by a bug in Eclair, or issues like running out of disk space, etc.
|
||||
|
|
||||
|You should get in touch with Eclair developers and provide logs of your node for analysis.
|
||||
|""".stripMargin, cause)
|
||||
System.exit(1)
|
||||
stop(FSM.Shutdown)
|
||||
}
|
||||
}
|
||||
case _ => goto(CLOSED) sending error // when there is no commitment yet, we just send an error to our peer and go to CLOSED state
|
||||
}
|
||||
}
|
||||
|
|
|
@ -324,7 +324,7 @@ object Helpers {
|
|||
case class LocalLateUnproven(ourRemoteCommitmentNumber: Long, theirLocalCommitmentNumber: Long) extends Failure
|
||||
case class RemoteLying(ourLocalCommitmentNumber: Long, theirRemoteCommitmentNumber: Long, invalidPerCommitmentSecret: PrivateKey) extends Failure
|
||||
case object RemoteLate extends Failure
|
||||
}
|
||||
}
|
||||
// @formatter:on
|
||||
|
||||
/**
|
||||
|
|
|
@ -20,6 +20,7 @@ import fr.acinq.bitcoin.{Block, ByteVector32, Satoshi, SatoshiLong, Script}
|
|||
import fr.acinq.eclair.FeatureSupport.{Mandatory, Optional}
|
||||
import fr.acinq.eclair.Features._
|
||||
import fr.acinq.eclair.blockchain.fee._
|
||||
import fr.acinq.eclair.channel.Channel.UnhandledExceptionStrategy
|
||||
import fr.acinq.eclair.channel.LocalParams
|
||||
import fr.acinq.eclair.crypto.keymanager.{LocalChannelKeyManager, LocalNodeKeyManager}
|
||||
import fr.acinq.eclair.io.{Peer, PeerConnection}
|
||||
|
@ -143,6 +144,7 @@ object TestConstants {
|
|||
feeProportionalMillionths = 30)),
|
||||
reserveToFundingRatio = 0.01, // note: not used (overridden below)
|
||||
maxReserveToFundingRatio = 0.05,
|
||||
unhandledExceptionStrategy = UnhandledExceptionStrategy.LocalClose,
|
||||
db = TestDatabases.inMemoryDb(),
|
||||
revocationTimeout = 20 seconds,
|
||||
autoReconnect = false,
|
||||
|
@ -269,6 +271,7 @@ object TestConstants {
|
|||
feeProportionalMillionths = 30)),
|
||||
reserveToFundingRatio = 0.01, // note: not used (overridden below)
|
||||
maxReserveToFundingRatio = 0.05,
|
||||
unhandledExceptionStrategy = UnhandledExceptionStrategy.LocalClose,
|
||||
db = TestDatabases.inMemoryDb(),
|
||||
revocationTimeout = 20 seconds,
|
||||
autoReconnect = false,
|
||||
|
|
Loading…
Add table
Reference in a new issue