mirror of
https://github.com/ACINQ/eclair.git
synced 2025-02-23 14:40:34 +01:00
Alternate strategy for unhandled exceptions (#2037)
This PR adds an alternate strategy for handling unhandled exceptions. The goal is to avoid an unnecessary mass force-close, but it is reserved for advanced users who closely monitor their node. Available strategies are: - local force-close of the channel (default) - log an error message and stop the node. The default settings maintain the same behavior as before.
This commit is contained in:
parent
2c0c24e1e1
commit
4f458d356c
6 changed files with 67 additions and 3 deletions
|
@ -4,6 +4,12 @@
|
|||
|
||||
## Major changes
|
||||
|
||||
### Alternate strategy to avoid mass force-close of channels in certain cases
|
||||
|
||||
The default strategy, when an unhandled exception or internal error happens, is to locally force-close the channel. Not only is there a delay before the channel balance gets refunded, but if the exception was due to some misconfiguration or bug in eclair that affects all channels, we risk force-closing all channels.
|
||||
|
||||
This is why an alternative behavior is to simply log an error and stop the node. Note that if you don't closely monitor your node, there is a risk that your peers take advantage of the downtime to try and cheat by publishing a revoked commitment. Additionally, while there is no known way of triggering an internal error in eclair from the outside, there may very well be a bug that allows just that, which could be used as a way to remotely stop the node (with the default behavior, it would "only" cause a local force-close of the channel).
|
||||
|
||||
### Separate log for important notifications
|
||||
|
||||
Eclair added a new log file (`notifications.log`) for important notifications that require an action from the node operator.
|
||||
|
|
|
@ -95,6 +95,16 @@ eclair {
|
|||
max-block-processing-delay = 30 seconds // we add a random delay before processing blocks, capped at this value, to prevent herd effect
|
||||
max-tx-publish-retry-delay = 60 seconds // we add a random delay before retrying failed transaction publication
|
||||
|
||||
// The default strategy, when we encounter an unhandled exception or internal error, is to locally force-close the
|
||||
// channel. Not only is there a delay before the channel balance gets refunded, but if the exception was due to some
|
||||
// misconfiguration or bug in eclair that affects all channels, we risk force-closing all channels.
|
||||
// This is why an alternative behavior is to simply log an error and stop the node. Note that if you don't closely
|
||||
// monitor your node, there is a risk that your peers take advantage of the downtime to try and cheat by publishing a
|
||||
// revoked commitment. Additionally, while there is no known way of triggering an internal error in eclair from the
|
||||
// outside, there may very well be a bug that allows just that, which could be used as a way to remotely stop the node
|
||||
// (with the default behavior, it would "only" cause a local force-close of the channel).
|
||||
unhandled-exception-strategy = "local-close" // local-close or stop
|
||||
|
||||
relay {
|
||||
fees {
|
||||
// Fees for public channels
|
||||
|
|
|
@ -22,6 +22,7 @@ import fr.acinq.bitcoin.{Block, ByteVector32, Crypto, Satoshi}
|
|||
import fr.acinq.eclair.Setup.Seeds
|
||||
import fr.acinq.eclair.blockchain.fee._
|
||||
import fr.acinq.eclair.channel.Channel
|
||||
import fr.acinq.eclair.channel.Channel.UnhandledExceptionStrategy
|
||||
import fr.acinq.eclair.crypto.Noise.KeyPair
|
||||
import fr.acinq.eclair.crypto.keymanager.{ChannelKeyManager, NodeKeyManager}
|
||||
import fr.acinq.eclair.db._
|
||||
|
@ -76,6 +77,7 @@ case class NodeParams(nodeKeyManager: NodeKeyManager,
|
|||
relayParams: RelayParams,
|
||||
reserveToFundingRatio: Double,
|
||||
maxReserveToFundingRatio: Double,
|
||||
unhandledExceptionStrategy: UnhandledExceptionStrategy,
|
||||
db: Databases,
|
||||
revocationTimeout: FiniteDuration,
|
||||
autoReconnect: Boolean,
|
||||
|
@ -357,6 +359,11 @@ object NodeParams extends Logging {
|
|||
PathFindingExperimentConf(experiments.toMap)
|
||||
}
|
||||
|
||||
val unhandledExceptionStrategy = config.getString("unhandled-exception-strategy") match {
|
||||
case "local-close" => UnhandledExceptionStrategy.LocalClose
|
||||
case "stop" => UnhandledExceptionStrategy.Stop
|
||||
}
|
||||
|
||||
val routerSyncEncodingType = config.getString("router.sync.encoding-type") match {
|
||||
case "uncompressed" => EncodingType.UNCOMPRESSED
|
||||
case "zlib" => EncodingType.COMPRESSED_ZLIB
|
||||
|
@ -423,6 +430,7 @@ object NodeParams extends Logging {
|
|||
),
|
||||
reserveToFundingRatio = config.getDouble("reserve-to-funding-ratio"),
|
||||
maxReserveToFundingRatio = config.getDouble("max-reserve-to-funding-ratio"),
|
||||
unhandledExceptionStrategy = unhandledExceptionStrategy,
|
||||
db = database,
|
||||
revocationTimeout = FiniteDuration(config.getDuration("revocation-timeout").getSeconds, TimeUnit.SECONDS),
|
||||
autoReconnect = config.getBoolean("auto-reconnect"),
|
||||
|
|
|
@ -127,6 +127,17 @@ object Channel {
|
|||
/** We don't immediately process [[CurrentBlockCount]] to avoid herd effects */
|
||||
case class ProcessCurrentBlockCount(c: CurrentBlockCount)
|
||||
|
||||
// @formatter:off
/** Strategy to apply when we encounter an unhandled exception or internal error. */
sealed trait UnhandledExceptionStrategy
object UnhandledExceptionStrategy {
  /**
   * Unilaterally force-close the channel by publishing our current commitment (an error is also sent to the peer).
   * This may be the best choice for smaller, loosely administered nodes.
   * Note: this is a *local* close — we do not ask our counterparty to close the channel.
   */
  case object LocalClose extends UnhandledExceptionStrategy
  /**
   * Just log an error and stop the node. May be better for larger nodes, to prevent an unwanted mass force-close,
   * but requires the node operator to closely monitor the node (peers could otherwise publish revoked commitments
   * during the downtime).
   */
  case object Stop extends UnhandledExceptionStrategy
}
// @formatter:on
|
||||
|
||||
}
|
||||
|
||||
class Channel(val nodeParams: NodeParams, val wallet: OnChainChannelFunder, remoteNodeId: PublicKey, blockchain: typed.ActorRef[ZmqWatcher.Command], relayer: ActorRef, txPublisherFactory: Channel.TxPublisherFactory, origin_opt: Option[ActorRef] = None)(implicit ec: ExecutionContext = ExecutionContext.Implicits.global) extends FSM[ChannelState, ChannelData] with FSMDiagnosticActorLogging[ChannelState, ChannelData] {
|
||||
|
@ -1669,6 +1680,7 @@ class Channel(val nodeParams: NodeParams, val wallet: OnChainChannelFunder, remo
|
|||
case syncSuccess: SyncResult.Success =>
|
||||
var sendQueue = Queue.empty[LightningMessage]
|
||||
// normal case, our data is up-to-date
|
||||
|
||||
if (channelReestablish.nextLocalCommitmentNumber == 1 && d.commitments.localCommit.index == 0) {
|
||||
// If next_local_commitment_number is 1 in both the channel_reestablish it sent and received, then the node MUST retransmit funding_locked, otherwise it MUST NOT
|
||||
log.debug("re-sending fundingLocked")
|
||||
|
@ -2288,7 +2300,8 @@ class Channel(val nodeParams: NodeParams, val wallet: OnChainChannelFunder, remo
|
|||
|
||||
private def handleLocalError(cause: Throwable, d: ChannelData, msg: Option[Any]) = {
|
||||
cause match {
|
||||
case _: ForcedLocalCommit => log.warning(s"force-closing channel at user request")
|
||||
case _: ForcedLocalCommit =>
|
||||
log.warning(s"force-closing channel at user request")
|
||||
case _ if msg.exists(_.isInstanceOf[OpenChannel]) || msg.exists(_.isInstanceOf[AcceptChannel]) =>
|
||||
// invalid remote channel parameters are logged as warning
|
||||
log.warning(s"${cause.getMessage} while processing msg=${msg.getOrElse("n/a").getClass.getSimpleName} in state=$stateName")
|
||||
|
@ -2308,7 +2321,31 @@ class Channel(val nodeParams: NodeParams, val wallet: OnChainChannelFunder, remo
|
|||
log.info(s"we have a valid closing tx, publishing it instead of our commitment: closingTxId=${bestUnpublishedClosingTx.tx.txid}")
|
||||
// if we were in the process of closing and already received a closing sig from the counterparty, it's always better to use that
|
||||
handleMutualClose(bestUnpublishedClosingTx, Left(negotiating))
|
||||
case dd: HasCommitments => spendLocalCurrent(dd) sending error // otherwise we use our current commitment
|
||||
case dd: HasCommitments =>
|
||||
cause match {
|
||||
case _: ChannelException =>
|
||||
// known channel exception: we force close using our current commitment
|
||||
spendLocalCurrent(dd) sending error
|
||||
case _ =>
|
||||
// unhandled exception: we apply the configured strategy
|
||||
nodeParams.unhandledExceptionStrategy match {
|
||||
case UnhandledExceptionStrategy.LocalClose =>
|
||||
spendLocalCurrent(dd) sending error
|
||||
case UnhandledExceptionStrategy.Stop =>
|
||||
log.error("unhandled exception: standard procedure would be to force-close the channel, but eclair has been configured to halt instead.")
|
||||
NotificationsLogger.logFatalError(
|
||||
s"""stopping node as configured strategy to unhandled exceptions for nodeId=$remoteNodeId channelId=${d.channelId}
|
||||
|
|
||||
|Eclair has been configured to shut down when an unhandled exception happens, instead of requesting a
|
||||
|force-close from the peer. This gives the operator a chance of avoiding an unnecessary mass force-close
|
||||
|of channels that may be caused by a bug in Eclair, or issues like running out of disk space, etc.
|
||||
|
|
||||
|You should get in touch with Eclair developers and provide logs of your node for analysis.
|
||||
|""".stripMargin, cause)
|
||||
System.exit(1)
|
||||
stop(FSM.Shutdown)
|
||||
}
|
||||
}
|
||||
case _ => goto(CLOSED) sending error // when there is no commitment yet, we just send an error to our peer and go to CLOSED state
|
||||
}
|
||||
}
|
||||
|
|
|
@ -324,7 +324,7 @@ object Helpers {
|
|||
case class LocalLateUnproven(ourRemoteCommitmentNumber: Long, theirLocalCommitmentNumber: Long) extends Failure
|
||||
case class RemoteLying(ourLocalCommitmentNumber: Long, theirRemoteCommitmentNumber: Long, invalidPerCommitmentSecret: PrivateKey) extends Failure
|
||||
case object RemoteLate extends Failure
|
||||
}
|
||||
}
|
||||
// @formatter:on
|
||||
|
||||
/**
|
||||
|
|
|
@ -20,6 +20,7 @@ import fr.acinq.bitcoin.{Block, ByteVector32, Satoshi, SatoshiLong, Script}
|
|||
import fr.acinq.eclair.FeatureSupport.{Mandatory, Optional}
|
||||
import fr.acinq.eclair.Features._
|
||||
import fr.acinq.eclair.blockchain.fee._
|
||||
import fr.acinq.eclair.channel.Channel.UnhandledExceptionStrategy
|
||||
import fr.acinq.eclair.channel.LocalParams
|
||||
import fr.acinq.eclair.crypto.keymanager.{LocalChannelKeyManager, LocalNodeKeyManager}
|
||||
import fr.acinq.eclair.io.{Peer, PeerConnection}
|
||||
|
@ -143,6 +144,7 @@ object TestConstants {
|
|||
feeProportionalMillionths = 30)),
|
||||
reserveToFundingRatio = 0.01, // note: not used (overridden below)
|
||||
maxReserveToFundingRatio = 0.05,
|
||||
unhandledExceptionStrategy = UnhandledExceptionStrategy.LocalClose,
|
||||
db = TestDatabases.inMemoryDb(),
|
||||
revocationTimeout = 20 seconds,
|
||||
autoReconnect = false,
|
||||
|
@ -269,6 +271,7 @@ object TestConstants {
|
|||
feeProportionalMillionths = 30)),
|
||||
reserveToFundingRatio = 0.01, // note: not used (overridden below)
|
||||
maxReserveToFundingRatio = 0.05,
|
||||
unhandledExceptionStrategy = UnhandledExceptionStrategy.LocalClose,
|
||||
db = TestDatabases.inMemoryDb(),
|
||||
revocationTimeout = 20 seconds,
|
||||
autoReconnect = false,
|
||||
|
|
Loading…
Add table
Reference in a new issue