mirror of
https://github.com/ACINQ/eclair.git
synced 2025-02-22 22:25:26 +01:00
Postgres: add safety checks at startup (#2140)
When using postgres, at startup we optionally run a few basic safety checks, e.g. the number of local channels, how long since the last local channel update, etc. The goal is to make sure that we are connected to the correct database instance.
This commit is contained in:
parent
f8d507bbdd
commit
c180ca2ef1
5 changed files with 222 additions and 5 deletions
|
@ -75,6 +75,28 @@ This is particularly useful for payment hubs that generate a lot of invoices (e.
|
|||
Eclair includes a small `payment_metadata` field in all invoices it generates.
|
||||
This lets node operators verify that payers actually support that feature.
|
||||
|
||||
### Optional safety checks when using Postgres
|
||||
|
||||
When using postgres, at startup we optionally run a few basic safety checks, e.g. the number of local channels, how long since the last local channel update, etc. The goal is to make sure that we are connected to the correct database instance.
|
||||
|
||||
Those checks are disabled by default because they wouldn't pass on a fresh new node with zero channels. You should enable them when you already have channels, so that there is something to compare to, and the values should be specific to your setup, particularly for local channels. Configuration is done by overriding `max-age` and `min-count` values in your `eclair.conf`:
|
||||
```
|
||||
eclair.db.postgres.safety-checks
|
||||
{
|
||||
enabled = true
|
||||
max-age {
|
||||
local-channels = 3 minutes
|
||||
network-nodes = 30 minutes
|
||||
audit-relayed = 10 minutes
|
||||
}
|
||||
min-count {
|
||||
local-channels = 10
|
||||
network-nodes = 3000
|
||||
network-channels = 20000
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### API changes
|
||||
|
||||
#### Timestamps
|
||||
|
|
|
@ -327,6 +327,25 @@ eclair {
|
|||
lock-timeout = 5 seconds // timeout for the lock statement on the lease table
|
||||
auto-release-at-shutdown = true // automatically release the lock when eclair is stopping
|
||||
}
|
||||
safety-checks {
|
||||
// A set of basic checks on data to make sure we use the correct database
|
||||
// Those checks are disabled by default because they wouldn't pass on a fresh new node with
|
||||
// zero channels. You should enable them when you already have channels, so that there is
|
||||
// something to compare to, and the values should be specific to your setup, especially
|
||||
// for local channels. If you operate a busy node, you can reduce max-age.local-channels
|
||||
// and max-age.audit-relayed to just a few minutes, this will significantly improve the safety.
|
||||
enabled = false
|
||||
max-age {
|
||||
local-channels = 15 minutes // last time a local channel was updated
|
||||
network-nodes = 30 minutes // most recent public node announcement
|
||||
audit-relayed = 1 hour // last time a payment was relayed
|
||||
}
|
||||
min-count {
|
||||
local-channels = 10 // minimum number of local channels, this entirely depends on your setup
|
||||
network-nodes = 3000 // minimum number of public nodes in the routing table
|
||||
network-channels = 20000 // minimum number of public channels in the routing table
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -20,6 +20,7 @@ import akka.Done
|
|||
import akka.actor.{ActorSystem, CoordinatedShutdown}
|
||||
import com.typesafe.config.Config
|
||||
import com.zaxxer.hikari.{HikariConfig, HikariDataSource}
|
||||
import fr.acinq.eclair.TimestampMilli
|
||||
import fr.acinq.eclair.db.pg.PgUtils.PgLock.LockFailureHandler
|
||||
import fr.acinq.eclair.db.pg.PgUtils._
|
||||
import fr.acinq.eclair.db.pg._
|
||||
|
@ -30,6 +31,7 @@ import java.io.File
|
|||
import java.nio.file._
|
||||
import java.sql.Connection
|
||||
import java.util.UUID
|
||||
import java.util.concurrent.TimeUnit
|
||||
import scala.concurrent.Future
|
||||
import scala.concurrent.duration._
|
||||
|
||||
|
@ -94,12 +96,22 @@ object Databases extends Logging {
|
|||
}
|
||||
|
||||
object PostgresDatabases {
|
||||
|
||||
/**
 * Thresholds used by the optional startup checks that make sure we are connected to the expected
 * postgres database. A check fails when the most recent record is older than the configured max age,
 * or when a table contains fewer rows than the configured min count.
 *
 * @param localChannelsMaxAge     max age of the most recent timestamp found on non-closed rows of local.channels
 * @param networkNodesMaxAge      max age of the most recent node announcement timestamp in network.nodes
 * @param auditRelayedMaxAge      max age of the most recent timestamp in audit.relayed
 * @param localChannelsMinCount   minimum number of rows in local.channels
 * @param networkNodesMinCount    minimum number of rows in network.nodes
 * @param networkChannelsMinCount minimum number of rows in network.public_channels
 */
case class SafetyChecks(localChannelsMaxAge: FiniteDuration,
|
||||
networkNodesMaxAge: FiniteDuration,
|
||||
auditRelayedMaxAge: FiniteDuration,
|
||||
localChannelsMinCount: Int,
|
||||
networkNodesMinCount: Int,
|
||||
networkChannelsMinCount: Int
|
||||
)
|
||||
|
||||
def apply(hikariConfig: HikariConfig,
|
||||
instanceId: UUID,
|
||||
lock: PgLock = PgLock.NoLock,
|
||||
jdbcUrlFile_opt: Option[File],
|
||||
readOnlyUser_opt: Option[String],
|
||||
resetJsonColumns: Boolean)(implicit system: ActorSystem): PostgresDatabases = {
|
||||
resetJsonColumns: Boolean,
|
||||
safetyChecks_opt: Option[SafetyChecks])(implicit system: ActorSystem): PostgresDatabases = {
|
||||
|
||||
jdbcUrlFile_opt.foreach(jdbcUrlFile => checkIfDatabaseUrlIsUnchanged(hikariConfig.getJdbcUrl, jdbcUrlFile))
|
||||
|
||||
|
@ -162,6 +174,71 @@ object Databases extends Logging {
|
|||
}
|
||||
}
|
||||
|
||||
// Optional startup safety checks: when configured, we compare the content of the database with what
// the operator expects (freshness of the latest records, minimum row counts). A failed check throws an
// IllegalArgumentException (through `require`), which is how we detect that we are most likely
// connected to the wrong database instance.
safetyChecks_opt foreach { initChecks =>

  PgUtils.inTransaction { connection =>
    using(connection.createStatement()) { statement =>

      /**
       * Checks that the most recent record returned by the query is not older than the given max age.
       *
       * @param name     human-readable name of the checked data, only used in log/error messages
       * @param maxAge   maximum accepted age of the most recent record
       * @param sqlQuery query returning a single nullable timestamp in a column named "max"
       */
      def checkMaxAge(name: String, maxAge: FiniteDuration, sqlQuery: String): Unit = {
        import ExtendedResultSet._
        val smallestAge_opt = statement
          .executeQuery(sqlQuery)
          .headOption // sql max() will always return a result, with perhaps a null value if there was no records
          .flatMap(_.getTimestampNullable("max"))
          .map(ts => TimestampMilli.now() - TimestampMilli.fromSqlTimestamp(ts))
        require(smallestAge_opt.isDefined, s"db check failed: no $name found")
        // the option is known to be defined at this point, we unwrap it without resorting to Option.get
        smallestAge_opt.foreach { smallestAge =>
          require(smallestAge <= maxAge, s"db check failed: most recent $name is too old (${smallestAge.toMinutes} minutes > ${maxAge.toMinutes} minutes)")
          logger.info(s"db check ok: max age ${smallestAge.toMinutes} minutes <= ${maxAge.toMinutes} minutes for $name")
        }
      }

      checkMaxAge(name = "local channel",
        maxAge = initChecks.localChannelsMaxAge,
        sqlQuery =
          """
            |SELECT MAX(GREATEST(created_timestamp, last_payment_sent_timestamp, last_payment_received_timestamp, last_connected_timestamp, closed_timestamp))
            |FROM local.channels
            |WHERE NOT is_closed""".stripMargin)

      checkMaxAge(name = "network node",
        maxAge = initChecks.networkNodesMaxAge,
        sqlQuery =
          """
            |SELECT MAX((json->'timestamp'->>'iso')::timestamptz)
            |FROM network.nodes""".stripMargin)

      checkMaxAge(name = "audit relayed",
        maxAge = initChecks.auditRelayedMaxAge,
        sqlQuery =
          """
            |SELECT MAX(timestamp)
            |FROM audit.relayed""".stripMargin)

      /**
       * Checks that the query counts at least `minCount` rows.
       *
       * @param name     human-readable name of the checked data, only used in log/error messages
       * @param minCount minimum accepted number of rows (inclusive)
       * @param sqlQuery query returning a single integer in a column named "count"
       */
      def checkMinCount(name: String, minCount: Int, sqlQuery: String): Unit = {
        import ExtendedResultSet._
        val count = statement
          .executeQuery(sqlQuery)
          .map(_.getInt("count"))
          .head // NB: COUNT(*) always returns exactly one row
        require(count >= minCount, s"db check failed: min count not reached for $name ($count < $minCount)")
        // the bound is inclusive: the message must say ">=", otherwise we would log e.g. "0 > 0"
        logger.info(s"db check ok: min count $count >= $minCount for $name")
      }

      checkMinCount(name = "local channels",
        minCount = initChecks.localChannelsMinCount,
        sqlQuery = "SELECT COUNT(*) FROM local.channels")

      checkMinCount(name = "network node",
        minCount = initChecks.networkNodesMinCount,
        sqlQuery = "SELECT COUNT(*) FROM network.nodes")

      checkMinCount(name = "network channels",
        minCount = initChecks.networkChannelsMinCount,
        sqlQuery = "SELECT COUNT(*) FROM network.public_channels")

    }
  }
}
|
||||
|
||||
databases
|
||||
}
|
||||
|
||||
|
@ -245,13 +322,25 @@ object Databases extends Logging {
|
|||
|
||||
val jdbcUrlFile = new File(dbdir, "last_jdbcurl")
|
||||
|
||||
// Optional postgres safety checks: thresholds are only read from the configuration when the checks are enabled.
val safetyChecks_opt: Option[PostgresDatabases.SafetyChecks] = {
  // durations are converted with a granularity of one second
  def readMaxAge(path: String): FiniteDuration = FiniteDuration(dbConfig.getDuration(path).getSeconds, TimeUnit.SECONDS)

  def readMinCount(path: String): Int = dbConfig.getInt(path)

  if (!dbConfig.getBoolean("postgres.safety-checks.enabled")) {
    None
  } else {
    Some(PostgresDatabases.SafetyChecks(
      localChannelsMaxAge = readMaxAge("postgres.safety-checks.max-age.local-channels"),
      networkNodesMaxAge = readMaxAge("postgres.safety-checks.max-age.network-nodes"),
      auditRelayedMaxAge = readMaxAge("postgres.safety-checks.max-age.audit-relayed"),
      localChannelsMinCount = readMinCount("postgres.safety-checks.min-count.local-channels"),
      networkNodesMinCount = readMinCount("postgres.safety-checks.min-count.network-nodes"),
      networkChannelsMinCount = readMinCount("postgres.safety-checks.min-count.network-channels"),
    ))
  }
}
|
||||
|
||||
Databases.PostgresDatabases(
|
||||
hikariConfig = hikariConfig,
|
||||
instanceId = instanceId,
|
||||
lock = lock,
|
||||
jdbcUrlFile_opt = Some(jdbcUrlFile),
|
||||
readOnlyUser_opt = readOnlyUser_opt,
|
||||
resetJsonColumns = resetJsonColumns
|
||||
resetJsonColumns = resetJsonColumns,
|
||||
safetyChecks_opt = safetyChecks_opt
|
||||
)
|
||||
}
|
||||
|
||||
|
|
|
@ -111,7 +111,7 @@ object TestDatabases {
|
|||
// @formatter:off
|
||||
override val connection: PgConnection = pg.getPostgresDatabase.getConnection.asInstanceOf[PgConnection]
|
||||
// NB: we use a lazy val here: databases won't be initialized until we reference that variable
|
||||
override lazy val db: Databases = Databases.PostgresDatabases(hikariConfig, UUID.randomUUID(), lock, jdbcUrlFile_opt = Some(jdbcUrlFile), readOnlyUser_opt = None, resetJsonColumns = false)
|
||||
override lazy val db: Databases = Databases.PostgresDatabases(hikariConfig, UUID.randomUUID(), lock, jdbcUrlFile_opt = Some(jdbcUrlFile), readOnlyUser_opt = None, resetJsonColumns = false, safetyChecks_opt = None)
|
||||
override def close(): Unit = pg.close()
|
||||
// @formatter:on
|
||||
}
|
||||
|
|
|
@ -1,11 +1,16 @@
|
|||
package fr.acinq.eclair.db
|
||||
|
||||
import com.opentable.db.postgres.embedded.EmbeddedPostgres
|
||||
import com.typesafe.config.{Config, ConfigFactory}
|
||||
import com.typesafe.config.{Config, ConfigFactory, ConfigValue}
|
||||
import fr.acinq.eclair.db.DbEventHandler.ChannelEvent
|
||||
import fr.acinq.eclair.db.pg.PgUtils.ExtendedResultSet._
|
||||
import fr.acinq.eclair.db.pg.PgUtils.PgLock.{LeaseLock, LockFailure, LockFailureHandler}
|
||||
import fr.acinq.eclair.db.pg.PgUtils.{JdbcUrlChanged, migrateTable, using}
|
||||
import fr.acinq.eclair.{TestKitBaseClass, TestUtils}
|
||||
import fr.acinq.eclair.payment.ChannelPaymentRelayed
|
||||
import fr.acinq.eclair.router.Announcements
|
||||
import fr.acinq.eclair.wire.internal.channel.ChannelCodecsSpec
|
||||
import fr.acinq.eclair.wire.protocol.Color
|
||||
import fr.acinq.eclair.{Features, MilliSatoshiLong, TestKitBaseClass, TestUtils, TimestampMilli, TimestampSecond, randomBytes32, randomKey}
|
||||
import grizzled.slf4j.{Logger, Logging}
|
||||
import org.postgresql.jdbc.PgConnection
|
||||
import org.postgresql.util.PGInterval
|
||||
|
@ -153,6 +158,74 @@ class PgUtilsSpec extends TestKitBaseClass with AnyFunSuiteLike with Eventually
|
|||
Databases.postgres(config, UUID.randomUUID(), datadir, LockFailureHandler.logAndThrow)
|
||||
}
|
||||
|
||||
test("safety checks") {
  val pg = EmbeddedPostgres.start()
  val baseConfig = ConfigFactory.parseString("postgres.lock-type=none").withFallback(PgUtilsSpec.testConfig(pg.getPort))
  val datadir = new File(TestUtils.BUILD_DIRECTORY, s"pg_test_${UUID.randomUUID()}")
  datadir.mkdirs()

  // every scenario starts the databases on the same data directory, with a fresh instance id
  def startDatabases(config: Config) = Databases.postgres(config, UUID.randomUUID(), datadir, LockFailureHandler.logAndThrow)

  // safety checks whose thresholds are compatible with the data inserted below
  val passingChecksConfig = ConfigFactory.parseString(
    s"""
       |postgres {
       | safety-checks {
       | // a set of basic checks on data to make sure we use the correct database
       | enabled = true
       | max-age {
       | local-channels = 3 minutes
       | network-nodes = 30 minutes
       | audit-relayed = 10 minutes
       | }
       | min-count {
       | local-channels = 1
       | network-nodes = 2
       | network-channels = 0
       | }
       | }
       |}""".stripMargin).withFallback(baseConfig)

  // same thresholds, except that we now require 10 local channels
  val failingChecksConfig = ConfigFactory.parseString(
    s"""
       |postgres {
       | safety-checks {
       | // a set of basic checks on data to make sure we use the correct database
       | enabled = true
       | max-age {
       | local-channels = 3 minutes
       | network-nodes = 30 minutes
       | audit-relayed = 10 minutes
       | }
       | min-count {
       | local-channels = 10
       | network-nodes = 2
       | network-channels = 0
       | }
       | }
       |}""".stripMargin).withFallback(baseConfig)

  // step 1: start without safety checks (base config) and insert one channel, three nodes and one relayed payment
  val initialDb = startDatabases(baseConfig)
  initialDb.channels.addOrUpdateChannel(ChannelCodecsSpec.normal)
  initialDb.channels.updateChannelMeta(ChannelCodecsSpec.normal.channelId, ChannelEvent.EventType.Created)
  initialDb.network.addNode(Announcements.makeNodeAnnouncement(randomKey(), "node-A", Color(50, 99, -80), Nil, Features.empty, TimestampSecond.now() - 45.days))
  initialDb.network.addNode(Announcements.makeNodeAnnouncement(randomKey(), "node-B", Color(50, 99, -80), Nil, Features.empty, TimestampSecond.now() - 3.days))
  initialDb.network.addNode(Announcements.makeNodeAnnouncement(randomKey(), "node-C", Color(50, 99, -80), Nil, Features.empty, TimestampSecond.now() - 7.minutes))
  initialDb.audit.add(ChannelPaymentRelayed(421 msat, 400 msat, randomBytes32(), randomBytes32(), randomBytes32(), TimestampMilli.now() - 3.seconds))
  initialDb.dataSource.close()

  // step 2: restarting with checks that match the inserted data must succeed
  val checkedDb = startDatabases(passingChecksConfig)
  checkedDb.dataSource.close()

  // step 3: restarting with a min count of local channels that isn't reached must be rejected
  intercept[IllegalArgumentException] {
    startDatabases(failingChecksConfig)
  }

}
|
||||
|
||||
test("migration test") {
|
||||
val pg = EmbeddedPostgres.start()
|
||||
using(pg.getPostgresDatabase.getConnection.createStatement()) { statement =>
|
||||
|
@ -218,6 +291,20 @@ object PgUtilsSpec extends Logging {
|
|||
| lock-timeout = 5 seconds // timeout for the lock statement on the lease table
|
||||
| auto-release-at-shutdown = false // automatically release the lock when eclair is stopping
|
||||
| }
|
||||
| safety-checks {
|
||||
| // a set of basic checks on data to make sure we use the correct database
|
||||
| enabled = false
|
||||
| max-age {
|
||||
| local-channels = 3 minutes
|
||||
| network-nodes = 30 minutes
|
||||
| audit-relayed = 10 minutes
|
||||
| }
|
||||
| min-count {
|
||||
| local-channels = 10
|
||||
| network-nodes = 3000
|
||||
| network-channels = 20000
|
||||
| }
|
||||
| }
|
||||
|}
|
||||
|""".stripMargin
|
||||
)
|
||||
|
|
Loading…
Add table
Reference in a new issue