mirror of
https://github.com/lightningnetwork/lnd.git
synced 2025-02-22 14:22:37 +01:00
Merge pull request #8504 from calvinrzachman/healthcheck
lnd/healthcheck: add checks after initialization + success/failure callbacks
This commit is contained in:
commit
d9887f3212
3 changed files with 280 additions and 19 deletions
|
@ -205,6 +205,9 @@
|
||||||
for blinded path payloads to allow fuzzing before LND fully supports
|
for blinded path payloads to allow fuzzing before LND fully supports
|
||||||
blinded payment relay.
|
blinded payment relay.
|
||||||
|
|
||||||
|
* Allow `healthcheck` package users to provide [custom callbacks](https://github.com/lightningnetwork/lnd/pull/8504)
|
||||||
|
which will execute whenever a healthcheck succeeds/fails.
|
||||||
|
|
||||||
### Logging
|
### Logging
|
||||||
* [Add the htlc amount](https://github.com/lightningnetwork/lnd/pull/8156) to
|
* [Add the htlc amount](https://github.com/lightningnetwork/lnd/pull/8156) to
|
||||||
contract court logs in case of timed-out htlcs in order to easily spot dust
|
contract court logs in case of timed-out htlcs in order to easily spot dust
|
||||||
|
|
|
@ -15,6 +15,8 @@ import (
|
||||||
"github.com/lightningnetwork/lnd/ticker"
|
"github.com/lightningnetwork/lnd/ticker"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// noOpCallback is a do-nothing callback. It is used as the default for an
// Observation's OnSuccess and OnFailure hooks when none is supplied.
var noOpCallback = func() {}
|
||||||
|
|
||||||
// Config contains configuration settings for our monitor.
|
// Config contains configuration settings for our monitor.
|
||||||
type Config struct {
|
type Config struct {
|
||||||
// Checks is a set of health checks that assert that lnd has access to
|
// Checks is a set of health checks that assert that lnd has access to
|
||||||
|
@ -96,6 +98,19 @@ func (m *Monitor) Stop() error {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// AddCheck adds a new healthcheck to our monitor.
|
||||||
|
func (m *Monitor) AddCheck(check *Observation) error {
|
||||||
|
|
||||||
|
m.wg.Add(1)
|
||||||
|
go func(check *Observation) {
|
||||||
|
defer m.wg.Done()
|
||||||
|
|
||||||
|
check.monitor(m.cfg.Shutdown, m.quit)
|
||||||
|
}(check)
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
// CreateCheck is a helper function that takes a function that produces an error
|
// CreateCheck is a helper function that takes a function that produces an error
|
||||||
// and wraps it in a function that returns its result on an error channel.
|
// and wraps it in a function that returns its result on an error channel.
|
||||||
// We do not wait group the goroutine running our checkFunc because we expect
|
// We do not wait group the goroutine running our checkFunc because we expect
|
||||||
|
@ -137,13 +152,42 @@ type Observation struct {
|
||||||
// Backoff is the amount of time we back off between retries for failed
|
// Backoff is the amount of time we back off between retries for failed
|
||||||
// checks.
|
// checks.
|
||||||
Backoff time.Duration
|
Backoff time.Duration
|
||||||
|
|
||||||
|
// OnSuccess is a callback which will be executed when the healthcheck
|
||||||
|
// succeeds. This is optional.
|
||||||
|
OnSuccess func()
|
||||||
|
|
||||||
|
// OnFailure is a callback which will be executed when the healthcheck
|
||||||
|
// fails. This is optional.
|
||||||
|
OnFailure func()
|
||||||
|
}
|
||||||
|
|
||||||
|
// ObservationOption is a functional option: a function that receives an
|
||||||
|
// Observation and modifies its behaviour, e.g. by installing a callback.
|
||||||
|
type ObservationOption func(*Observation)
|
||||||
|
|
||||||
|
// WithSuccessCallback configures an observation with a callback to be fired
|
||||||
|
// whenever the health check succeeds.
|
||||||
|
func WithSuccessCallback(callback func()) ObservationOption {
|
||||||
|
return func(o *Observation) {
|
||||||
|
o.OnSuccess = callback
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// WithFailureCallback configures an observation with a callback to be fired
|
||||||
|
// whenever the health check reaches its failure threshold.
|
||||||
|
func WithFailureCallback(callback func()) ObservationOption {
|
||||||
|
return func(o *Observation) {
|
||||||
|
o.OnFailure = callback
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// NewObservation creates an observation.
|
// NewObservation creates an observation.
|
||||||
func NewObservation(name string, check func() error, interval,
|
func NewObservation(name string, check func() error, interval, timeout,
|
||||||
timeout, backoff time.Duration, attempts int) *Observation {
|
backoff time.Duration, attempts int,
|
||||||
|
opts ...ObservationOption) *Observation {
|
||||||
|
|
||||||
return &Observation{
|
observation := &Observation{
|
||||||
Name: name,
|
Name: name,
|
||||||
Check: CreateCheck(check),
|
Check: CreateCheck(check),
|
||||||
Interval: ticker.New(interval),
|
Interval: ticker.New(interval),
|
||||||
|
@ -151,6 +195,22 @@ func NewObservation(name string, check func() error, interval,
|
||||||
Timeout: timeout,
|
Timeout: timeout,
|
||||||
Backoff: backoff,
|
Backoff: backoff,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Apply each option to the observation.
|
||||||
|
for _, opt := range opts {
|
||||||
|
opt(observation)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Ensure that we default to NO-OP callbacks.
|
||||||
|
if observation.OnSuccess == nil {
|
||||||
|
observation.OnSuccess = noOpCallback
|
||||||
|
}
|
||||||
|
|
||||||
|
if observation.OnFailure == nil {
|
||||||
|
observation.OnFailure = noOpCallback
|
||||||
|
}
|
||||||
|
|
||||||
|
return observation
|
||||||
}
|
}
|
||||||
|
|
||||||
// String returns a string representation of an observation.
|
// String returns a string representation of an observation.
|
||||||
|
@ -206,6 +266,17 @@ func (o *Observation) retryCheck(quit chan struct{},
|
||||||
var err error
|
var err error
|
||||||
select {
|
select {
|
||||||
case err = <-o.Check():
|
case err = <-o.Check():
|
||||||
|
// If our error is nil, we have passed our health check,
|
||||||
|
// so we'll invoke our success callback if defined and
|
||||||
|
// then exit.
|
||||||
|
if err == nil {
|
||||||
|
log.Debug("invoking success callback")
|
||||||
|
|
||||||
|
// Invoke the success callback.
|
||||||
|
o.OnSuccess()
|
||||||
|
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
case <-time.After(o.Timeout):
|
case <-time.After(o.Timeout):
|
||||||
err = fmt.Errorf("health check: %v timed out after: "+
|
err = fmt.Errorf("health check: %v timed out after: "+
|
||||||
|
@ -216,15 +287,14 @@ func (o *Observation) retryCheck(quit chan struct{},
|
||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
|
||||||
// If our error is nil, we have passed our health check, so we
|
|
||||||
// can exit.
|
|
||||||
if err == nil {
|
|
||||||
return false
|
|
||||||
}
|
|
||||||
|
|
||||||
// If we have reached our allowed number of attempts, this
|
// If we have reached our allowed number of attempts, this
|
||||||
// check has failed so we request shutdown.
|
// check has failed so we'll fire the on failure callback
|
||||||
|
// and request shutdown.
|
||||||
if count == o.Attempts {
|
if count == o.Attempts {
|
||||||
|
log.Debug("invoking failure callback")
|
||||||
|
|
||||||
|
o.OnFailure()
|
||||||
|
|
||||||
shutdown("Health check: %v failed after %v "+
|
shutdown("Health check: %v failed after %v "+
|
||||||
"calls", o, o.Attempts)
|
"calls", o, o.Attempts)
|
||||||
return true
|
return true
|
||||||
|
|
|
@ -55,11 +55,13 @@ func TestMonitor(t *testing.T) {
|
||||||
cfg := &Config{
|
cfg := &Config{
|
||||||
Checks: []*Observation{
|
Checks: []*Observation{
|
||||||
{
|
{
|
||||||
Check: mock.call,
|
Check: mock.call,
|
||||||
Interval: intervalTicker,
|
Interval: intervalTicker,
|
||||||
Attempts: 2,
|
Attempts: 2,
|
||||||
Backoff: 0,
|
Backoff: 0,
|
||||||
Timeout: time.Hour,
|
Timeout: time.Hour,
|
||||||
|
OnSuccess: noOpCallback,
|
||||||
|
OnFailure: noOpCallback,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
Shutdown: func(string, ...interface{}) {
|
Shutdown: func(string, ...interface{}) {
|
||||||
|
@ -202,10 +204,12 @@ func TestRetryCheck(t *testing.T) {
|
||||||
// function. We set a zero back off so that the test
|
// function. We set a zero back off so that the test
|
||||||
// will not wait.
|
// will not wait.
|
||||||
observation := &Observation{
|
observation := &Observation{
|
||||||
Check: mock.call,
|
Check: mock.call,
|
||||||
Attempts: test.attempts,
|
Attempts: test.attempts,
|
||||||
Timeout: test.timeout,
|
Timeout: test.timeout,
|
||||||
Backoff: 0,
|
Backoff: 0,
|
||||||
|
OnSuccess: noOpCallback,
|
||||||
|
OnFailure: noOpCallback,
|
||||||
}
|
}
|
||||||
quit := make(chan struct{})
|
quit := make(chan struct{})
|
||||||
|
|
||||||
|
@ -238,3 +242,187 @@ func TestRetryCheck(t *testing.T) {
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TestCallbacks verifies that we fire the OnSuccess/OnFailure callbacks
|
||||||
|
// as expected.
|
||||||
|
//
|
||||||
|
// - When the health check succeeds, the OnSuccess callback should fire.
|
||||||
|
// - When the failure threshold is reached, the OnFailure callback should fire.
|
||||||
|
func TestCallbacks(t *testing.T) {
|
||||||
|
intervalTicker := ticker.NewForce(time.Hour)
|
||||||
|
|
||||||
|
mock := newMockCheck(t)
|
||||||
|
failureThreshold := 3
|
||||||
|
|
||||||
|
successChan := make(chan struct{})
|
||||||
|
failChan := make(chan struct{})
|
||||||
|
shutdown := make(chan struct{})
|
||||||
|
|
||||||
|
// Create our config for monitoring. We will use a 0 back off so that
|
||||||
|
// out test does not need to wait.
|
||||||
|
observation := &Observation{
|
||||||
|
Check: mock.call,
|
||||||
|
Interval: intervalTicker,
|
||||||
|
Attempts: failureThreshold,
|
||||||
|
Backoff: 0,
|
||||||
|
Timeout: time.Hour,
|
||||||
|
OnSuccess: func() {
|
||||||
|
select {
|
||||||
|
case successChan <- struct{}{}:
|
||||||
|
case <-time.After(timeout):
|
||||||
|
t.Fatal("unable to fire onSuccess callback")
|
||||||
|
}
|
||||||
|
},
|
||||||
|
OnFailure: func() {
|
||||||
|
close(failChan)
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
cfg := &Config{
|
||||||
|
Checks: []*Observation{observation},
|
||||||
|
Shutdown: func(string, ...interface{}) {
|
||||||
|
shutdown <- struct{}{}
|
||||||
|
},
|
||||||
|
}
|
||||||
|
monitor := NewMonitor(cfg)
|
||||||
|
require.NoError(t, monitor.Start(), "could not start monitor")
|
||||||
|
|
||||||
|
// Tick is a helper we will use to tick our interval.
|
||||||
|
tick := func() {
|
||||||
|
select {
|
||||||
|
case intervalTicker.Force <- testTime:
|
||||||
|
case <-time.After(timeout):
|
||||||
|
t.Fatal("could not tick timer")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// We expect that the onSuccess callback is fired after each successful
|
||||||
|
// check.
|
||||||
|
for i := 0; i < failureThreshold; i++ {
|
||||||
|
tick()
|
||||||
|
mock.sendError(nil)
|
||||||
|
|
||||||
|
// We expect that the onSuccess callback will have fired.
|
||||||
|
select {
|
||||||
|
case <-successChan:
|
||||||
|
case <-time.After(timeout):
|
||||||
|
t.Fatal("expected success callback")
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
// Kick off another health check iteration. The monitor's internal
|
||||||
|
// retry mechanism will re-attempt the check until it has reached
|
||||||
|
// the configured maximum # of attempts.
|
||||||
|
//
|
||||||
|
// This mocks our check function failing the maximum # of times
|
||||||
|
// consecutively that it is allowed.
|
||||||
|
tick()
|
||||||
|
for i := 1; i <= failureThreshold; i++ {
|
||||||
|
mock.sendError(errNonNil)
|
||||||
|
|
||||||
|
// Verify that the onFailure callback does not fire unless
|
||||||
|
// the failure threshold (maximum # of attempts) is reached.
|
||||||
|
if i < failureThreshold {
|
||||||
|
select {
|
||||||
|
case <-failChan:
|
||||||
|
t.Fatal("unexpected onFailure callback")
|
||||||
|
default:
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// After reaching the failure threshold for this health check,
|
||||||
|
// we expect that the onFailure callback will have fired.
|
||||||
|
select {
|
||||||
|
case <-failChan:
|
||||||
|
case <-time.After(timeout):
|
||||||
|
t.Fatal("expected onFailure callback")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Since we have failed within our allowed number of retries, we now
|
||||||
|
// expect a call to our shutdown function.
|
||||||
|
select {
|
||||||
|
case <-shutdown:
|
||||||
|
case <-time.After(timeout):
|
||||||
|
t.Fatal("expected shutdown")
|
||||||
|
}
|
||||||
|
require.NoError(t, monitor.Stop(), "could not stop monitor")
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestDynamicChecks verifies that we actually kick off health check routines
|
||||||
|
// for observations that are added after starting the monitor.
|
||||||
|
func TestDynamicChecks(t *testing.T) {
|
||||||
|
intervalTicker := ticker.NewForce(time.Hour)
|
||||||
|
|
||||||
|
mock := newMockCheck(t)
|
||||||
|
|
||||||
|
successChan := make(chan struct{})
|
||||||
|
shutdown := make(chan struct{})
|
||||||
|
|
||||||
|
// Don't configure any health checks for this monitor.
|
||||||
|
// We'd like to verify that we can add checks after startup.
|
||||||
|
cfg := &Config{
|
||||||
|
Checks: []*Observation{},
|
||||||
|
Shutdown: func(string, ...interface{}) {
|
||||||
|
shutdown <- struct{}{}
|
||||||
|
},
|
||||||
|
}
|
||||||
|
monitor := NewMonitor(cfg)
|
||||||
|
require.NoError(t, monitor.Start(), "could not start monitor")
|
||||||
|
|
||||||
|
// Tick is a helper we will use to tick our interval.
|
||||||
|
tick := func() {
|
||||||
|
select {
|
||||||
|
case intervalTicker.Force <- testTime:
|
||||||
|
case <-time.After(timeout):
|
||||||
|
t.Fatal("could not tick timer")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
observation := &Observation{
|
||||||
|
Check: mock.call,
|
||||||
|
Interval: intervalTicker,
|
||||||
|
Attempts: 2,
|
||||||
|
Backoff: 0,
|
||||||
|
Timeout: time.Hour,
|
||||||
|
OnSuccess: func() {
|
||||||
|
select {
|
||||||
|
case successChan <- struct{}{}:
|
||||||
|
case <-time.After(timeout):
|
||||||
|
t.Fatal("unable to fire onSuccess callback")
|
||||||
|
}
|
||||||
|
},
|
||||||
|
OnFailure: noOpCallback,
|
||||||
|
}
|
||||||
|
|
||||||
|
// Add the check after having started the monitor.
|
||||||
|
err := monitor.AddCheck(observation)
|
||||||
|
require.NoError(t, err, "could not add new observation")
|
||||||
|
|
||||||
|
// This should initiate the check we dynamically added above.
|
||||||
|
tick()
|
||||||
|
|
||||||
|
// Verify that we can fire the OnSuccess callback.
|
||||||
|
mock.sendError(errNonNil)
|
||||||
|
mock.sendError(nil)
|
||||||
|
select {
|
||||||
|
case <-successChan:
|
||||||
|
case <-time.After(timeout):
|
||||||
|
t.Fatal("expected success callback")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Verify that we correctly shutdown if the added health check fails.
|
||||||
|
tick()
|
||||||
|
mock.sendError(errNonNil)
|
||||||
|
mock.sendError(errNonNil)
|
||||||
|
|
||||||
|
// Since we have failed within our allowed number of retries, we now
|
||||||
|
// expect a call to our shutdown function.
|
||||||
|
select {
|
||||||
|
case <-shutdown:
|
||||||
|
case <-time.After(timeout):
|
||||||
|
t.Fatal("expected shutdown")
|
||||||
|
}
|
||||||
|
require.NoError(t, monitor.Stop(), "could not stop monitor")
|
||||||
|
}
|
||||||
|
|
Loading…
Add table
Reference in a new issue