mirror of
https://github.com/lightningnetwork/lnd.git
synced 2025-02-21 14:04:06 +01:00
Merge pull request #8504 from calvinrzachman/healthcheck
lnd/healthcheck: add checks after initialization + success/failure callbacks
This commit is contained in:
commit
d9887f3212
3 changed files with 280 additions and 19 deletions
|
@ -205,6 +205,9 @@
|
|||
for blinded path payloads to allow fuzzing before LND fully supports
|
||||
blinded payment relay.
|
||||
|
||||
* Allow `healthcheck` package users to provide [custom callbacks](https://github.com/lightningnetwork/lnd/pull/8504)
|
||||
which will execute whenever a healthcheck succeeds/fails.
|
||||
|
||||
### Logging
|
||||
* [Add the htlc amount](https://github.com/lightningnetwork/lnd/pull/8156) to
|
||||
contract court logs in case of timed-out htlcs in order to easily spot dust
|
||||
|
|
|
@ -15,6 +15,8 @@ import (
|
|||
"github.com/lightningnetwork/lnd/ticker"
|
||||
)
|
||||
|
||||
// noOpCallback is a callback that does nothing. NewObservation installs it
// for OnSuccess and OnFailure when the caller supplies none, so both callbacks
// can always be invoked without a nil check.
var noOpCallback = func() {}
|
||||
|
||||
// Config contains configuration settings for our monitor.
|
||||
type Config struct {
|
||||
// Checks is a set of health checks that assert that lnd has access to
|
||||
|
@ -96,6 +98,19 @@ func (m *Monitor) Stop() error {
|
|||
return nil
|
||||
}
|
||||
|
||||
// AddCheck adds a new healthcheck to our monitor.
|
||||
func (m *Monitor) AddCheck(check *Observation) error {
|
||||
|
||||
m.wg.Add(1)
|
||||
go func(check *Observation) {
|
||||
defer m.wg.Done()
|
||||
|
||||
check.monitor(m.cfg.Shutdown, m.quit)
|
||||
}(check)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// CreateCheck is a helper function that takes a function that produces an error
|
||||
// and wraps it in a function that returns its result on an error channel.
|
||||
// We do not wait group the goroutine running our checkFunc because we expect
|
||||
|
@ -137,13 +152,42 @@ type Observation struct {
|
|||
// Backoff is the amount of time we back off between retries for failed
|
||||
// checks.
|
||||
Backoff time.Duration
|
||||
|
||||
// OnSuccess is a callback which will be executed when the healthcheck
|
||||
// succeeds. This is optional.
|
||||
OnSuccess func()
|
||||
|
||||
// OnFailure is a callback which will be executed when the healthcheck
|
||||
// fails. This is optional.
|
||||
OnFailure func()
|
||||
}
|
||||
|
||||
// ObservationOption describes the signature of a functional option that can be
|
||||
// used to modify the behaviour of an Observation.
|
||||
type ObservationOption func(*Observation)
|
||||
|
||||
// WithSuccessCallback configures an observation with a callback to be fired
|
||||
// whenever the health check succeeds.
|
||||
func WithSuccessCallback(callback func()) ObservationOption {
|
||||
return func(o *Observation) {
|
||||
o.OnSuccess = callback
|
||||
}
|
||||
}
|
||||
|
||||
// WithFailureCallback configures an observation with a callback to be fired
|
||||
// whenever the health check reaches its failure threshold.
|
||||
func WithFailureCallback(callback func()) ObservationOption {
|
||||
return func(o *Observation) {
|
||||
o.OnFailure = callback
|
||||
}
|
||||
}
|
||||
|
||||
// NewObservation creates an observation.
|
||||
func NewObservation(name string, check func() error, interval,
|
||||
timeout, backoff time.Duration, attempts int) *Observation {
|
||||
func NewObservation(name string, check func() error, interval, timeout,
|
||||
backoff time.Duration, attempts int,
|
||||
opts ...ObservationOption) *Observation {
|
||||
|
||||
return &Observation{
|
||||
observation := &Observation{
|
||||
Name: name,
|
||||
Check: CreateCheck(check),
|
||||
Interval: ticker.New(interval),
|
||||
|
@ -151,6 +195,22 @@ func NewObservation(name string, check func() error, interval,
|
|||
Timeout: timeout,
|
||||
Backoff: backoff,
|
||||
}
|
||||
|
||||
// Apply each option to the observation.
|
||||
for _, opt := range opts {
|
||||
opt(observation)
|
||||
}
|
||||
|
||||
// Ensure that we default to NO-OP callbacks.
|
||||
if observation.OnSuccess == nil {
|
||||
observation.OnSuccess = noOpCallback
|
||||
}
|
||||
|
||||
if observation.OnFailure == nil {
|
||||
observation.OnFailure = noOpCallback
|
||||
}
|
||||
|
||||
return observation
|
||||
}
|
||||
|
||||
// String returns a string representation of an observation.
|
||||
|
@ -206,6 +266,17 @@ func (o *Observation) retryCheck(quit chan struct{},
|
|||
var err error
|
||||
select {
|
||||
case err = <-o.Check():
|
||||
// If our error is nil, we have passed our health check,
|
||||
// so we'll invoke our success callback if defined and
|
||||
// then exit.
|
||||
if err == nil {
|
||||
log.Debug("invoking success callback")
|
||||
|
||||
// Invoke the success callback.
|
||||
o.OnSuccess()
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
case <-time.After(o.Timeout):
|
||||
err = fmt.Errorf("health check: %v timed out after: "+
|
||||
|
@ -216,15 +287,14 @@ func (o *Observation) retryCheck(quit chan struct{},
|
|||
return false
|
||||
}
|
||||
|
||||
// If our error is nil, we have passed our health check, so we
|
||||
// can exit.
|
||||
if err == nil {
|
||||
return false
|
||||
}
|
||||
|
||||
// If we have reached our allowed number of attempts, this
|
||||
// check has failed so we request shutdown.
|
||||
// check has failed so we'll fire the on failure callback
|
||||
// and request shutdown.
|
||||
if count == o.Attempts {
|
||||
log.Debug("invoking failure callback")
|
||||
|
||||
o.OnFailure()
|
||||
|
||||
shutdown("Health check: %v failed after %v "+
|
||||
"calls", o, o.Attempts)
|
||||
return true
|
||||
|
|
|
@ -55,11 +55,13 @@ func TestMonitor(t *testing.T) {
|
|||
cfg := &Config{
|
||||
Checks: []*Observation{
|
||||
{
|
||||
Check: mock.call,
|
||||
Interval: intervalTicker,
|
||||
Attempts: 2,
|
||||
Backoff: 0,
|
||||
Timeout: time.Hour,
|
||||
Check: mock.call,
|
||||
Interval: intervalTicker,
|
||||
Attempts: 2,
|
||||
Backoff: 0,
|
||||
Timeout: time.Hour,
|
||||
OnSuccess: noOpCallback,
|
||||
OnFailure: noOpCallback,
|
||||
},
|
||||
},
|
||||
Shutdown: func(string, ...interface{}) {
|
||||
|
@ -202,10 +204,12 @@ func TestRetryCheck(t *testing.T) {
|
|||
// function. We set a zero back off so that the test
|
||||
// will not wait.
|
||||
observation := &Observation{
|
||||
Check: mock.call,
|
||||
Attempts: test.attempts,
|
||||
Timeout: test.timeout,
|
||||
Backoff: 0,
|
||||
Check: mock.call,
|
||||
Attempts: test.attempts,
|
||||
Timeout: test.timeout,
|
||||
Backoff: 0,
|
||||
OnSuccess: noOpCallback,
|
||||
OnFailure: noOpCallback,
|
||||
}
|
||||
quit := make(chan struct{})
|
||||
|
||||
|
@ -238,3 +242,187 @@ func TestRetryCheck(t *testing.T) {
|
|||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestCallbacks verifies that we fire the OnSuccess/OnFailure callbacks
|
||||
// as expected.
|
||||
//
|
||||
// - When the health check succeeds, the OnSuccess callback should fire.
|
||||
// - When the failure threshold is reached, the OnFailure callback should fire.
|
||||
func TestCallbacks(t *testing.T) {
|
||||
intervalTicker := ticker.NewForce(time.Hour)
|
||||
|
||||
mock := newMockCheck(t)
|
||||
failureThreshold := 3
|
||||
|
||||
successChan := make(chan struct{})
|
||||
failChan := make(chan struct{})
|
||||
shutdown := make(chan struct{})
|
||||
|
||||
// Create our config for monitoring. We will use a 0 back off so that
|
||||
// out test does not need to wait.
|
||||
observation := &Observation{
|
||||
Check: mock.call,
|
||||
Interval: intervalTicker,
|
||||
Attempts: failureThreshold,
|
||||
Backoff: 0,
|
||||
Timeout: time.Hour,
|
||||
OnSuccess: func() {
|
||||
select {
|
||||
case successChan <- struct{}{}:
|
||||
case <-time.After(timeout):
|
||||
t.Fatal("unable to fire onSuccess callback")
|
||||
}
|
||||
},
|
||||
OnFailure: func() {
|
||||
close(failChan)
|
||||
},
|
||||
}
|
||||
|
||||
cfg := &Config{
|
||||
Checks: []*Observation{observation},
|
||||
Shutdown: func(string, ...interface{}) {
|
||||
shutdown <- struct{}{}
|
||||
},
|
||||
}
|
||||
monitor := NewMonitor(cfg)
|
||||
require.NoError(t, monitor.Start(), "could not start monitor")
|
||||
|
||||
// Tick is a helper we will use to tick our interval.
|
||||
tick := func() {
|
||||
select {
|
||||
case intervalTicker.Force <- testTime:
|
||||
case <-time.After(timeout):
|
||||
t.Fatal("could not tick timer")
|
||||
}
|
||||
}
|
||||
|
||||
// We expect that the onSuccess callback is fired after each successful
|
||||
// check.
|
||||
for i := 0; i < failureThreshold; i++ {
|
||||
tick()
|
||||
mock.sendError(nil)
|
||||
|
||||
// We expect that the onSuccess callback will have fired.
|
||||
select {
|
||||
case <-successChan:
|
||||
case <-time.After(timeout):
|
||||
t.Fatal("expected success callback")
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// Kick off another health check iteration. The monitor's internal
|
||||
// retry mechanism will re-attempt the check until it has reached
|
||||
// the configured maximum # of attempts.
|
||||
//
|
||||
// This mocks our check function failing the maximum # of times
|
||||
// consecutively that it is allowed.
|
||||
tick()
|
||||
for i := 1; i <= failureThreshold; i++ {
|
||||
mock.sendError(errNonNil)
|
||||
|
||||
// Verify that the onFailure callback does not fire unless
|
||||
// the failure threshold (maximum # of attempts) is reached.
|
||||
if i < failureThreshold {
|
||||
select {
|
||||
case <-failChan:
|
||||
t.Fatal("unexpected onFailure callback")
|
||||
default:
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// After reaching the failure threshold for this health check,
|
||||
// we expect that the onFailure callback will have fired.
|
||||
select {
|
||||
case <-failChan:
|
||||
case <-time.After(timeout):
|
||||
t.Fatal("expected onFailure callback")
|
||||
}
|
||||
|
||||
// Since we have failed within our allowed number of retries, we now
|
||||
// expect a call to our shutdown function.
|
||||
select {
|
||||
case <-shutdown:
|
||||
case <-time.After(timeout):
|
||||
t.Fatal("expected shutdown")
|
||||
}
|
||||
require.NoError(t, monitor.Stop(), "could not stop monitor")
|
||||
}
|
||||
|
||||
// TestDynamicChecks verifies that we actually kick off health check routines
|
||||
// for observations that are added after starting the monitor.
|
||||
func TestDynamicChecks(t *testing.T) {
|
||||
intervalTicker := ticker.NewForce(time.Hour)
|
||||
|
||||
mock := newMockCheck(t)
|
||||
|
||||
successChan := make(chan struct{})
|
||||
shutdown := make(chan struct{})
|
||||
|
||||
// Don't configure any health checks for this monitor.
|
||||
// We'd like to verify that we can add checks after startup.
|
||||
cfg := &Config{
|
||||
Checks: []*Observation{},
|
||||
Shutdown: func(string, ...interface{}) {
|
||||
shutdown <- struct{}{}
|
||||
},
|
||||
}
|
||||
monitor := NewMonitor(cfg)
|
||||
require.NoError(t, monitor.Start(), "could not start monitor")
|
||||
|
||||
// Tick is a helper we will use to tick our interval.
|
||||
tick := func() {
|
||||
select {
|
||||
case intervalTicker.Force <- testTime:
|
||||
case <-time.After(timeout):
|
||||
t.Fatal("could not tick timer")
|
||||
}
|
||||
}
|
||||
|
||||
observation := &Observation{
|
||||
Check: mock.call,
|
||||
Interval: intervalTicker,
|
||||
Attempts: 2,
|
||||
Backoff: 0,
|
||||
Timeout: time.Hour,
|
||||
OnSuccess: func() {
|
||||
select {
|
||||
case successChan <- struct{}{}:
|
||||
case <-time.After(timeout):
|
||||
t.Fatal("unable to fire onSuccess callback")
|
||||
}
|
||||
},
|
||||
OnFailure: noOpCallback,
|
||||
}
|
||||
|
||||
// Add the check after having started the monitor.
|
||||
err := monitor.AddCheck(observation)
|
||||
require.NoError(t, err, "could not add new observation")
|
||||
|
||||
// This should initiate the check we dynamically added above.
|
||||
tick()
|
||||
|
||||
// Verify that we can fire the OnSuccess callback.
|
||||
mock.sendError(errNonNil)
|
||||
mock.sendError(nil)
|
||||
select {
|
||||
case <-successChan:
|
||||
case <-time.After(timeout):
|
||||
t.Fatal("expected success callback")
|
||||
}
|
||||
|
||||
// Verify that we correctly shutdown if the added health check fails.
|
||||
tick()
|
||||
mock.sendError(errNonNil)
|
||||
mock.sendError(errNonNil)
|
||||
|
||||
// Since we have failed within our allowed number of retries, we now
|
||||
// expect a call to our shutdown function.
|
||||
select {
|
||||
case <-shutdown:
|
||||
case <-time.After(timeout):
|
||||
t.Fatal("expected shutdown")
|
||||
}
|
||||
require.NoError(t, monitor.Stop(), "could not stop monitor")
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue