Merge pull request #8504 from calvinrzachman/healthcheck

lnd/healthcheck: add checks after initialization + success/failure callbacks
This commit is contained in:
Yong 2024-03-13 20:20:39 +08:00 committed by GitHub
commit d9887f3212
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 280 additions and 19 deletions

View file

@ -205,6 +205,9 @@
for blinded path payloads to allow fuzzing before LND fully supports
blinded payment relay.
* Allow `healthcheck` package users to provide [custom callbacks](https://github.com/lightningnetwork/lnd/pull/8504)
which will execute whenever a healthcheck succeeds or reaches its failure
threshold.
### Logging
* [Add the htlc amount](https://github.com/lightningnetwork/lnd/pull/8156) to
contract court logs in case of timed-out htlcs in order to easily spot dust

View file

@ -15,6 +15,8 @@ import (
"github.com/lightningnetwork/lnd/ticker"
)
// noOpCallback is a callback that does nothing. It is used as the default
// for an Observation's OnSuccess and OnFailure callbacks when none is set.
var noOpCallback = func() {}
// Config contains configuration settings for our monitor.
type Config struct {
// Checks is a set of health checks that assert that lnd has access to
@ -96,6 +98,19 @@ func (m *Monitor) Stop() error {
return nil
}
// AddCheck adds a new healthcheck to our monitor and immediately starts
// monitoring it in its own goroutine, which is tracked by the monitor's wait
// group.
//
// A nil check is rejected with an error. The success/failure callbacks on an
// Observation are optional, so any callback left unset is defaulted to a
// no-op here. This ensures that observations built as plain struct literals
// cannot trigger a nil function call when a check passes or fails.
func (m *Monitor) AddCheck(check *Observation) error {
	if check == nil {
		return fmt.Errorf("cannot add nil health check")
	}

	// The callbacks are invoked unconditionally once a check passes or
	// reaches its failure threshold, so make sure they are never nil.
	if check.OnSuccess == nil {
		check.OnSuccess = func() {}
	}
	if check.OnFailure == nil {
		check.OnFailure = func() {}
	}

	m.wg.Add(1)
	go func(check *Observation) {
		defer m.wg.Done()

		check.monitor(m.cfg.Shutdown, m.quit)
	}(check)

	return nil
}
// CreateCheck is a helper function that takes a function that produces an error
// and wraps it in a function that returns its result on an error channel.
// We do not wait group the goroutine running our checkFunc because we expect
@ -137,13 +152,42 @@ type Observation struct {
// Backoff is the amount of time we back off between retries for failed
// checks.
Backoff time.Duration
// OnSuccess is a callback which will be executed when the healthcheck
// succeeds. This is optional.
OnSuccess func()
// OnFailure is a callback which will be executed when the healthcheck
// fails. This is optional.
OnFailure func()
}
// ObservationOption describes the signature of a functional option that can be
// used to modify the behaviour of an Observation. Options are passed to
// NewObservation, which applies each of them in order to the Observation it
// is constructing.
type ObservationOption func(*Observation)
// WithSuccessCallback returns an option that installs the given callback as
// the observation's OnSuccess handler, which is run each time the health
// check passes.
func WithSuccessCallback(callback func()) ObservationOption {
	setOnSuccess := func(obs *Observation) {
		obs.OnSuccess = callback
	}

	return setOnSuccess
}
// WithFailureCallback returns an option that installs the given callback as
// the observation's OnFailure handler, which is run once the health check
// has reached its failure threshold.
func WithFailureCallback(callback func()) ObservationOption {
	setOnFailure := func(obs *Observation) {
		obs.OnFailure = callback
	}

	return setOnFailure
}
// NewObservation creates an observation.
func NewObservation(name string, check func() error, interval,
timeout, backoff time.Duration, attempts int) *Observation {
func NewObservation(name string, check func() error, interval, timeout,
backoff time.Duration, attempts int,
opts ...ObservationOption) *Observation {
return &Observation{
observation := &Observation{
Name: name,
Check: CreateCheck(check),
Interval: ticker.New(interval),
@ -151,6 +195,22 @@ func NewObservation(name string, check func() error, interval,
Timeout: timeout,
Backoff: backoff,
}
// Apply each option to the observation.
for _, opt := range opts {
opt(observation)
}
// Ensure that we default to NO-OP callbacks.
if observation.OnSuccess == nil {
observation.OnSuccess = noOpCallback
}
if observation.OnFailure == nil {
observation.OnFailure = noOpCallback
}
return observation
}
// String returns a string representation of an observation.
@ -206,6 +266,17 @@ func (o *Observation) retryCheck(quit chan struct{},
var err error
select {
case err = <-o.Check():
// If our error is nil, we have passed our health check,
// so we'll invoke our success callback if defined and
// then exit.
if err == nil {
log.Debug("invoking success callback")
// Invoke the success callback.
o.OnSuccess()
return false
}
case <-time.After(o.Timeout):
err = fmt.Errorf("health check: %v timed out after: "+
@ -216,15 +287,14 @@ func (o *Observation) retryCheck(quit chan struct{},
return false
}
// If our error is nil, we have passed our health check, so we
// can exit.
if err == nil {
return false
}
// If we have reached our allowed number of attempts, this
// check has failed so we request shutdown.
// check has failed so we'll fire the on failure callback
// and request shutdown.
if count == o.Attempts {
log.Debug("invoking failure callback")
o.OnFailure()
shutdown("Health check: %v failed after %v "+
"calls", o, o.Attempts)
return true

View file

@ -55,11 +55,13 @@ func TestMonitor(t *testing.T) {
cfg := &Config{
Checks: []*Observation{
{
Check: mock.call,
Interval: intervalTicker,
Attempts: 2,
Backoff: 0,
Timeout: time.Hour,
Check: mock.call,
Interval: intervalTicker,
Attempts: 2,
Backoff: 0,
Timeout: time.Hour,
OnSuccess: noOpCallback,
OnFailure: noOpCallback,
},
},
Shutdown: func(string, ...interface{}) {
@ -202,10 +204,12 @@ func TestRetryCheck(t *testing.T) {
// function. We set a zero back off so that the test
// will not wait.
observation := &Observation{
Check: mock.call,
Attempts: test.attempts,
Timeout: test.timeout,
Backoff: 0,
Check: mock.call,
Attempts: test.attempts,
Timeout: test.timeout,
Backoff: 0,
OnSuccess: noOpCallback,
OnFailure: noOpCallback,
}
quit := make(chan struct{})
@ -238,3 +242,187 @@ func TestRetryCheck(t *testing.T) {
})
}
}
// TestCallbacks verifies that we fire the OnSuccess/OnFailure callbacks
// as expected.
//
//   - When the health check succeeds, the OnSuccess callback should fire.
//   - When the failure threshold is reached, the OnFailure callback should fire.
func TestCallbacks(t *testing.T) {
	intervalTicker := ticker.NewForce(time.Hour)
	mock := newMockCheck(t)

	failureThreshold := 3

	successChan := make(chan struct{})
	failChan := make(chan struct{})
	shutdown := make(chan struct{})

	// Create our config for monitoring. We will use a 0 back off so that
	// our test does not need to wait.
	observation := &Observation{
		Check:    mock.call,
		Interval: intervalTicker,
		Attempts: failureThreshold,
		Backoff:  0,
		Timeout:  time.Hour,
		OnSuccess: func() {
			// This callback hands off to the test goroutine over
			// an unbuffered channel, so it runs on a different
			// goroutine. t.Fatal may only be called from the test
			// goroutine, so we report with t.Error instead.
			select {
			case successChan <- struct{}{}:
			case <-time.After(timeout):
				t.Error("unable to fire onSuccess callback")
			}
		},
		OnFailure: func() {
			close(failChan)
		},
	}

	cfg := &Config{
		Checks: []*Observation{observation},
		Shutdown: func(string, ...interface{}) {
			shutdown <- struct{}{}
		},
	}

	monitor := NewMonitor(cfg)
	require.NoError(t, monitor.Start(), "could not start monitor")

	// Tick is a helper we will use to tick our interval.
	tick := func() {
		select {
		case intervalTicker.Force <- testTime:
		case <-time.After(timeout):
			t.Fatal("could not tick timer")
		}
	}

	// We expect that the onSuccess callback is fired after each successful
	// check.
	for i := 0; i < failureThreshold; i++ {
		tick()
		mock.sendError(nil)

		// We expect that the onSuccess callback will have fired.
		select {
		case <-successChan:
		case <-time.After(timeout):
			t.Fatal("expected success callback")
		}
	}

	// Kick off another health check iteration. The monitor's internal
	// retry mechanism will re-attempt the check until it has reached
	// the configured maximum # of attempts.
	//
	// This mocks our check function failing the maximum # of times
	// consecutively that it is allowed.
	tick()
	for i := 1; i <= failureThreshold; i++ {
		mock.sendError(errNonNil)

		// Verify that the onFailure callback does not fire unless
		// the failure threshold (maximum # of attempts) is reached.
		if i < failureThreshold {
			select {
			case <-failChan:
				t.Fatal("unexpected onFailure callback")
			default:
			}
		}
	}

	// After reaching the failure threshold for this health check,
	// we expect that the onFailure callback will have fired.
	select {
	case <-failChan:
	case <-time.After(timeout):
		t.Fatal("expected onFailure callback")
	}

	// Since we have failed within our allowed number of retries, we now
	// expect a call to our shutdown function.
	select {
	case <-shutdown:
	case <-time.After(timeout):
		t.Fatal("expected shutdown")
	}

	require.NoError(t, monitor.Stop(), "could not stop monitor")
}
// TestDynamicChecks verifies that we actually kick off health check routines
// for observations that are added after starting the monitor.
func TestDynamicChecks(t *testing.T) {
	intervalTicker := ticker.NewForce(time.Hour)
	mock := newMockCheck(t)

	successChan := make(chan struct{})
	shutdown := make(chan struct{})

	// Don't configure any health checks for this monitor.
	// We'd like to verify that we can add checks after startup.
	cfg := &Config{
		Checks: []*Observation{},
		Shutdown: func(string, ...interface{}) {
			shutdown <- struct{}{}
		},
	}

	monitor := NewMonitor(cfg)
	require.NoError(t, monitor.Start(), "could not start monitor")

	// Tick is a helper we will use to tick our interval.
	tick := func() {
		select {
		case intervalTicker.Force <- testTime:
		case <-time.After(timeout):
			t.Fatal("could not tick timer")
		}
	}

	observation := &Observation{
		Check:    mock.call,
		Interval: intervalTicker,
		Attempts: 2,
		Backoff:  0,
		Timeout:  time.Hour,
		OnSuccess: func() {
			// This callback hands off to the test goroutine over
			// an unbuffered channel, so it runs on a different
			// goroutine. t.Fatal may only be called from the test
			// goroutine, so we report with t.Error instead.
			select {
			case successChan <- struct{}{}:
			case <-time.After(timeout):
				t.Error("unable to fire onSuccess callback")
			}
		},
		OnFailure: noOpCallback,
	}

	// Add the check after having started the monitor.
	err := monitor.AddCheck(observation)
	require.NoError(t, err, "could not add new observation")

	// This should initiate the check we dynamically added above.
	tick()

	// Verify that we can fire the OnSuccess callback.
	mock.sendError(errNonNil)
	mock.sendError(nil)

	select {
	case <-successChan:
	case <-time.After(timeout):
		t.Fatal("expected success callback")
	}

	// Verify that we correctly shutdown if the added health check fails.
	tick()
	mock.sendError(errNonNil)
	mock.sendError(errNonNil)

	// Since we have failed within our allowed number of retries, we now
	// expect a call to our shutdown function.
	select {
	case <-shutdown:
	case <-time.After(timeout):
		t.Fatal("expected shutdown")
	}

	require.NoError(t, monitor.Stop(), "could not stop monitor")
}