diff --git a/docs/release-notes/release-notes-0.18.0.md b/docs/release-notes/release-notes-0.18.0.md index 7e9514d0d..3e27c5247 100644 --- a/docs/release-notes/release-notes-0.18.0.md +++ b/docs/release-notes/release-notes-0.18.0.md @@ -205,6 +205,9 @@ for blinded path payloads to allow fuzzing before LND fully supports blinded payment relay. +* Allow `healthcheck` package users to provide [custom callbacks](https://github.com/lightningnetwork/lnd/pull/8504) + which will execute whenever a healthcheck succeeds/fails. + ### Logging * [Add the htlc amount](https://github.com/lightningnetwork/lnd/pull/8156) to contract court logs in case of timed-out htlcs in order to easily spot dust diff --git a/healthcheck/healthcheck.go b/healthcheck/healthcheck.go index e72d65620..e15cbd4cb 100644 --- a/healthcheck/healthcheck.go +++ b/healthcheck/healthcheck.go @@ -15,6 +15,8 @@ import ( "github.com/lightningnetwork/lnd/ticker" ) +var noOpCallback = func() {} + // Config contains configuration settings for our monitor. type Config struct { // Checks is a set of health checks that assert that lnd has access to @@ -96,6 +98,19 @@ func (m *Monitor) Stop() error { return nil } +// AddCheck adds a new healthcheck to our monitor. +func (m *Monitor) AddCheck(check *Observation) error { + + m.wg.Add(1) + go func(check *Observation) { + defer m.wg.Done() + + check.monitor(m.cfg.Shutdown, m.quit) + }(check) + + return nil +} + // CreateCheck is a helper function that takes a function that produces an error // and wraps it in a function that returns its result on an error channel. // We do not wait group the goroutine running our checkFunc because we expect @@ -137,13 +152,42 @@ type Observation struct { // Backoff is the amount of time we back off between retries for failed // checks. Backoff time.Duration + + // OnSuccess is a callback which will be executed when the healthcheck + // succeeds. This is optional. 
+ OnSuccess func() + + // OnFailure is a callback which will be executed when the healthcheck + // fails. This is optional. + OnFailure func() +} + +// ObservationOption describes the signature of a functional option that can be +// used to modify the behaviour of an Observation. +type ObservationOption func(*Observation) + +// WithSuccessCallback configures an observation with a callback to be fired +// whenever the health check succeeds. +func WithSuccessCallback(callback func()) ObservationOption { + return func(o *Observation) { + o.OnSuccess = callback + } +} + +// WithFailureCallback configures an observation with a callback to be fired +// whenever the health check reaches its failure threshold. +func WithFailureCallback(callback func()) ObservationOption { + return func(o *Observation) { + o.OnFailure = callback + } } // NewObservation creates an observation. -func NewObservation(name string, check func() error, interval, - timeout, backoff time.Duration, attempts int) *Observation { +func NewObservation(name string, check func() error, interval, timeout, + backoff time.Duration, attempts int, + opts ...ObservationOption) *Observation { - return &Observation{ + observation := &Observation{ Name: name, Check: CreateCheck(check), Interval: ticker.New(interval), @@ -151,6 +195,22 @@ func NewObservation(name string, check func() error, interval, Timeout: timeout, Backoff: backoff, } + + // Apply each option to the observation. + for _, opt := range opts { + opt(observation) + } + + // Ensure that we default to NO-OP callbacks. + if observation.OnSuccess == nil { + observation.OnSuccess = noOpCallback + } + + if observation.OnFailure == nil { + observation.OnFailure = noOpCallback + } + + return observation } // String returns a string representation of an observation. 
@@ -206,6 +266,17 @@ func (o *Observation) retryCheck(quit chan struct{}, var err error select { case err = <-o.Check(): + // If our error is nil, we have passed our health check, + // so we'll invoke our success callback if defined and + // then exit. + if err == nil { + log.Debug("invoking success callback") + + // Invoke the success callback. + o.OnSuccess() + + return false + } case <-time.After(o.Timeout): err = fmt.Errorf("health check: %v timed out after: "+ @@ -216,15 +287,14 @@ func (o *Observation) retryCheck(quit chan struct{}, return false } - // If our error is nil, we have passed our health check, so we - // can exit. - if err == nil { - return false - } - // If we have reached our allowed number of attempts, this - // check has failed so we request shutdown. + // check has failed so we'll fire the on failure callback + // and request shutdown. if count == o.Attempts { + log.Debug("invoking failure callback") + + o.OnFailure() + shutdown("Health check: %v failed after %v "+ "calls", o, o.Attempts) return true diff --git a/healthcheck/healthcheck_test.go b/healthcheck/healthcheck_test.go index b198e60a5..cb1a5952c 100644 --- a/healthcheck/healthcheck_test.go +++ b/healthcheck/healthcheck_test.go @@ -55,11 +55,13 @@ func TestMonitor(t *testing.T) { cfg := &Config{ Checks: []*Observation{ { - Check: mock.call, - Interval: intervalTicker, - Attempts: 2, - Backoff: 0, - Timeout: time.Hour, + Check: mock.call, + Interval: intervalTicker, + Attempts: 2, + Backoff: 0, + Timeout: time.Hour, + OnSuccess: noOpCallback, + OnFailure: noOpCallback, }, }, Shutdown: func(string, ...interface{}) { @@ -202,10 +204,12 @@ func TestRetryCheck(t *testing.T) { // function. We set a zero back off so that the test // will not wait. 
observation := &Observation{ - Check: mock.call, - Attempts: test.attempts, - Timeout: test.timeout, - Backoff: 0, + Check: mock.call, + Attempts: test.attempts, + Timeout: test.timeout, + Backoff: 0, + OnSuccess: noOpCallback, + OnFailure: noOpCallback, } quit := make(chan struct{}) @@ -238,3 +242,187 @@ func TestRetryCheck(t *testing.T) { }) } } + +// TestCallbacks verifies that we fire the OnSuccess/OnFailure callbacks +// as expected. +// +// - When the health check succeeds, the OnSuccess callback should fire. +// - When the failure threshold is reached, the OnFailure callback should fire. +func TestCallbacks(t *testing.T) { + intervalTicker := ticker.NewForce(time.Hour) + + mock := newMockCheck(t) + failureThreshold := 3 + + successChan := make(chan struct{}) + failChan := make(chan struct{}) + shutdown := make(chan struct{}) + + // Create our config for monitoring. We will use a 0 back off so that + // our test does not need to wait. + observation := &Observation{ + Check: mock.call, + Interval: intervalTicker, + Attempts: failureThreshold, + Backoff: 0, + Timeout: time.Hour, + OnSuccess: func() { + select { + case successChan <- struct{}{}: + case <-time.After(timeout): + t.Fatal("unable to fire onSuccess callback") + } + }, + OnFailure: func() { + close(failChan) + }, + } + + cfg := &Config{ + Checks: []*Observation{observation}, + Shutdown: func(string, ...interface{}) { + shutdown <- struct{}{} + }, + } + monitor := NewMonitor(cfg) + require.NoError(t, monitor.Start(), "could not start monitor") + + // Tick is a helper we will use to tick our interval. + tick := func() { + select { + case intervalTicker.Force <- testTime: + case <-time.After(timeout): + t.Fatal("could not tick timer") + } + } + + // We expect that the onSuccess callback is fired after each successful + // check. + for i := 0; i < failureThreshold; i++ { + tick() + mock.sendError(nil) + + // We expect that the onSuccess callback will have fired. 
+ select { + case <-successChan: + case <-time.After(timeout): + t.Fatal("expected success callback") + } + + } + + // Kick off another health check iteration. The monitor's internal + // retry mechanism will re-attempt the check until it has reached + // the configured maximum # of attempts. + // + // This mocks our check function failing the maximum # of times + // consecutively that it is allowed. + tick() + for i := 1; i <= failureThreshold; i++ { + mock.sendError(errNonNil) + + // Verify that the onFailure callback does not fire unless + // the failure threshold (maximum # of attempts) is reached. + if i < failureThreshold { + select { + case <-failChan: + t.Fatal("unexpected onFailure callback") + default: + } + } + } + + // After reaching the failure threshold for this health check, + // we expect that the onFailure callback will have fired. + select { + case <-failChan: + case <-time.After(timeout): + t.Fatal("expected onFailure callback") + } + + // Since we have failed within our allowed number of retries, we now + // expect a call to our shutdown function. + select { + case <-shutdown: + case <-time.After(timeout): + t.Fatal("expected shutdown") + } + require.NoError(t, monitor.Stop(), "could not stop monitor") +} + +// TestDynamicChecks verifies that we actually kick off health check routines +// for observations that are added after starting the monitor. +func TestDynamicChecks(t *testing.T) { + intervalTicker := ticker.NewForce(time.Hour) + + mock := newMockCheck(t) + + successChan := make(chan struct{}) + shutdown := make(chan struct{}) + + // Don't configure any health checks for this monitor. + // We'd like to verify that we can add checks after startup. + cfg := &Config{ + Checks: []*Observation{}, + Shutdown: func(string, ...interface{}) { + shutdown <- struct{}{} + }, + } + monitor := NewMonitor(cfg) + require.NoError(t, monitor.Start(), "could not start monitor") + + // Tick is a helper we will use to tick our interval. 
+ tick := func() { + select { + case intervalTicker.Force <- testTime: + case <-time.After(timeout): + t.Fatal("could not tick timer") + } + } + + observation := &Observation{ + Check: mock.call, + Interval: intervalTicker, + Attempts: 2, + Backoff: 0, + Timeout: time.Hour, + OnSuccess: func() { + select { + case successChan <- struct{}{}: + case <-time.After(timeout): + t.Fatal("unable to fire onSuccess callback") + } + }, + OnFailure: noOpCallback, + } + + // Add the check after having started the monitor. + err := monitor.AddCheck(observation) + require.NoError(t, err, "could not add new observation") + + // This should initiate the check we dynamically added above. + tick() + + // Verify that we can fire the OnSuccess callback. + mock.sendError(errNonNil) + mock.sendError(nil) + select { + case <-successChan: + case <-time.After(timeout): + t.Fatal("expected success callback") + } + + // Verify that we correctly shut down if the added health check fails. + tick() + mock.sendError(errNonNil) + mock.sendError(errNonNil) + + // Since we have failed within our allowed number of retries, we now + // expect a call to our shutdown function. + select { + case <-shutdown: + case <-time.After(timeout): + t.Fatal("expected shutdown") + } + require.NoError(t, monitor.Stop(), "could not stop monitor") +}