Merge pull request #8504 from calvinrzachman/healthcheck

lnd/healthcheck: add checks after initialization + success/failure callbacks
2025-02-22 14:22:37 +01:00 · 2024-03-13 20:20:39 +08:00 · 2024-03-13 20:20:39 +08:00 · d9887f3212
commit d9887f3212
parent 1fd6bc870b add2b2957b
3 changed files with 280 additions and 19 deletions
--- a/docs/release-notes/release-notes-0.18.0.md
+++ b/docs/release-notes/release-notes-0.18.0.md
@ -205,6 +205,9 @@
  for blinded path payloads to allow fuzzing before LND fully supports 
  blinded payment relay.
 * Allow `healthcheck` package users to provide [custom callbacks](https://github.com/lightningnetwork/lnd/pull/8504)
  which will execute whenever a healthcheck succeeds or reaches its failure
  threshold, and to add new checks after the monitor has been started.
 ### Logging
 * [Add the htlc amount](https://github.com/lightningnetwork/lnd/pull/8156) to
  contract court logs in case of timed-out htlcs in order to easily spot dust
--- a/healthcheck/healthcheck.go
+++ b/healthcheck/healthcheck.go
@ -15,6 +15,8 @@ import (
 	"github.com/lightningnetwork/lnd/ticker"
 )
 // noOpCallback is a callback that does nothing. NewObservation installs it
 // for any OnSuccess/OnFailure callback that the caller did not provide.
 var noOpCallback = func() {}
 // Config contains configuration settings for our monitor.
 type Config struct {
 	// Checks is a set of health checks that assert that lnd has access to
@ -96,6 +98,19 @@ func (m *Monitor) Stop() error {
 	return nil
 }
 // AddCheck adds a new healthcheck to our monitor and immediately starts
 // monitoring it in its own goroutine, which is tracked through m.wg.
 //
 // Observations may be built as plain struct literals rather than through
 // NewObservation, so any callback left unset is replaced with a no-op here;
 // the retry loop invokes the callbacks unconditionally and would otherwise
 // panic on a nil function. A nil observation is rejected with an error
 // instead of being started.
 func (m *Monitor) AddCheck(check *Observation) error {
 	if check == nil {
 		return fmt.Errorf("cannot add nil health check")
 	}
 	// Default the optional callbacks before the goroutine is launched so
 	// that these writes never race with the monitoring loop.
 	if check.OnSuccess == nil {
 		check.OnSuccess = func() {}
 	}
 	if check.OnFailure == nil {
 		check.OnFailure = func() {}
 	}
 	m.wg.Add(1)
 	go func(check *Observation) {
 		defer m.wg.Done()
 		check.monitor(m.cfg.Shutdown, m.quit)
 	}(check)
 	return nil
 }
 // CreateCheck is a helper function that takes a function that produces an error
 // and wraps it in a function that returns its result on an error channel.
 // We do not wait group the goroutine running our checkFunc because we expect
@ -137,13 +152,42 @@ type Observation struct {
 	// Backoff is the amount of time we back off between retries for failed
 	// checks.
 	Backoff time.Duration
 	// OnSuccess is a callback which will be executed when the healthcheck
 	// succeeds. This is optional.
 	OnSuccess func()
 	// OnFailure is a callback which will be executed once the healthcheck
 	// has failed its maximum number of attempts, just before shutdown is
 	// requested. This is optional.
 	OnFailure func()
 }
 // ObservationOption describes the signature of a functional option that can be
 // used to modify the behaviour of an Observation. NewObservation applies the
 // options it is given, in order, to the freshly built Observation before it
 // defaults any callbacks that are still unset.
 type ObservationOption func(*Observation)
 // WithSuccessCallback returns an option that installs callback as the
 // observation's OnSuccess hook, which is invoked each time the health check
 // returns without an error.
 func WithSuccessCallback(callback func()) ObservationOption {
 	setOnSuccess := func(obs *Observation) {
 		obs.OnSuccess = callback
 	}
 	return setOnSuccess
 }
 // WithFailureCallback returns an option that installs callback as the
 // observation's OnFailure hook, which is invoked once the health check has
 // used up its allowed number of attempts.
 func WithFailureCallback(callback func()) ObservationOption {
 	setOnFailure := func(obs *Observation) {
 		obs.OnFailure = callback
 	}
 	return setOnFailure
 }
 // NewObservation creates an observation.
-func NewObservation(name string, check func() error, interval,
+func NewObservation(name string, check func() error, interval, timeout,
-	timeout, backoff time.Duration, attempts int) *Observation {
+	backoff time.Duration, attempts int,
 	opts ...ObservationOption) *Observation {
-	return &Observation{
+	observation := &Observation{
 		Name:     name,
 		Check:    CreateCheck(check),
 		Interval: ticker.New(interval),
@ -151,6 +195,22 @@ func NewObservation(name string, check func() error, interval,
 		Timeout:  timeout,
 		Backoff:  backoff,
 	}
 	// Apply each option to the observation.
 	for _, opt := range opts {
 		opt(observation)
 	}
 	// Ensure that we default to NO-OP callbacks.
 	if observation.OnSuccess == nil {
 		observation.OnSuccess = noOpCallback
 	}
 	if observation.OnFailure == nil {
 		observation.OnFailure = noOpCallback
 	}
 	return observation
 }
 // String returns a string representation of an observation.
@ -206,6 +266,17 @@ func (o *Observation) retryCheck(quit chan struct{},
 		var err error
 		select {
 		case err = <-o.Check():
 			// If our error is nil, we have passed our health check,
 			// so we'll invoke our success callback if defined and
 			// then exit.
 			if err == nil {
 				log.Debug("invoking success callback")
 				// Invoke the success callback.
 				o.OnSuccess()
 				return false
 			}
 		case <-time.After(o.Timeout):
 			err = fmt.Errorf("health check: %v timed out after: "+
@ -216,15 +287,14 @@ func (o *Observation) retryCheck(quit chan struct{},
 			return false
 		}
 		// If our error is nil, we have passed our health check, so we
 		// can exit.
 		if err == nil {
 			return false
 		}
 		// If we have reached our allowed number of attempts, this
-		// check has failed so we request shutdown.
+		// check has failed so we'll fire the on failure callback
 		// and request shutdown.
 		if count == o.Attempts {
 			log.Debug("invoking failure callback")
 			o.OnFailure()
 			shutdown("Health check: %v failed after %v "+
 				"calls", o, o.Attempts)
 			return true
--- a/healthcheck/healthcheck_test.go
+++ b/healthcheck/healthcheck_test.go
@ -55,11 +55,13 @@ func TestMonitor(t *testing.T) {
 	cfg := &Config{
 		Checks: []*Observation{
 			{
-				Check:    mock.call,
+				Check:     mock.call,
-				Interval: intervalTicker,
+				Interval:  intervalTicker,
-				Attempts: 2,
+				Attempts:  2,
-				Backoff:  0,
+				Backoff:   0,
-				Timeout:  time.Hour,
+				Timeout:   time.Hour,
 				OnSuccess: noOpCallback,
 				OnFailure: noOpCallback,
 			},
 		},
 		Shutdown: func(string, ...interface{}) {
@ -202,10 +204,12 @@ func TestRetryCheck(t *testing.T) {
 			// function. We set a zero back off so that the test
 			// will not wait.
 			observation := &Observation{
-				Check:    mock.call,
+				Check:     mock.call,
-				Attempts: test.attempts,
+				Attempts:  test.attempts,
-				Timeout:  test.timeout,
+				Timeout:   test.timeout,
-				Backoff:  0,
+				Backoff:   0,
 				OnSuccess: noOpCallback,
 				OnFailure: noOpCallback,
 			}
 			quit := make(chan struct{})
@ -238,3 +242,187 @@ func TestRetryCheck(t *testing.T) {
 		})
 	}
 }
 // TestCallbacks verifies that we fire the OnSuccess/OnFailure callbacks
 // as expected.
 //
 // - When the health check succeeds, the OnSuccess callback should fire.
 // - When the failure threshold is reached, the OnFailure callback should fire.
 func TestCallbacks(t *testing.T) {
 	intervalTicker := ticker.NewForce(time.Hour)
 	mock := newMockCheck(t)
 	failureThreshold := 3
 	successChan := make(chan struct{})
 	failChan := make(chan struct{})
 	shutdown := make(chan struct{})
 	// Create our config for monitoring. We will use a 0 back off so that
 	// our test does not need to wait.
 	observation := &Observation{
 		Check:    mock.call,
 		Interval: intervalTicker,
 		Attempts: failureThreshold,
 		Backoff:  0,
 		Timeout:  time.Hour,
 		OnSuccess: func() {
 			// The test goroutine is the receiver of successChan,
 			// so this callback runs on a different goroutine. We
 			// therefore report with t.Error: t.Fatal may only be
 			// called from the goroutine running the test.
 			select {
 			case successChan <- struct{}{}:
 			case <-time.After(timeout):
 				t.Error("unable to fire onSuccess callback")
 			}
 		},
 		OnFailure: func() {
 			// Closing (rather than sending) lets the test observe
 			// the failure without this callback ever blocking.
 			close(failChan)
 		},
 	}
 	cfg := &Config{
 		Checks: []*Observation{observation},
 		Shutdown: func(string, ...interface{}) {
 			shutdown <- struct{}{}
 		},
 	}
 	monitor := NewMonitor(cfg)
 	require.NoError(t, monitor.Start(), "could not start monitor")
 	// Tick is a helper we will use to tick our interval.
 	tick := func() {
 		select {
 		case intervalTicker.Force <- testTime:
 		case <-time.After(timeout):
 			t.Fatal("could not tick timer")
 		}
 	}
 	// We expect that the onSuccess callback is fired after each successful
 	// check.
 	for i := 0; i < failureThreshold; i++ {
 		tick()
 		mock.sendError(nil)
 		// We expect that the onSuccess callback will have fired.
 		select {
 		case <-successChan:
 		case <-time.After(timeout):
 			t.Fatal("expected success callback")
 		}
 	}
 	// Kick off another health check iteration. The monitor's internal
 	// retry mechanism will re-attempt the check until it has reached
 	// the configured maximum # of attempts.
 	//
 	// This mocks our check function failing the maximum # of times
 	// consecutively that it is allowed.
 	tick()
 	for i := 1; i <= failureThreshold; i++ {
 		mock.sendError(errNonNil)
 		// Verify that the onFailure callback does not fire unless
 		// the failure threshold (maximum # of attempts) is reached.
 		if i < failureThreshold {
 			select {
 			case <-failChan:
 				t.Fatal("unexpected onFailure callback")
 			default:
 			}
 		}
 	}
 	// After reaching the failure threshold for this health check,
 	// we expect that the onFailure callback will have fired.
 	select {
 	case <-failChan:
 	case <-time.After(timeout):
 		t.Fatal("expected onFailure callback")
 	}
 	// Since we have failed within our allowed number of retries, we now
 	// expect a call to our shutdown function.
 	select {
 	case <-shutdown:
 	case <-time.After(timeout):
 		t.Fatal("expected shutdown")
 	}
 	require.NoError(t, monitor.Stop(), "could not stop monitor")
 }
 // TestDynamicChecks verifies that we actually kick off health check routines
 // for observations that are added after starting the monitor.
 func TestDynamicChecks(t *testing.T) {
 	intervalTicker := ticker.NewForce(time.Hour)
 	mock := newMockCheck(t)
 	successChan := make(chan struct{})
 	shutdown := make(chan struct{})
 	// Don't configure any health checks for this monitor.
 	// We'd like to verify that we can add checks after startup.
 	cfg := &Config{
 		Checks: []*Observation{},
 		Shutdown: func(string, ...interface{}) {
 			shutdown <- struct{}{}
 		},
 	}
 	monitor := NewMonitor(cfg)
 	require.NoError(t, monitor.Start(), "could not start monitor")
 	// Tick is a helper we will use to tick our interval.
 	tick := func() {
 		select {
 		case intervalTicker.Force <- testTime:
 		case <-time.After(timeout):
 			t.Fatal("could not tick timer")
 		}
 	}
 	observation := &Observation{
 		Check:    mock.call,
 		Interval: intervalTicker,
 		Attempts: 2,
 		Backoff:  0,
 		Timeout:  time.Hour,
 		OnSuccess: func() {
 			// This callback is invoked from the goroutine that
 			// AddCheck starts, not from the test goroutine, so we
 			// report with t.Error: t.Fatal may only be called
 			// from the goroutine running the test.
 			select {
 			case successChan <- struct{}{}:
 			case <-time.After(timeout):
 				t.Error("unable to fire onSuccess callback")
 			}
 		},
 		OnFailure: noOpCallback,
 	}
 	// Add the check after having started the monitor.
 	err := monitor.AddCheck(observation)
 	require.NoError(t, err, "could not add new observation")
 	// This should initiate the check we dynamically added above.
 	tick()
 	// Verify that we can fire the OnSuccess callback. The first attempt
 	// fails and the retry succeeds, which stays within the two allowed
 	// attempts.
 	mock.sendError(errNonNil)
 	mock.sendError(nil)
 	select {
 	case <-successChan:
 	case <-time.After(timeout):
 		t.Fatal("expected success callback")
 	}
 	// Verify that we correctly shutdown if the added health check fails.
 	tick()
 	mock.sendError(errNonNil)
 	mock.sendError(errNonNil)
 	// Since we have failed within our allowed number of retries, we now
 	// expect a call to our shutdown function.
 	select {
 	case <-shutdown:
 	case <-time.After(timeout):
 		t.Fatal("expected shutdown")
 	}
 	require.NoError(t, monitor.Stop(), "could not stop monitor")
 }