package healthcheck

import (
	"errors"
	"testing"
	"time"

	"github.com/lightningnetwork/lnd/ticker"
	"github.com/stretchr/testify/require"
)

var (
	errNonNil = errors.New("non-nil test error")
	timeout   = time.Second
	testTime  = time.Unix(1, 2)
)

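// mockedCheck is a mocked health check. The test drives it by sending errors
// (or nil) into errChan, which the check hands back to the monitor under
// test.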
type mockedCheck struct {
	t       *testing.T
	errChan chan error
}

// newMockCheck creates a new mock.
func newMockCheck(t *testing.T) *mockedCheck {
	return &mockedCheck{
		t:       t,
		errChan: make(chan error),
	}
}

// call returns our mock's error channel, which we can send responses on.
func (m *mockedCheck) call() chan error {
	return m.errChan
}

// sendError sends an error into our mock's error channel, mocking the sending
// of a response from our check function.
func (m *mockedCheck) sendError(err error) {
	select {
	case m.errChan <- err:
	case <-time.After(timeout):
		m.t.Fatalf("could not send error: %v", err)
	}
}

// TestMonitor tests creation and triggering of a monitor with a health check.
func TestMonitor(t *testing.T) {
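	// Use a force ticker so that the test can trigger interval ticks on
	// demand through its Force channel instead of waiting out the hour.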
	intervalTicker := ticker.NewForce(time.Hour)

	mock := newMockCheck(t)
	shutdown := make(chan struct{})

	// Create our config for monitoring. We will use a zero backoff so
	// that our test does not need to wait.
	cfg := &Config{
		Checks: []*Observation{
			{
				Check:     mock.call,
				Interval:  intervalTicker,
				Attempts:  2,
				Backoff:   0,
				Timeout:   time.Hour,
				OnSuccess: noOpCallback,
				OnFailure: noOpCallback,
			},
		},
		Shutdown: func(string, ...interface{}) {
			shutdown <- struct{}{}
		},
	}
	monitor := NewMonitor(cfg)

	require.NoError(t, monitor.Start(), "could not start monitor")

	// Tick is a helper we will use to tick our interval.
	tick := func() {
		select {
		case intervalTicker.Force <- testTime:
		case <-time.After(timeout):
			t.Fatal("could not tick timer")
		}
	}

	// Tick our timer and provide our error channel with a nil error. This
	// mocks our check function succeeding on the first call.
	tick()
	mock.sendError(nil)

	// Now we tick our timer again. This time we send a non-nil error
	// followed by a nil error. This exercises our retry logic: since we
	// allow 2 attempts, the check should recover without shutting down.
	tick()
	mock.sendError(errNonNil)
	mock.sendError(nil)

	// Finally, we tick our timer once more, and send two non-nil errors
	// into our error channel. This mocks our check function failing twice.
	tick()
	mock.sendError(errNonNil)
	mock.sendError(errNonNil)

	// Since the check has failed for every allowed attempt, we now expect
	// a call to our shutdown function.
	select {
	case <-shutdown:
	case <-time.After(timeout):
		t.Fatal("expected shutdown")
	}

	require.NoError(t, monitor.Stop(), "could not stop monitor")
}

// TestRetryCheck tests our retry logic. It does not include a test for exiting
// during the back off period.
func TestRetryCheck(t *testing.T) {
	tests := []struct {
		name string

		// errors provides an in-order list of errors that we expect our
		// health check to respond with. The number of errors in this
		// list indicates the number of times we expect our check to
		// be called, because our test will fail if we do not consume
		// every error.
		errors []error

		// attempts is the number of times we call a check before
		// failing.
		attempts int

		// timeout is the time we allow our check to take before we
		// fail it.
		timeout time.Duration

		// expectedShutdown is true if we expect a shutdown to be
		// triggered because all of our calls failed.
		expectedShutdown bool

		// maxAttemptsReached specifies whether the max allowed
		// attempts are reached from calling retryCheck.
		maxAttemptsReached bool
	}{
		{
			name:               "first call succeeds",
			errors:             []error{nil},
			attempts:           2,
			timeout:            time.Hour,
			expectedShutdown:   false,
			maxAttemptsReached: false,
		},
		{
			name:               "first call fails",
			errors:             []error{errNonNil},
			attempts:           1,
			timeout:            time.Hour,
			expectedShutdown:   true,
			maxAttemptsReached: true,
		},
		{
			name:               "fail then recover",
			errors:             []error{errNonNil, nil},
			attempts:           2,
			timeout:            time.Hour,
			expectedShutdown:   false,
			maxAttemptsReached: false,
		},
		{
			name:               "always fail",
			errors:             []error{errNonNil, errNonNil},
			attempts:           2,
			timeout:            time.Hour,
			expectedShutdown:   true,
			maxAttemptsReached: true,
		},
		{
			name:               "no calls",
			errors:             nil,
			attempts:           0,
			timeout:            time.Hour,
			expectedShutdown:   false,
			maxAttemptsReached: false,
		},
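		// This case never answers the check and allows it only a one
		// nanosecond timeout, so its single attempt times out and a
		// shutdown is expected.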
		{
			name:               "call times out",
			errors:             nil,
			attempts:           1,
			timeout:            1,
			expectedShutdown:   true,
			maxAttemptsReached: true,
		},
	}

	for _, test := range tests {
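		// Pin the loop variable so that the subtest closure captures
		// this iteration's value.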
		test := test

		t.Run(test.name, func(t *testing.T) {
			var shutdown bool
			shutdownFunc := func(string, ...interface{}) {
				shutdown = true
			}

			mock := newMockCheck(t)

			// Create an observation that uses our mocked check
			// function. We set a zero backoff so that the test
			// will not wait.
			observation := &Observation{
				Check:     mock.call,
				Attempts:  test.attempts,
				Timeout:   test.timeout,
				Backoff:   0,
				OnSuccess: noOpCallback,
				OnFailure: noOpCallback,
			}
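			// quit is never closed in this test, so the early-exit
			// path of retryCheck is not exercised here.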
			quit := make(chan struct{})

			// Run our retry check in a goroutine because it blocks
			// on us sending errors into the mocked caller's error
			// channel.
			done := make(chan struct{})
			retryResult := false
			go func() {
				retryResult = observation.retryCheck(
					quit, shutdownFunc,
				)
				close(done)
			}()

			// Prompt our mock caller to send responses for calls
			// to our call function.
			for _, err := range test.errors {
				mock.sendError(err)
			}

			// Make sure that we have finished running our retry
			// check function before we start checking results.
			<-done

			require.Equal(t, test.maxAttemptsReached, retryResult,
				"retryCheck returned unexpected result")
			require.Equal(t, test.expectedShutdown, shutdown,
				"unexpected shutdown state")
		})
	}
}

// TestCallbacks verifies that we fire the OnSuccess/OnFailure callbacks
// as expected.
//
// - When the health check succeeds, the OnSuccess callback should fire.
// - When the failure threshold is reached, the OnFailure callback should fire.
func TestCallbacks(t *testing.T) {
	intervalTicker := ticker.NewForce(time.Hour)

	mock := newMockCheck(t)
	failureThreshold := 3

	successChan := make(chan struct{})
	failChan := make(chan struct{})
	shutdown := make(chan struct{})

	// Create our observation. We will use a zero backoff so that our test
	// does not need to wait.
	observation := &Observation{
		Check:    mock.call,
		Interval: intervalTicker,
		Attempts: failureThreshold,
		Backoff:  0,
		Timeout:  time.Hour,
		OnSuccess: func() {
			select {
			case successChan <- struct{}{}:
			case <-time.After(timeout):
				t.Fatal("unable to fire onSuccess callback")
			}
		},
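		// OnFailure closes failChan rather than sending on it, so the
		// test will panic if this callback ever fires more than once.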
		OnFailure: func() {
			close(failChan)
		},
	}

	cfg := &Config{
		Checks: []*Observation{observation},
		Shutdown: func(string, ...interface{}) {
			shutdown <- struct{}{}
		},
	}
	monitor := NewMonitor(cfg)
	require.NoError(t, monitor.Start(), "could not start monitor")

	// Tick is a helper we will use to tick our interval.
	tick := func() {
		select {
		case intervalTicker.Force <- testTime:
		case <-time.After(timeout):
			t.Fatal("could not tick timer")
		}
	}

	// We expect that the onSuccess callback is fired after each successful
	// check.
	for i := 0; i < failureThreshold; i++ {
		tick()
		mock.sendError(nil)

		// We expect that the onSuccess callback will have fired.
		select {
		case <-successChan:
		case <-time.After(timeout):
			t.Fatal("expected success callback")
		}
	}

	// Kick off another health check iteration. The monitor's internal
	// retry mechanism will re-attempt the check until it has reached
	// the configured maximum # of attempts.
	//
	// This mocks our check function failing consecutively for the
	// maximum # of attempts it is allowed.
	tick()
	for i := 1; i <= failureThreshold; i++ {
		mock.sendError(errNonNil)

		// Verify that the onFailure callback does not fire unless
		// the failure threshold (maximum # of attempts) is reached.
		if i < failureThreshold {
			select {
			case <-failChan:
				t.Fatal("unexpected onFailure callback")
			default:
			}
		}
	}

	// After reaching the failure threshold for this health check,
	// we expect that the onFailure callback will have fired.
	select {
	case <-failChan:
	case <-time.After(timeout):
		t.Fatal("expected onFailure callback")
	}

	// Since the check has failed for every allowed attempt, we now expect
	// a call to our shutdown function.
	select {
	case <-shutdown:
	case <-time.After(timeout):
		t.Fatal("expected shutdown")
	}
	require.NoError(t, monitor.Stop(), "could not stop monitor")
}

// TestDynamicChecks verifies that we actually kick off health check routines
// for observations that are added after starting the monitor.
func TestDynamicChecks(t *testing.T) {
	intervalTicker := ticker.NewForce(time.Hour)

	mock := newMockCheck(t)

	successChan := make(chan struct{})
	shutdown := make(chan struct{})

	// Don't configure any health checks for this monitor.
	// We'd like to verify that we can add checks after startup.
	cfg := &Config{
		Checks: []*Observation{},
		Shutdown: func(string, ...interface{}) {
			shutdown <- struct{}{}
		},
	}
	monitor := NewMonitor(cfg)
	require.NoError(t, monitor.Start(), "could not start monitor")

	// Tick is a helper we will use to tick our interval.
	tick := func() {
		select {
		case intervalTicker.Force <- testTime:
		case <-time.After(timeout):
			t.Fatal("could not tick timer")
		}
	}

	observation := &Observation{
		Check:    mock.call,
		Interval: intervalTicker,
		Attempts: 2,
		Backoff:  0,
		Timeout:  time.Hour,
		OnSuccess: func() {
			select {
			case successChan <- struct{}{}:
			case <-time.After(timeout):
				t.Fatal("unable to fire onSuccess callback")
			}
		},
		OnFailure: noOpCallback,
	}

	// Add the check after having started the monitor.
	err := monitor.AddCheck(observation)
	require.NoError(t, err, "could not add new observation")

	// This should initiate the check we dynamically added above.
	tick()

	// Verify that we can fire the OnSuccess callback.
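	// The first response fails and the second succeeds, so the check
	// recovers within its two allowed attempts before reporting success.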
	mock.sendError(errNonNil)
	mock.sendError(nil)
	select {
	case <-successChan:
	case <-time.After(timeout):
		t.Fatal("expected success callback")
	}

	// Verify that we correctly shut down if the added health check fails.
	tick()
	mock.sendError(errNonNil)
	mock.sendError(errNonNil)

	// Since the check has failed for every allowed attempt, we now expect
	// a call to our shutdown function.
	select {
	case <-shutdown:
	case <-time.After(timeout):
		t.Fatal("expected shutdown")
	}
	require.NoError(t, monitor.Stop(), "could not stop monitor")
}