// Package healthcheck contains a monitor which takes a set of liveness checks // which it periodically checks. If a check fails after its configured number // of allowed call attempts, the monitor will send a request to shutdown using // the function is is provided in its config. Checks are dispatched in their own // goroutines so that they do not block each other. package healthcheck import ( "errors" "fmt" "sync" "sync/atomic" "time" "github.com/lightningnetwork/lnd/ticker" ) var noOpCallback = func() {} // Config contains configuration settings for our monitor. type Config struct { // Checks is a set of health checks that assert that lnd has access to // critical resources. Checks []*Observation // Shutdown should be called to request safe shutdown on failure of a // health check. Shutdown shutdownFunc } // shutdownFunc is the signature we use for a shutdown function which allows us // to print our reason for shutdown. type shutdownFunc func(format string, params ...interface{}) // Monitor periodically checks a series of configured liveness checks to // ensure that lnd has access to all critical resources. type Monitor struct { started int32 // To be used atomically. stopped int32 // To be used atomically. cfg *Config quit chan struct{} wg sync.WaitGroup } // NewMonitor returns a monitor with the provided config. func NewMonitor(cfg *Config) *Monitor { return &Monitor{ cfg: cfg, quit: make(chan struct{}), } } // Start launches the goroutines required to run our monitor. func (m *Monitor) Start() error { if !atomic.CompareAndSwapInt32(&m.started, 0, 1) { return errors.New("monitor already started") } // Run through all of the health checks that we have configured and // start a goroutine for each check. for _, check := range m.cfg.Checks { check := check // Skip over health checks that are disabled by setting zero // attempts. if check.Attempts == 0 { log.Warnf("check: %v configured with 0 attempts, "+ "skipping it", check.Name) continue } m.wg.Add(1) go func(check *Observation) { defer m.wg.Done() check.monitor(m.cfg.Shutdown, m.quit) }(check) } return nil } // Stop sends all goroutines the signal to exit and waits for them to exit. func (m *Monitor) Stop() error { if !atomic.CompareAndSwapInt32(&m.stopped, 0, 1) { return fmt.Errorf("monitor already stopped") } log.Info("Health monitor shutting down...") defer log.Debug("Health monitor shutdown complete") close(m.quit) m.wg.Wait() return nil } // AddCheck adds a new healthcheck to our monitor. func (m *Monitor) AddCheck(check *Observation) error { m.wg.Add(1) go func(check *Observation) { defer m.wg.Done() check.monitor(m.cfg.Shutdown, m.quit) }(check) return nil } // CreateCheck is a helper function that takes a function that produces an error // and wraps it in a function that returns its result on an error channel. // We do not wait group the goroutine running our checkFunc because we expect // to be dealing with health checks that may block; if we wait group them, we // may wait forever. Ideally future health checks will allow callers to cancel // them early, and we can wait group this. func CreateCheck(checkFunc func() error) func() chan error { return func() chan error { errChan := make(chan error, 1) go func() { errChan <- checkFunc() }() return errChan } } // Observation represents a liveness check that we periodically check. type Observation struct { // Name describes the health check. Name string // Check runs the health check itself, returning an error channel that // is expected to receive nil or an error. Check func() chan error // Interval is a ticker which triggers running our check function. This // ticker must be started and stopped by the observation. Interval ticker.Ticker // Attempts is the number of calls we make for a single check before // failing. Attempts int // Timeout is the amount of time we allow our check function to take // before we time it out. Timeout time.Duration // Backoff is the amount of time we back off between retries for failed // checks. Backoff time.Duration // OnSuccess is a callback which will be executed when the healthcheck // succeeds. This is optional. OnSuccess func() // OnFailure is a callback which will be executed when the healthcheck // fails. This is optional. OnFailure func() } // ObservationOption describes the signature of a functional option that can be // used to modify the behaviour of an Observation. type ObservationOption func(*Observation) // WithSuccessCallback configures an observation with a callback to be fired // whenever the health check succeeds. func WithSuccessCallback(callback func()) ObservationOption { return func(o *Observation) { o.OnSuccess = callback } } // WithFailureCallback configures an observation with a callback to be fired // whenever the health check reaches its failure threshold. func WithFailureCallback(callback func()) ObservationOption { return func(o *Observation) { o.OnFailure = callback } } // NewObservation creates an observation. func NewObservation(name string, check func() error, interval, timeout, backoff time.Duration, attempts int, opts ...ObservationOption) *Observation { observation := &Observation{ Name: name, Check: CreateCheck(check), Interval: ticker.New(interval), Attempts: attempts, Timeout: timeout, Backoff: backoff, } // Apply each option to the observation. for _, opt := range opts { opt(observation) } // Ensure that we default to NO-OP callbacks. if observation.OnSuccess == nil { observation.OnSuccess = noOpCallback } if observation.OnFailure == nil { observation.OnFailure = noOpCallback } return observation } // String returns a string representation of an observation. func (o *Observation) String() string { return o.Name } // monitor executes a health check every time its interval ticks until the quit // channel signals that we should shutdown. This function is also responsible // for starting and stopping our ticker. func (o *Observation) monitor(shutdown shutdownFunc, quit chan struct{}) { log.Debugf("Monitoring: %v", o) o.Interval.Resume() defer o.Interval.Stop() for { select { case <-o.Interval.Ticks(): // retryCheck will return errMaxAttemptsReached when // the max attempts are reached. In that case we will // stop the ticker and quit. if o.retryCheck(quit, shutdown) { log.Debugf("Health check: max attempts " + "failed, monitor exiting") return } // Exit if we receive the instruction to shutdown. case <-quit: log.Debug("Health check: monitor quit") return } } } // retryCheck calls a check function until it succeeds, or we reach our // configured number of attempts, waiting for our back off period between failed // calls. If we fail to obtain a passing health check after the allowed number // of calls, we will request shutdown. It returns a bool to indicate whether // the max number of attempts is reached. func (o *Observation) retryCheck(quit chan struct{}, shutdown shutdownFunc) bool { var count int for count < o.Attempts { // Increment our call count and call the health check endpoint. count++ // Wait for our check to return, timeout to elapse, or quit // signal to be received. var err error select { case err = <-o.Check(): // If our error is nil, we have passed our health check, // so we'll invoke our success callback if defined and // then exit. if err == nil { log.Debug("invoking success callback") // Invoke the success callback. o.OnSuccess() return false } case <-time.After(o.Timeout): err = fmt.Errorf("health check: %v timed out after: "+ "%v", o, o.Timeout) case <-quit: log.Debug("Health check: monitor quit") return false } // If we have reached our allowed number of attempts, this // check has failed so we'll fire the on failure callback // and request shutdown. if count == o.Attempts { log.Debug("invoking failure callback") o.OnFailure() shutdown("Health check: %v failed after %v "+ "calls", o, o.Attempts) return true } log.Infof("Health check: %v, call: %v failed with: %v, "+ "backing off for: %v", o, count, err, o.Backoff) // If we are still within the number of calls allowed for this // check, we wait for our back off period to elapse, or exit if // we get the signal to shutdown. select { case <-time.After(o.Backoff): case <-quit: log.Debug("Health check: monitor quit") return false } } return false }