Mirror of https://github.com/lightningnetwork/lnd.git, synced 2025-02-22 14:22:37 +01:00
Merge pull request #5686 from yyforyongyu/fix-health-check
healthcheck: quit checking when max attempts are reached
This commit is contained in: commit 3c15e3d81a
18 changed files with 96 additions and 45 deletions
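The functional change is in the healthcheck package: retryCheck now reports whether the attempt budget was exhausted, and the monitor loop stops its ticker and exits when it was, instead of ticking forever after shutdown has already been requested. The following is a minimal, self-contained Go sketch of that pattern; the names, the struct, and the use of the standard-library ticker are illustrative only and are not the lnd implementation.

// Sketch of the "quit checking when max attempts are reached" pattern from
// this commit. Illustrative names only; lnd's real types differ.
package main

import (
	"errors"
	"fmt"
	"time"
)

// observation bundles a health check with its retry policy.
type observation struct {
	check    func() error
	attempts int
	backoff  time.Duration
}

// retryCheck runs the check until it passes or the attempt budget is spent.
// It returns true when the maximum number of attempts was reached.
func (o *observation) retryCheck(quit chan struct{}) bool {
	var count int

	for count < o.attempts {
		count++

		// A nil error means the check passed, so stop retrying.
		if err := o.check(); err == nil {
			return false
		}

		// All attempts used without a passing check: report it so the
		// caller can stop monitoring.
		if count == o.attempts {
			return true
		}

		// Wait out the backoff period, unless we are asked to quit.
		select {
		case <-time.After(o.backoff):
		case <-quit:
			return false
		}
	}

	return false
}

// monitor ticks on an interval and exits once retryCheck reports that the
// attempt budget has been exhausted, so the goroutine does not keep polling.
func (o *observation) monitor(interval time.Duration, quit chan struct{}) {
	ticker := time.NewTicker(interval)
	defer ticker.Stop()

	for {
		select {
		case <-ticker.C:
			if o.retryCheck(quit) {
				fmt.Println("max attempts reached, monitor exiting")
				return
			}
		case <-quit:
			return
		}
	}
}

func main() {
	o := &observation{
		check:    func() error { return errors.New("always failing") },
		attempts: 2,
		backoff:  10 * time.Millisecond,
	}
	quit := make(chan struct{})
	o.monitor(50*time.Millisecond, quit)
}

Running the sketch with an always-failing check prints the exit message after two attempts and returns, which is the behaviour the fix restores. The full diff follows.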
@@ -118,6 +118,8 @@ func (b *BitcoindNotifier) Stop() error {
         return nil
     }
 
+    chainntnfs.Log.Info("bitcoind notifier shutting down")
+
     // Shutdown the rpc client, this gracefully disconnects from bitcoind,
     // and cleans up all related resources.
     b.chainConn.Stop()
@@ -161,6 +161,8 @@ func (b *BtcdNotifier) Stop() error {
         return nil
     }
 
+    chainntnfs.Log.Info("btcd notifier shutting down")
+
     // Shutdown the rpc client, this gracefully disconnects from btcd, and
     // cleans up all related resources.
     b.chainConn.Shutdown()
@@ -136,6 +136,8 @@ func (n *NeutrinoNotifier) Stop() error {
         return nil
     }
 
+    chainntnfs.Log.Info("neutrino notifier shutting down")
+
     close(n.quit)
     n.wg.Wait()
 
@@ -97,6 +97,7 @@ func (c *ChannelNotifier) Start() error {
 func (c *ChannelNotifier) Stop() error {
     var err error
     c.stopped.Do(func() {
+        log.Info("ChannelNotifier shutting down")
         err = c.ntfnServer.Stop()
     })
     return err
@@ -893,7 +893,7 @@ func (c *ChainArbitrator) Stop() error {
         return nil
     }
 
-    log.Infof("Stopping ChainArbitrator")
+    log.Info("ChainArbitrator shutting down")
 
     close(c.quit)
 
@@ -467,7 +467,10 @@ func (d *AuthenticatedGossiper) start() error {
 
 // Stop signals any active goroutines for a graceful closure.
 func (d *AuthenticatedGossiper) Stop() error {
-    d.stopped.Do(d.stop)
+    d.stopped.Do(func() {
+        log.Info("Authenticated gossiper shutting down")
+        d.stop()
+    })
     return nil
 }
 
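Most of the remaining hunks add a shutdown log line inside a sync.Once-guarded Stop method, which keeps Stop idempotent and logs at most once no matter how often it is called. A minimal sketch of that pattern, with illustrative names rather than lnd's types:

// Sketch of the idempotent Stop pattern used above: sync.Once guarantees the
// shutdown body (including the log line) runs at most once, so repeated Stop
// calls are harmless. Names here are illustrative only.
package main

import (
	"log"
	"sync"
)

type notifier struct {
	stopped sync.Once
	quit    chan struct{}
	wg      sync.WaitGroup
}

func newNotifier() *notifier {
	return &notifier{quit: make(chan struct{})}
}

// Stop signals shutdown exactly once and waits for goroutines to exit.
func (n *notifier) Stop() error {
	n.stopped.Do(func() {
		log.Println("notifier shutting down")
		close(n.quit)
		n.wg.Wait()
	})
	return nil
}

func main() {
	n := newNotifier()
	n.Stop()
	n.Stop() // second call is a no-op thanks to sync.Once
}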
@@ -292,6 +292,8 @@ mode](https://github.com/lightningnetwork/lnd/pull/5564).
 
 [Catches up on blocks in the router](https://github.com/lightningnetwork/lnd/pull/5315) in order to fix an "out of order" error that crops up.
 
+[Fix healthcheck might be running after the max number of attempts are reached.](https://github.com/lightningnetwork/lnd/pull/5686)
+
 ## Documentation
 
 The [code contribution guidelines have been updated to mention the new
@@ -86,6 +86,8 @@ func (m *Monitor) Stop() error {
         return fmt.Errorf("monitor already stopped")
     }
 
+    log.Info("Health monitor shutting down")
+
     close(m.quit)
     m.wg.Wait()
 
@@ -166,10 +168,18 @@ func (o *Observation) monitor(shutdown shutdownFunc, quit chan struct{}) {
     for {
         select {
         case <-o.Interval.Ticks():
-            o.retryCheck(quit, shutdown)
+            // retryCheck will return errMaxAttemptsReached when
+            // the max attempts are reached. In that case we will
+            // stop the ticker and quit.
+            if o.retryCheck(quit, shutdown) {
+                log.Debugf("Health check: max attempts " +
+                    "failed, monitor exiting")
+                return
+            }
 
         // Exit if we receive the instruction to shutdown.
         case <-quit:
+            log.Debug("Health check: monitor quit")
             return
         }
     }
@@ -178,8 +188,11 @@ func (o *Observation) monitor(shutdown shutdownFunc, quit chan struct{}) {
 // retryCheck calls a check function until it succeeds, or we reach our
 // configured number of attempts, waiting for our back off period between failed
 // calls. If we fail to obtain a passing health check after the allowed number
-// of calls, we will request shutdown.
-func (o *Observation) retryCheck(quit chan struct{}, shutdown shutdownFunc) {
+// of calls, we will request shutdown. It returns a bool to indicate whether
+// the max number of attempts is reached.
+func (o *Observation) retryCheck(quit chan struct{},
+    shutdown shutdownFunc) bool {
+
     var count int
 
     for count < o.Attempts {
@@ -197,13 +210,14 @@ func (o *Observation) retryCheck(quit chan struct{}, shutdown shutdownFunc) {
                 "%v", o, o.Timeout)
 
         case <-quit:
-            return
+            log.Debug("Health check: monitor quit")
+            return false
         }
 
         // If our error is nil, we have passed our health check, so we
         // can exit.
         if err == nil {
-            return
+            return false
         }
 
         // If we have reached our allowed number of attempts, this
@@ -211,8 +225,7 @@ func (o *Observation) retryCheck(quit chan struct{}, shutdown shutdownFunc) {
         if count == o.Attempts {
             shutdown("Health check: %v failed after %v "+
                 "calls", o, o.Attempts)
-
-            return
+            return true
         }
 
         log.Infof("Health check: %v, call: %v failed with: %v, "+
@@ -225,7 +238,10 @@ func (o *Observation) retryCheck(quit chan struct{}, shutdown shutdownFunc) {
         case <-time.After(o.Backoff):
 
         case <-quit:
-            return
+            log.Debug("Health check: monitor quit")
+            return false
         }
     }
+
+    return false
 }
@@ -132,48 +132,58 @@ func TestRetryCheck(t *testing.T) {
         // expectedShutdown is true if we expect a shutdown to be
         // triggered because all of our calls failed.
         expectedShutdown bool
+
+        // maxAttemptsReached specifies whether the max allowed
+        // attempts are reached from calling retryCheck.
+        maxAttemptsReached bool
     }{
         {
-            name:             "first call succeeds",
-            errors:           []error{nil},
-            attempts:         2,
-            timeout:          time.Hour,
-            expectedShutdown: false,
+            name:               "first call succeeds",
+            errors:             []error{nil},
+            attempts:           2,
+            timeout:            time.Hour,
+            expectedShutdown:   false,
+            maxAttemptsReached: false,
         },
         {
-            name:             "first call fails",
-            errors:           []error{errNonNil},
-            attempts:         1,
-            timeout:          time.Hour,
-            expectedShutdown: true,
+            name:               "first call fails",
+            errors:             []error{errNonNil},
+            attempts:           1,
+            timeout:            time.Hour,
+            expectedShutdown:   true,
+            maxAttemptsReached: true,
         },
         {
-            name:             "fail then recover",
-            errors:           []error{errNonNil, nil},
-            attempts:         2,
-            timeout:          time.Hour,
-            expectedShutdown: false,
+            name:               "fail then recover",
+            errors:             []error{errNonNil, nil},
+            attempts:           2,
+            timeout:            time.Hour,
+            expectedShutdown:   false,
+            maxAttemptsReached: false,
        },
         {
-            name:             "always fail",
-            errors:           []error{errNonNil, errNonNil},
-            attempts:         2,
-            timeout:          time.Hour,
-            expectedShutdown: true,
+            name:               "always fail",
+            errors:             []error{errNonNil, errNonNil},
+            attempts:           2,
+            timeout:            time.Hour,
+            expectedShutdown:   true,
+            maxAttemptsReached: true,
         },
         {
-            name:             "no calls",
-            errors:           nil,
-            attempts:         0,
-            timeout:          time.Hour,
-            expectedShutdown: false,
+            name:               "no calls",
+            errors:             nil,
+            attempts:           0,
+            timeout:            time.Hour,
+            expectedShutdown:   false,
+            maxAttemptsReached: false,
         },
         {
-            name:             "call times out",
-            errors:           nil,
-            attempts:         1,
-            timeout:          1,
-            expectedShutdown: true,
+            name:               "call times out",
+            errors:             nil,
+            attempts:           1,
+            timeout:            1,
+            expectedShutdown:   true,
+            maxAttemptsReached: true,
         },
     }
 
@@ -203,8 +213,11 @@ func TestRetryCheck(t *testing.T) {
             // on us sending errors into the mocked caller's error
             // channel.
             done := make(chan struct{})
+            retryResult := false
             go func() {
-                observation.retryCheck(quit, shutdownFunc)
+                retryResult = observation.retryCheck(
+                    quit, shutdownFunc,
+                )
                 close(done)
             }()
 
@@ -218,6 +231,8 @@ func TestRetryCheck(t *testing.T) {
             // check function before we start checking results.
             <-done
 
+            require.Equal(t, test.maxAttemptsReached, retryResult,
+                "retryCheck returned unexpected error")
             require.Equal(t, test.expectedShutdown, shutdown,
                 "unexpected shutdown state")
         })
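The updated test captures retryCheck's boolean result inside the goroutine and only reads it after the done channel is closed, which makes the read race-free. A condensed sketch of that synchronization, with a hypothetical workFn standing in for retryCheck:

// Sketch of the result-capture synchronization used in the test: the
// goroutine writes the result before closing done, so reading it after
// <-done is race-free. workFn is a hypothetical stand-in.
package main

import "fmt"

func workFn() bool { return true }

func main() {
	done := make(chan struct{})
	result := false

	go func() {
		result = workFn()
		close(done)
	}()

	<-done // happens after the write to result
	fmt.Println("result:", result)
}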
@@ -137,6 +137,9 @@ func (p *OnionProcessor) Start() error {
 
 // Stop shutsdown the onion processor's sphinx router.
 func (p *OnionProcessor) Stop() error {
+
+    log.Info("Onion processor shutting down")
+
     p.router.Stop()
     return nil
 }
@@ -91,6 +91,7 @@ func (h *HtlcNotifier) Start() error {
 func (h *HtlcNotifier) Stop() error {
     var err error
     h.stopped.Do(func() {
+        log.Info("HtlcNotifier shutting down")
         if err = h.ntfnServer.Stop(); err != nil {
             log.Warnf("error stopping htlc notifier: %v", err)
         }
@@ -1942,7 +1942,7 @@ func (s *Switch) Stop() error {
         return errors.New("htlc switch already shutdown")
     }
 
-    log.Infof("HTLC Switch shutting down")
+    log.Info("HTLC Switch shutting down")
 
     close(s.quit)
 
@@ -257,6 +257,8 @@ func (i *InvoiceRegistry) Start() error {
 
 // Stop signals the registry for a graceful shutdown.
 func (i *InvoiceRegistry) Stop() error {
+    log.Info("InvoiceRegistry shutting down")
+
     i.expiryWatcher.Stop()
 
     close(i.quit)
@@ -217,6 +217,7 @@ func (m *ChanStatusManager) start() error {
 // Stop safely shuts down the ChanStatusManager.
 func (m *ChanStatusManager) Stop() error {
     m.stopped.Do(func() {
+        log.Info("Channel Status Manager shutting down")
        close(m.quit)
         m.wg.Wait()
     })
@@ -68,6 +68,7 @@ func (h *HostAnnouncer) Start() error {
 // Stop signals the HostAnnouncer for a graceful stop.
 func (h *HostAnnouncer) Stop() error {
     h.stopOnce.Do(func() {
+        log.Info("HostAnnouncer shutting down")
         close(h.quit)
         h.wg.Wait()
     })
@@ -52,7 +52,7 @@ func (p *PeerNotifier) Start() error {
 func (p *PeerNotifier) Stop() error {
     var err error
     p.stopped.Do(func() {
-        log.Info("Stopping PeerNotifier")
+        log.Info("PeerNotifier shutting down")
         err = p.ntfnServer.Stop()
     })
     return err
@@ -691,7 +691,7 @@ func (r *ChannelRouter) Stop() error {
         return nil
     }
 
-    log.Tracef("Channel Router shutting down")
+    log.Info("Channel Router shutting down")
 
     // Our filtered chain view could've only been started if
     // AssumeChannelValid isn't present.
@@ -427,7 +427,7 @@ func (s *UtxoSweeper) Stop() error {
         return nil
     }
 
-    log.Debugf("Sweeper shutting down")
+    log.Info("Sweeper shutting down")
 
     close(s.quit)
     s.wg.Wait()