mirror of
https://github.com/lightningnetwork/lnd.git
synced 2025-01-18 13:27:56 +01:00
routing: fail stale HTLC attempts during startup
This commit adds a check during router's startup and fails the inflight HTLCs if they are routing using channels unknown to us. The channels are unknown because they are already closed, usually long time ago.
This commit is contained in:
parent
b998ce11f1
commit
c2f7e6a66f
@ -285,6 +285,13 @@ type Config struct {
|
||||
// ApplyChannelUpdate can be called to apply a new channel update to the
|
||||
// graph that we received from a payment failure.
|
||||
ApplyChannelUpdate func(msg *lnwire.ChannelUpdate) bool
|
||||
|
||||
// FetchClosedChannels is used by the router to fetch closed channels.
|
||||
//
|
||||
// TODO(yy): remove this method once the root cause of stuck payments
|
||||
// is found.
|
||||
FetchClosedChannels func(pendingOnly bool) (
|
||||
[]*channeldb.ChannelCloseSummary, error)
|
||||
}
|
||||
|
||||
// EdgeLocator is a struct used to identify a specific edge.
|
||||
@ -1384,6 +1391,19 @@ func (r *ChannelRouter) BuildRoute(amt fn.Option[lnwire.MilliSatoshi],
|
||||
// resumePayments fetches inflight payments and resumes their payment
|
||||
// lifecycles.
|
||||
func (r *ChannelRouter) resumePayments() error {
|
||||
// Get a list of closed channels.
|
||||
channels, err := r.cfg.FetchClosedChannels(false)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
closedSCIDs := make(map[lnwire.ShortChannelID]struct{}, len(channels))
|
||||
for _, c := range channels {
|
||||
if !c.IsPending {
|
||||
closedSCIDs[c.ShortChanID] = struct{}{}
|
||||
}
|
||||
}
|
||||
|
||||
// Get all payments that are inflight.
|
||||
payments, err := r.cfg.Control.FetchInFlightPayments()
|
||||
if err != nil {
|
||||
@ -1399,6 +1419,12 @@ func (r *ChannelRouter) resumePayments() error {
|
||||
for _, p := range payments {
|
||||
for _, a := range p.HTLCs {
|
||||
toKeep[a.AttemptID] = struct{}{}
|
||||
|
||||
// Try to fail the attempt if the route contains a dead
|
||||
// channel.
|
||||
r.failStaleAttempt(
|
||||
a, p.Info.PaymentIdentifier, closedSCIDs,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
@ -1473,6 +1499,108 @@ func (r *ChannelRouter) resumePayments() error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// failStaleAttempt will fail an HTLC attempt if it's using an unknown channel
|
||||
// in its route. It first consults the switch to see if there's already a
|
||||
// network result stored for this attempt. If not, it will check whether the
|
||||
// first hop of this attempt is using an active channel known to us. If
|
||||
// inactive, this attempt will be failed.
|
||||
//
|
||||
// NOTE: there's an unknown bug that caused the network result for a particular
|
||||
// attempt to NOT be saved, resulting a payment being stuck forever. More info:
|
||||
// - https://github.com/lightningnetwork/lnd/issues/8146
|
||||
// - https://github.com/lightningnetwork/lnd/pull/8174
|
||||
func (r *ChannelRouter) failStaleAttempt(a channeldb.HTLCAttempt,
|
||||
payHash lntypes.Hash, closedSCIDs map[lnwire.ShortChannelID]struct{}) {
|
||||
|
||||
// We can only fail inflight HTLCs so we skip the settled/failed ones.
|
||||
if a.Failure != nil || a.Settle != nil {
|
||||
return
|
||||
}
|
||||
|
||||
// First, check if we've already had a network result for this attempt.
|
||||
// If no result is found, we'll check whether the reference link is
|
||||
// still known to us.
|
||||
ok, err := r.cfg.Payer.HasAttemptResult(a.AttemptID)
|
||||
if err != nil {
|
||||
log.Errorf("Failed to fetch network result for attempt=%v",
|
||||
a.AttemptID)
|
||||
return
|
||||
}
|
||||
|
||||
// There's already a network result, no need to fail it here as the
|
||||
// payment lifecycle will take care of it, so we can exit early.
|
||||
if ok {
|
||||
log.Debugf("Already have network result for attempt=%v",
|
||||
a.AttemptID)
|
||||
return
|
||||
}
|
||||
|
||||
// We now need to decide whether this attempt should be failed here.
|
||||
// For very old payments, it's possible that the network results were
|
||||
// never saved, causing the payments to be stuck inflight. We now check
|
||||
// whether the first hop is referencing an active channel ID and, if
|
||||
// not, we will fail the attempt as it has no way to be retried again.
|
||||
var shouldFail bool
|
||||
|
||||
// Validate that the attempt has hop info. If this attempt has no hop
|
||||
// info it indicates an error in our db.
|
||||
if len(a.Route.Hops) == 0 {
|
||||
log.Errorf("Found empty hop for attempt=%v", a.AttemptID)
|
||||
|
||||
shouldFail = true
|
||||
} else {
|
||||
// Get the short channel ID.
|
||||
chanID := a.Route.Hops[0].ChannelID
|
||||
scid := lnwire.NewShortChanIDFromInt(chanID)
|
||||
|
||||
// Check whether this link is active. If so, we won't fail the
|
||||
// attempt but keep waiting for its result.
|
||||
_, err := r.cfg.GetLink(scid)
|
||||
if err == nil {
|
||||
return
|
||||
}
|
||||
|
||||
// We should get the link not found err. If not, we will log an
|
||||
// error and skip failing this attempt since an unknown error
|
||||
// occurred.
|
||||
if !errors.Is(err, htlcswitch.ErrChannelLinkNotFound) {
|
||||
log.Errorf("Failed to get link for attempt=%v for "+
|
||||
"payment=%v: %v", a.AttemptID, payHash, err)
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
// The channel link is not active, we now check whether this
|
||||
// channel is already closed. If so, we fail it as there's no
|
||||
// need to wait for the network result because it won't be
|
||||
// re-sent. If the channel is still pending, we'll keep waiting
|
||||
// for the result as we may get a contract resolution for this
|
||||
// HTLC.
|
||||
if _, ok := closedSCIDs[scid]; ok {
|
||||
shouldFail = true
|
||||
}
|
||||
}
|
||||
|
||||
// Exit if there's no need to fail.
|
||||
if !shouldFail {
|
||||
return
|
||||
}
|
||||
|
||||
log.Errorf("Failing stale attempt=%v for payment=%v", a.AttemptID,
|
||||
payHash)
|
||||
|
||||
// Fail the attempt in db. If there's an error, there's nothing we can
|
||||
// do here but logging it.
|
||||
failInfo := &channeldb.HTLCFailInfo{
|
||||
Reason: channeldb.HTLCFailUnknown,
|
||||
FailTime: r.cfg.Clock.Now(),
|
||||
}
|
||||
_, err = r.cfg.Control.FailAttempt(payHash, a.AttemptID, failInfo)
|
||||
if err != nil {
|
||||
log.Errorf("Fail attempt=%v got error: %v", a.AttemptID, err)
|
||||
}
|
||||
}
|
||||
|
||||
// getEdgeUnifiers returns a list of edge unifiers for the given route.
|
||||
func getEdgeUnifiers(source route.Vertex, hops []route.Vertex,
|
||||
outgoingChans map[uint64]struct{},
|
||||
|
@ -93,6 +93,10 @@ func (c *testCtx) getChannelIDFromAlias(t *testing.T, a, b string) uint64 {
|
||||
return channelID
|
||||
}
|
||||
|
||||
func mockFetchClosedChannels(_ bool) ([]*channeldb.ChannelCloseSummary, error) {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func createTestCtxFromGraphInstance(t *testing.T, startingHeight uint32,
|
||||
graphInstance *testGraphInstance) *testCtx {
|
||||
|
||||
@ -158,9 +162,10 @@ func createTestCtxFromGraphInstanceAssumeValid(t *testing.T,
|
||||
next := atomic.AddUint64(&uniquePaymentID, 1)
|
||||
return next, nil
|
||||
},
|
||||
PathFindingConfig: pathFindingConfig,
|
||||
Clock: clock.NewTestClock(time.Unix(1, 0)),
|
||||
ApplyChannelUpdate: graphBuilder.ApplyChannelUpdate,
|
||||
PathFindingConfig: pathFindingConfig,
|
||||
Clock: clock.NewTestClock(time.Unix(1, 0)),
|
||||
ApplyChannelUpdate: graphBuilder.ApplyChannelUpdate,
|
||||
FetchClosedChannels: mockFetchClosedChannels,
|
||||
})
|
||||
require.NoError(t, router.Start(), "unable to start router")
|
||||
|
||||
@ -2170,6 +2175,7 @@ func TestSendToRouteSkipTempErrSuccess(t *testing.T) {
|
||||
NextPaymentID: func() (uint64, error) {
|
||||
return 0, nil
|
||||
},
|
||||
FetchClosedChannels: mockFetchClosedChannels,
|
||||
}}
|
||||
|
||||
// Register mockers with the expected method calls.
|
||||
@ -2253,6 +2259,7 @@ func TestSendToRouteSkipTempErrNonMPP(t *testing.T) {
|
||||
NextPaymentID: func() (uint64, error) {
|
||||
return 0, nil
|
||||
},
|
||||
FetchClosedChannels: mockFetchClosedChannels,
|
||||
}}
|
||||
|
||||
// Expect an error to be returned.
|
||||
@ -2307,6 +2314,7 @@ func TestSendToRouteSkipTempErrTempFailure(t *testing.T) {
|
||||
NextPaymentID: func() (uint64, error) {
|
||||
return 0, nil
|
||||
},
|
||||
FetchClosedChannels: mockFetchClosedChannels,
|
||||
}}
|
||||
|
||||
// Create the error to be returned.
|
||||
@ -2389,6 +2397,7 @@ func TestSendToRouteSkipTempErrPermanentFailure(t *testing.T) {
|
||||
NextPaymentID: func() (uint64, error) {
|
||||
return 0, nil
|
||||
},
|
||||
FetchClosedChannels: mockFetchClosedChannels,
|
||||
}}
|
||||
|
||||
// Create the error to be returned.
|
||||
@ -2475,6 +2484,7 @@ func TestSendToRouteTempFailure(t *testing.T) {
|
||||
NextPaymentID: func() (uint64, error) {
|
||||
return 0, nil
|
||||
},
|
||||
FetchClosedChannels: mockFetchClosedChannels,
|
||||
}}
|
||||
|
||||
// Create the error to be returned.
|
||||
|
25
server.go
25
server.go
@ -993,18 +993,19 @@ func newServer(cfg *Config, listenAddrs []net.Addr,
|
||||
}
|
||||
|
||||
s.chanRouter, err = routing.New(routing.Config{
|
||||
SelfNode: selfNode.PubKeyBytes,
|
||||
RoutingGraph: graphsession.NewRoutingGraph(chanGraph),
|
||||
Chain: cc.ChainIO,
|
||||
Payer: s.htlcSwitch,
|
||||
Control: s.controlTower,
|
||||
MissionControl: s.missionControl,
|
||||
SessionSource: paymentSessionSource,
|
||||
GetLink: s.htlcSwitch.GetLinkByShortID,
|
||||
NextPaymentID: sequencer.NextID,
|
||||
PathFindingConfig: pathFindingConfig,
|
||||
Clock: clock.NewDefaultClock(),
|
||||
ApplyChannelUpdate: s.graphBuilder.ApplyChannelUpdate,
|
||||
SelfNode: selfNode.PubKeyBytes,
|
||||
RoutingGraph: graphsession.NewRoutingGraph(chanGraph),
|
||||
Chain: cc.ChainIO,
|
||||
Payer: s.htlcSwitch,
|
||||
Control: s.controlTower,
|
||||
MissionControl: s.missionControl,
|
||||
SessionSource: paymentSessionSource,
|
||||
GetLink: s.htlcSwitch.GetLinkByShortID,
|
||||
NextPaymentID: sequencer.NextID,
|
||||
PathFindingConfig: pathFindingConfig,
|
||||
Clock: clock.NewDefaultClock(),
|
||||
ApplyChannelUpdate: s.graphBuilder.ApplyChannelUpdate,
|
||||
FetchClosedChannels: s.chanStateDB.FetchClosedChannels,
|
||||
})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("can't create router: %w", err)
|
||||
|
Loading…
Reference in New Issue
Block a user