From 3d5f33a0cf9c33c757366e9e42afa1d19348ab3d Mon Sep 17 00:00:00 2001 From: Andras Banki-Horvath Date: Mon, 21 Mar 2022 20:37:43 +0100 Subject: [PATCH 1/2] routing: fail in-flight attempts cleanly on terminal payment failure In case of a multi shard payment with more than one in-flight shards, one shard quitting with a terminal failure will stop the payment lifecycle and close the `shardHandler`'s `quit` channel. In the `collectResult` function we're waiting for the `Switch` to asynchronously return a result for each shard. This may have been interrupted by the aformentioned `quit` channel's closing skipping attempt failure (or success) notification towards the control tower and therefore skipping proper settle/fail info fill in the channel db. Since payments have a composite state of a global failure reason and settle/fail info for all attempts, any attempt with an unfilled settle/fail info keeps a payment in-flight even if the payment itself isn't in-flight anymore. --- routing/payment_lifecycle.go | 3 --- routing/router_test.go | 47 ++++++++++++++++++++++++------------ 2 files changed, 32 insertions(+), 18 deletions(-) diff --git a/routing/payment_lifecycle.go b/routing/payment_lifecycle.go index 8f780fc26..f8b3cd558 100644 --- a/routing/payment_lifecycle.go +++ b/routing/payment_lifecycle.go @@ -597,9 +597,6 @@ func (p *shardHandler) collectResult(attempt *channeldb.HTLCAttemptInfo) ( case <-p.router.quit: return nil, ErrRouterShuttingDown - - case <-p.quit: - return nil, errShardHandlerExiting } // In case of a payment failure, fail the attempt with the control diff --git a/routing/router_test.go b/routing/router_test.go index 7f9997a49..06a731f55 100644 --- a/routing/router_test.go +++ b/routing/router_test.go @@ -4183,10 +4183,14 @@ func TestSendMPPaymentFailedWithShardsInFlight(t *testing.T) { // Create a buffered chan and it will be returned by GetPaymentResult. payer.resultChan = make(chan *htlcswitch.PaymentResult, 10) - // We use the failAttemptCount to track how many attempts we want to - // fail. Each time the following mock method is called, the count gets - // updated. - failAttemptCount := 0 + // We use the getPaymentResultCnt to track how many times we called + // GetPaymentResult. As shard launch is sequential, and we fail the + // first shard that calls GetPaymentResult, we may end up with different + // counts since the lifecycle itself is asynchronous. To avoid flakes + // due to this undeterminsitic behavior, we'll compare the final + // getPaymentResultCnt with other counters to create a final test + // expectation. + getPaymentResultCnt := 0 payer.On("GetPaymentResult", mock.Anything, identifier, mock.Anything, ).Run(func(args mock.Arguments) { @@ -4194,10 +4198,10 @@ func TestSendMPPaymentFailedWithShardsInFlight(t *testing.T) { // the read-only chan. // Update the counter. - failAttemptCount++ + getPaymentResultCnt++ // We fail the first attempt with terminal error. - if failAttemptCount == 1 { + if getPaymentResultCnt == 1 { payer.resultChan <- &htlcswitch.PaymentResult{ Error: htlcswitch.NewForwardingError( &lnwire.FailIncorrectDetails{}, @@ -4205,15 +4209,20 @@ func TestSendMPPaymentFailedWithShardsInFlight(t *testing.T) { ), } return - } - // For the rest attempts we will NOT send anything to the - // resultChan, thus making all the shards in active state, - // neither settled or failed. + // For the rest of the attempts we'll simulate that a network + // result update_fail_htlc has been received. This way the + // payment will fail cleanly. + payer.resultChan <- &htlcswitch.PaymentResult{ + Error: htlcswitch.NewForwardingError( + &lnwire.FailTemporaryChannelFailure{}, + 1, + ), + } }) - // Mock the FailAttempt method to fail EXACTLY once. + // Mock the FailAttempt method to fail (at least once). var failedAttempt channeldb.HTLCAttempt controlTower.On("FailAttempt", identifier, mock.Anything, mock.Anything, @@ -4223,22 +4232,27 @@ func TestSendMPPaymentFailedWithShardsInFlight(t *testing.T) { failedAttempt = payment.HTLCs[0] failedAttempt.Failure = &channeldb.HTLCFailInfo{} payment.HTLCs[0] = failedAttempt - }).Once() + }) // Setup ReportPaymentFail to return nil reason and error so the // payment won't fail. failureReason := channeldb.FailureReasonPaymentDetails + cntReportPaymentFail := 0 missionControl.On("ReportPaymentFail", mock.Anything, mock.Anything, mock.Anything, mock.Anything, ).Return(&failureReason, nil).Run(func(args mock.Arguments) { payment.FailureReason = &failureReason - }).Once() + cntReportPaymentFail++ + }) // Simple mocking the rest. - controlTower.On("Fail", identifier, failureReason).Return(nil).Once() + cntFail := 0 + controlTower.On("Fail", identifier, failureReason).Return(nil) payer.On("SendHTLC", mock.Anything, mock.Anything, mock.Anything, - ).Return(nil) + ).Return(nil).Run(func(args mock.Arguments) { + cntFail++ + }) // Call the actual method SendPayment on router. This is place inside a // goroutine so we can set a timeout for the whole test, in case @@ -4260,6 +4274,9 @@ func TestSendMPPaymentFailedWithShardsInFlight(t *testing.T) { // methods are called as expected. require.Error(t, err, "expected send payment error") require.EqualValues(t, [32]byte{}, p, "preimage not match") + require.GreaterOrEqual(t, getPaymentResultCnt, 1) + require.Equal(t, getPaymentResultCnt, cntReportPaymentFail) + require.Equal(t, getPaymentResultCnt, cntFail) controlTower.AssertExpectations(t) payer.AssertExpectations(t) From 70ac0070fabc301553809535296baa9d48a4eb3c Mon Sep 17 00:00:00 2001 From: Andras Banki-Horvath Date: Tue, 22 Mar 2022 15:41:26 +0100 Subject: [PATCH 2/2] docs: update release notes 0.15.0 --- docs/release-notes/release-notes-0.15.0.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/release-notes/release-notes-0.15.0.md b/docs/release-notes/release-notes-0.15.0.md index ed485349b..e8c3c3e08 100644 --- a/docs/release-notes/release-notes-0.15.0.md +++ b/docs/release-notes/release-notes-0.15.0.md @@ -81,6 +81,9 @@ then watch it on chain. Taproot script spends are also supported through the * [Fixed a data race in the websocket proxy code](https://github.com/lightningnetwork/lnd/pull/6380). +* [Fixed race condition resulting in MPP payments sometimes getting stuck + in-flight](https://github.com/lightningnetwork/lnd/pull/6352). + ## Misc * [An example systemd service file](https://github.com/lightningnetwork/lnd/pull/6033)