htlcswitch: add new LinkFailureDisconnect action

In this commit, we add a new LinkFailureDisconnect action that'll be
used if we detect that the remote party hasn't sent a revoke and ack
when it actually should.

Before this commit, we would log our action, tear down the link, but
then not actually force a connection recycle, as we assumed that if the
TCP connection was actually stale, then the read/write timeout would
expire.

In practice this doesn't always seem to be the case, so we make a strong
action here to actually force a disconnection in hopes that either side
will reconnect and keep the good times rollin' 🕺.
This commit is contained in:
Olaoluwa Osuntokun 2023-05-19 17:18:00 -07:00
parent cb5fc71659
commit 4d633f04e3
No known key found for this signature in database
GPG Key ID: 3BBD59E99B280306
5 changed files with 40 additions and 12 deletions

View File

@ -1,10 +1,10 @@
# Release Notes
## Mempool
## Mempool Optimizations
* Optimized [mempool
management](https://github.com/lightningnetwork/lnd/pull/7681) to lower the CPU
usage.
management](https://github.com/lightningnetwork/lnd/pull/7681) to lower the
CPU usage.
## Misc
@ -12,7 +12,16 @@ usage.
all macaroon DB root keys on `ChangePassword`/`GenerateNewRootKey`
respectively.
## Channel Link Bug Fix
* If we detect the remote link is inactive, [we'll now tear down the
connection](https://github.com/lightningnetwork/lnd/pull/7711) in addition to
stopping the link's statemachine. If we're persistently connected with the
peer, then this'll force a reconnect, which may restart things and help avoid
certain force close scenarios.
# Contributors (Alphabetical Order)
* Elle Mouton
* Olaoluwa Osuntokun
* Yong Yu

View File

@ -1037,7 +1037,7 @@ func (l *channelLink) htlcManager() {
l.fail(
LinkFailureError{
code: ErrSyncError,
FailureAction: LinkFailureForceClose, // nolint:lll
FailureAction: LinkFailureForceClose, //nolint:lll
},
"unable to synchronize channel "+
"states: %v", err,
@ -1239,8 +1239,13 @@ func (l *channelLink) htlcManager() {
}
case <-l.cfg.PendingCommitTicker.Ticks():
l.fail(LinkFailureError{code: ErrRemoteUnresponsive},
"unable to complete dance")
l.fail(
LinkFailureError{
code: ErrRemoteUnresponsive,
FailureAction: LinkFailureDisconnect,
},
"unable to complete dance",
)
return
// A message from the switch was just received. This indicates

View File

@ -5457,7 +5457,8 @@ func TestChannelLinkFail(t *testing.T) {
// If we expect the link to force close the channel in this
// case, check that it happens. If not, make sure it does not
// happen.
isForceCloseErr := (linkErr.FailureAction == LinkFailureForceClose)
isForceCloseErr := (linkErr.FailureAction ==
LinkFailureForceClose)
require.True(
t, test.shouldForceClose == isForceCloseErr, test.name,
)
@ -6343,11 +6344,12 @@ func TestPendingCommitTicker(t *testing.T) {
// Assert that we get the expected link failure from Alice.
select {
case linkErr := <-linkErrs:
if linkErr.code != ErrRemoteUnresponsive {
t.Fatalf("error code mismatch, "+
"want: ErrRemoteUnresponsive, got: %v",
linkErr.code)
}
require.Equal(
t, linkErr.code, ErrRemoteUnresponsive,
fmt.Sprintf("error code mismatch, want: "+
"ErrRemoteUnresponsive, got: %v", linkErr.code),
)
require.Equal(t, linkErr.FailureAction, LinkFailureDisconnect)
case <-time.After(time.Second):
t.Fatalf("did not receive failure")

View File

@ -64,6 +64,11 @@ const (
// LinkFailureForceClose indicates that the channel should be force
// closed.
LinkFailureForceClose
// LinkFailureDisconnect indicates that we should disconnect in an
// attempt to recycle the connection. This can be useful if we think a
// TCP connection or state machine is stalled.
LinkFailureDisconnect
)
// LinkFailureError encapsulates an error that will make us fail the current

View File

@ -3142,6 +3142,13 @@ func (p *Brontide) handleLinkFailure(failure linkFailureReport) {
"remote peer: %v", err)
}
}
// If the failure action is disconnect, then we'll execute that now. If
// we had to send an error above, it was a sync call, so we expect the
// message to be flushed on the wire by now.
if failure.linkErr.FailureAction == htlcswitch.LinkFailureDisconnect {
p.Disconnect(fmt.Errorf("link requested disconnect"))
}
}
// tryLinkShutdown attempts to fetch a target link from the switch, calls