//go:build kvdb_etcd
// +build kvdb_etcd

package itest

import (
	"context"
	"fmt"
	"io"
	"net"
	"sync"
	"testing"
	"time"

	"github.com/btcsuite/btcd/btcutil"
	"github.com/lightningnetwork/lnd/cluster"
	"github.com/lightningnetwork/lnd/kvdb"
	"github.com/lightningnetwork/lnd/lncfg"
	"github.com/lightningnetwork/lnd/lnrpc"
	"github.com/lightningnetwork/lnd/lntest"
	"github.com/lightningnetwork/lnd/lntest/port"
	"github.com/stretchr/testify/require"
)

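// assertLeader asserts that the given election observer reports the expected
// node as the current leader of the cluster.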
func assertLeader(ht *lntest.HarnessTest, observer cluster.LeaderElector,
	expected string) {

	leader, err := observer.Leader(ht.Context())
	require.NoError(ht, err, "Unable to query leader")
	require.Equalf(ht, expected, leader,
		"Leader should be '%v', got: '%v'", expected, leader)
}

// testEtcdFailover tests that in a cluster setup where two LND nodes form a
// single cluster (sharing the same identity), one can hand over the leader
// role to the other (failing over after graceful shutdown or forceful abort).
func testEtcdFailover(ht *lntest.HarnessTest) {
	testCases := []struct {
		name string
		kill bool
	}{{
		name: "failover after shutdown",
		kill: false,
	}, {
		name: "failover after abort",
		kill: true,
	}}

	for _, test := range testCases {
		test := test

		success := ht.Run(test.name, func(t1 *testing.T) {
			st := ht.Subtest(t1)
			testEtcdFailoverCase(st, test.kill)
		})
		if !success {
			return
		}
	}
}

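// testEtcdFailoverCase runs a single failover scenario: Carol-1 and Carol-2
// share the same identity and etcd backend, Carol-1 is the initial leader,
// and depending on the kill flag it is either shut down gracefully or killed
// so that Carol-2 can take over and settle the remaining invoice.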
func testEtcdFailoverCase(ht *lntest.HarnessTest, kill bool) {
	etcdCfg, cleanup, err := kvdb.StartEtcdTestBackend(
		ht.T.TempDir(), uint16(port.NextAvailablePort()),
		uint16(port.NextAvailablePort()), "",
	)
	require.NoError(ht, err, "Failed to start etcd instance")
	defer cleanup()

	alice := ht.NewNode("Alice", nil)

	// Give Alice some coins to fund the channel.
	ht.FundCoins(btcutil.SatoshiPerBitcoin, alice)

	// Make leader election session TTL 5 sec to make the test run fast.
	const leaderSessionTTL = 5

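	// Create an election observer that we will use to monitor the leader
	// election.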
	observer, err := cluster.MakeLeaderElector(
		ht.Context(), cluster.EtcdLeaderElector, "observer",
		lncfg.DefaultEtcdElectionPrefix, leaderSessionTTL, etcdCfg,
	)
	require.NoError(ht, err, "Cannot start election observer")

	password := []byte("the quick brown fox jumps the lazy dog")
	stateless := false
	cluster := true

	carol1, _, _ := ht.NewNodeWithSeedEtcd(
		"Carol-1", etcdCfg, password, stateless, cluster,
		leaderSessionTTL,
	)
	info1 := carol1.RPC.GetInfo()
	ht.ConnectNodes(carol1, alice)

	// Open a channel with 100k satoshis between Carol and Alice with
	// Alice being the sole funder of the channel.
	chanAmt := btcutil.Amount(100_000)
	ht.OpenChannel(alice, carol1, lntest.OpenChannelParams{Amt: chanAmt})

	// At this point Carol-1 is the elected leader, while Carol-2 will
	// wait to become the leader when Carol-1 stops.
	carol2 := ht.NewNodeEtcd(
		"Carol-2", etcdCfg, password, cluster, leaderSessionTTL,
	)
	assertLeader(ht, observer, "Carol-1")

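	// Create two invoices on Carol-1: the first is paid while Carol-1 is
	// still the leader, the second is paid after the failover to Carol-2.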
	amt := btcutil.Amount(1000)
	payReqs, _, _ := ht.CreatePayReqs(carol1, amt, 2)
	ht.CompletePaymentRequests(alice, []string{payReqs[0]})

	// Shut down or kill Carol-1 and wait for Carol-2 to become the
	// leader.
	failoverTimeout := time.Duration(2*leaderSessionTTL) * time.Second
	if kill {
		ht.KillNode(carol1)
	} else {
		ht.Shutdown(carol1)
	}

	err = carol2.WaitUntilLeader(failoverTimeout)
	require.NoError(ht, err, "Waiting for Carol-2 to become the leader "+
		"failed")

	assertLeader(ht, observer, "Carol-2")

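	// Now that Carol-2 is the leader, unlock its wallet so it can finish
	// starting up and serve RPC requests.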
	req := &lnrpc.UnlockWalletRequest{WalletPassword: password}
	err = carol2.Unlock(req)
	require.NoError(ht, err, "Unlocking Carol-2 failed")

	// Make sure Carol-1 and Carol-2 have the same identity.
	info2 := carol2.RPC.GetInfo()
	require.Equal(ht, info1.IdentityPubkey, info2.IdentityPubkey,
		"Carol-1 and Carol-2 must have the same identity")

	// Make sure the nodes are connected before moving forward. Otherwise
	// we may get a "link not found" error.
	ht.AssertConnected(alice, carol2)

	// Now let Alice pay the second invoice but this time we expect
	// Carol-2 to receive the payment.
	ht.CompletePaymentRequests(alice, []string{payReqs[1]})

	// Manually shut down the node, otherwise it would interfere with our
	// cleanup process.
	ht.Shutdown(carol2)
}

// Proxy is a simple TCP proxy that forwards all traffic between a local and a
// remote address. We use it to simulate a network partition in the leader
// health check test.
type Proxy struct {
	listenAddr string
	targetAddr string
	cancel     context.CancelFunc
	wg         sync.WaitGroup
	stopped    chan struct{}
}

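// A minimal usage sketch, mirroring how testLeaderHealthCheck below drives
// the proxy: start it with a cancellable context, then stop it to cut all
// forwarded connections at once.
//
//	proxy := NewProxy(proxyAddr, targetAddr)
//	proxy.Start(ctx, t)
//	...
//	proxy.Stop(t)
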
// NewProxy creates a new Proxy instance that will forward traffic from the
// listen address to the target address once started.
func NewProxy(listenAddr, targetAddr string) *Proxy {
	return &Proxy{
		listenAddr: listenAddr,
		targetAddr: targetAddr,
		stopped:    make(chan struct{}),
	}
}

// Start starts the proxy. It listens on the listen address and forwards all
// traffic to the target address.
func (p *Proxy) Start(ctx context.Context, t *testing.T) {
	listener, err := net.Listen("tcp", p.listenAddr)
	require.NoError(t, err, "Failed to listen on %s", p.listenAddr)
	t.Logf("Proxy is listening on %s", p.listenAddr)

	proxyCtx, cancel := context.WithCancel(ctx)
	p.cancel = cancel

	p.wg.Add(1)
	go func() {
		defer func() {
			close(p.stopped)
			p.wg.Done()
		}()

		for {
			select {
			case <-proxyCtx.Done():
				listener.Close()
				return
			default:
			}

			conn, err := listener.Accept()
			if err != nil {
				if proxyCtx.Err() != nil {
					// Context is done, exit the loop.
					return
				}

				t.Logf("Proxy failed to accept connection: %v",
					err)

				continue
			}

			p.wg.Add(1)
			go p.handleConnection(proxyCtx, t, conn)
		}
	}()
}

// handleConnection handles an accepted connection and forwards all traffic
// between the listener and target.
func (p *Proxy) handleConnection(ctx context.Context, t *testing.T,
	conn net.Conn) {

	targetConn, err := net.Dial("tcp", p.targetAddr)
	require.NoError(t, err, "Failed to connect to target %s", p.targetAddr)

	defer func() {
		conn.Close()
		targetConn.Close()
		p.wg.Done()
	}()

	done := make(chan struct{})

	p.wg.Add(2)
	go func() {
		defer p.wg.Done()

		// Ignore the copy error due to the connection being closed.
		_, _ = io.Copy(targetConn, conn)
	}()

	go func() {
		defer p.wg.Done()

		// Ignore the copy error due to the connection being closed.
		_, _ = io.Copy(conn, targetConn)
		close(done)
	}()

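	// Block until either the proxy is shutting down or the target-to-
	// client copy has finished. The deferred cleanup above then closes
	// both sockets, which also unblocks the remaining copy goroutine.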
	select {
	case <-ctx.Done():
	case <-done:
	}
}

// Stop stops the proxy and waits for all connections to be closed and all
// goroutines to be stopped.
func (p *Proxy) Stop(t *testing.T) {
	require.NotNil(t, p.cancel, "Proxy is not started")

	p.cancel()
	p.wg.Wait()
	<-p.stopped

	t.Log("Proxy stopped", time.Now())
}

// testLeaderHealthCheck tests that a node is properly shut down when the
// leader health check fails.
func testLeaderHealthCheck(ht *lntest.HarnessTest) {
	clientPort := port.NextAvailablePort()

	// Let's start a test etcd instance that we'll redirect through a
	// proxy.
	etcdCfg, cleanup, err := kvdb.StartEtcdTestBackend(
		ht.T.TempDir(), uint16(clientPort),
		uint16(port.NextAvailablePort()), "",
	)
	require.NoError(ht, err, "Failed to start etcd instance")

	// Make leader election session TTL 5 sec to make the test run fast.
	const leaderSessionTTL = 5

	// Create an election observer that we will use to monitor the leader
	// election.
	observer, err := cluster.MakeLeaderElector(
		ht.Context(), cluster.EtcdLeaderElector, "observer",
		lncfg.DefaultEtcdElectionPrefix, leaderSessionTTL, etcdCfg,
	)
	require.NoError(ht, err, "Cannot start election observer")

	// Start a proxy that will forward all traffic to the etcd instance.
	clientAddr := fmt.Sprintf("localhost:%d", clientPort)
	proxyAddr := fmt.Sprintf("localhost:%d", port.NextAvailablePort())

	ctx, cancel := context.WithCancel(ht.Context())
	defer cancel()

	proxy := NewProxy(proxyAddr, clientAddr)
	proxy.Start(ctx, ht.T)

	// Copy the etcd config so that we can modify the host to point to
	// the proxy.
	proxyEtcdCfg := *etcdCfg

	// With the proxy in place, we can now configure the etcd client to
	// connect to the proxy instead of the etcd instance.
	proxyEtcdCfg.Host = "http://" + proxyAddr

	defer cleanup()

	// Start Carol-1 with cluster support and connect to etcd through the
	// proxy.
	password := []byte("the quick brown fox jumps the lazy dog")
	stateless := false
	cluster := true

	carol, _, _ := ht.NewNodeWithSeedEtcd(
		"Carol-1", &proxyEtcdCfg, password, stateless, cluster,
		leaderSessionTTL,
	)

	// Make sure Carol-1 is indeed the leader.
	assertLeader(ht, observer, "Carol-1")

	// At this point Carol-1 is the elected leader, while Carol-2 will
	// wait to become the leader when Carol-1 releases the lease. Note
	// that for Carol-2 we don't use the proxy as we want to simulate a
	// network partition only for Carol-1.
	carol2 := ht.NewNodeEtcd(
		"Carol-2", etcdCfg, password, cluster, leaderSessionTTL,
	)

	// Stop the proxy so that we simulate a network partition which
	// consequently will make the leader health check fail and force Carol
	// to shut down.
	proxy.Stop(ht.T)

	// Wait for Carol-1 to stop. If the health check didn't work properly,
	// this call would time out and trigger a test failure.
	require.NoError(ht.T, carol.WaitForProcessExit())

	// Now that Carol-1 is shut down we should fail over to Carol-2.
	failoverTimeout := time.Duration(2*leaderSessionTTL) * time.Second

	// Make sure that Carol-2 becomes the leader (reported by Carol-2).
	err = carol2.WaitUntilLeader(failoverTimeout)
	require.NoError(ht, err, "Waiting for Carol-2 to become the leader "+
		"failed")

	// Make sure Carol-2 is indeed the leader (reported by the observer).
	assertLeader(ht, observer, "Carol-2")
}