watchtower/wtclient: generic disk overflow queue
In this commit, a new generic DiskOverflowQueue implementation is added. This allows a user to specify a maximum number of items that the queue can hold in memory. Any new items will then overflow to disk. The producer and consumer of the queue items interact with the queue just like a normal in-memory queue.
Parent: 66f6bf3955
Commit: e91fe50878
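Before the diff itself, a minimal usage sketch of the API this commit introduces. The helper exampleUsage and its arguments are hypothetical; NewDiskOverflowQueue, DefaultMaxTasksInMemQueue, Start, Stop, QueueBackupID and NextBackupID are taken from the diff below:

// Example wiring (sketch): db is any wtdb.Queue[*wtdb.BackupID]
// implementation and log is a btclog.Logger.
func exampleUsage(db wtdb.Queue[*wtdb.BackupID], log btclog.Logger) error {
	q, err := NewDiskOverflowQueue[*wtdb.BackupID](
		db, DefaultMaxTasksInMemQueue, log,
	)
	if err != nil {
		return err
	}
	if err := q.Start(); err != nil {
		return err
	}
	defer func() { _ = q.Stop() }()

	// Producer side: QueueBackupID never blocks. Once the in-memory
	// buffer is full, new items overflow to disk.
	err = q.QueueBackupID(&wtdb.BackupID{CommitHeight: 1})
	if err != nil {
		return err
	}

	// Consumer side: read items from the head of the queue.
	task := <-q.NextBackupID()
	_ = task // hand the task to the backup pipeline

	return nil
}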
@@ -51,6 +51,10 @@ const (
	// random number of blocks to delay closing a session after its last
	// channel has been closed.
	DefaultSessionCloseRange = 288

	// DefaultMaxTasksInMemQueue is the maximum number of items to be held
	// in the in-memory queue.
	DefaultMaxTasksInMemQueue = 2000
)

// genSessionFilter constructs a filter that can be used to select sessions only
watchtower/wtclient/queue.go (new file, 566 lines)
@@ -0,0 +1,566 @@
package wtclient

import (
	"container/list"
	"errors"
	"sync"
	"sync/atomic"
	"time"

	"github.com/btcsuite/btclog"
	"github.com/lightningnetwork/lnd/watchtower/wtdb"
)

const (
	// dbErrorBackoff is the length of time we will back off before
	// retrying any DB action that failed.
	dbErrorBackoff = time.Second * 5
)

// internalTask wraps a queue task with a success channel.
type internalTask[T any] struct {
	task    T
	success chan bool
}

// newInternalTask creates a new internalTask with the given task.
func newInternalTask[T any](task T) *internalTask[T] {
	return &internalTask[T]{
		task:    task,
		success: make(chan bool),
	}
}
// DiskOverflowQueue is a queue that must be initialised with a certain maximum
// buffer size which represents the maximum number of elements that the queue
// should hold in memory. If the queue is full, then any new elements added to
// the queue will be persisted to disk instead. Once a consumer starts reading
// from the front of the queue again, items on disk will be moved back into
// the in-memory queue. The queue is also restart-safe: when it is stopped,
// any items in the memory queue will be persisted to disk, and on start-up
// the queue will be re-initialised with the items on disk.
type DiskOverflowQueue[T any] struct {
	startOnce sync.Once
	stopOnce  sync.Once

	log btclog.Logger

	// db is the database that will be used to persist queue items to disk.
	db wtdb.Queue[T]

	// toDisk represents the current mode of operation of the queue.
	toDisk atomic.Bool

	// We use an unbounded list for the input of the queue so that
	// producers putting items into the queue are never blocked.
	inputListMu   sync.Mutex
	inputListCond *sync.Cond
	inputList     *list.List

	// inputChan is an unbuffered channel used to pass items from
	// drainInputList to feedMemQueue.
	inputChan chan *internalTask[T]

	// memQueue is a buffered channel used to pass items from
	// feedMemQueue to feedOutputChan.
	memQueue chan T

	// outputChan is an unbuffered channel from which items at the head of
	// the queue can be read.
	outputChan chan T

	// newDiskItemSignal is used to signal that there is a new item in the
	// main disk queue. There should only be one reader and one writer for
	// this channel.
	newDiskItemSignal chan struct{}

	// leftOverItem1 will be a non-nil task on shutdown if the
	// feedOutputChan method was holding an unhandled task at shutdown
	// time. Since feedOutputChan handles the very head of the queue, this
	// item should be the first to be reloaded on restart.
	leftOverItem1 *T

	// leftOverItems2 will be non-empty on shutdown if the feedMemQueue
	// method was holding any unhandled tasks at shutdown time. Since
	// feedMemQueue manages the input to the queue, the tasks should be
	// pushed to the head of the disk queue.
	leftOverItems2 []T

	// leftOverItem3 will be non-nil on shutdown if drainInputList was
	// holding an unhandled task at shutdown time. This task should be put
	// at the tail of the disk queue but should come before any input list
	// task.
	leftOverItem3 *T

	quit chan struct{}
	wg   sync.WaitGroup
}

// NewDiskOverflowQueue constructs a new DiskOverflowQueue.
func NewDiskOverflowQueue[T any](db wtdb.Queue[T], maxQueueSize uint64,
	logger btclog.Logger) (*DiskOverflowQueue[T], error) {

	if maxQueueSize < 2 {
		return nil, errors.New("the in-memory queue buffer size " +
			"must be at least 2")
	}

	q := &DiskOverflowQueue[T]{
		log:               logger,
		db:                db,
		inputList:         list.New(),
		newDiskItemSignal: make(chan struct{}, 1),
		inputChan:         make(chan *internalTask[T]),
		memQueue:          make(chan T, maxQueueSize-2),
		outputChan:        make(chan T),
		quit:              make(chan struct{}),
	}
	q.inputListCond = sync.NewCond(&q.inputListMu)

	return q, nil
}
// Start kicks off all the goroutines that are required to manage the queue.
func (q *DiskOverflowQueue[T]) Start() error {
	var err error
	q.startOnce.Do(func() {
		err = q.start()
	})

	return err
}

// start kicks off all the goroutines that are required to manage the queue.
func (q *DiskOverflowQueue[T]) start() error {
	numDisk, err := q.db.Len()
	if err != nil {
		return err
	}
	if numDisk != 0 {
		q.toDisk.Store(true)
	}

	// Kick off the three goroutines which will handle the input list, the
	// in-memory queue and the output channel.
	// The three goroutines are moving items according to the following
	// diagram:
	//
	//    ┌─────────┐ drainInputList  ┌──────────┐
	//    │inputList├─────┬──────────►│disk/db   │
	//    └─────────┘     │           └──────────┘
	//                    │ (depending on mode)
	//                    │           ┌──────────┐
	//                    └──────────►│inputChan │
	//                                └──────────┘
	//
	//    ┌─────────┐ feedMemQueue    ┌──────────┐
	//    │disk/db  ├───────┬────────►│memQueue  │
	//    └─────────┘       │         └──────────┘
	//                      │ (depending on mode)
	//    ┌─────────┐       │
	//    │inputChan├───────┘
	//    └─────────┘
	//
	//    ┌─────────┐ feedOutputChan  ┌──────────┐
	//    │memQueue ├────────────────►│outputChan│
	//    └─────────┘                 └──────────┘
	//
	q.wg.Add(3)
	go q.drainInputList()
	go q.feedMemQueue()
	go q.feedOutputChan()

	return nil
}
// Stop stops the queue and persists any items in the memory queue to disk.
func (q *DiskOverflowQueue[T]) Stop() error {
	var err error
	q.stopOnce.Do(func() {
		err = q.stop()
	})

	return err
}

// stop stops the queue and persists any items in the memory queue to disk.
func (q *DiskOverflowQueue[T]) stop() error {
	close(q.quit)

	// Signal on the inputListCond until all the goroutines have returned.
	shutdown := make(chan struct{})
	go func() {
		for {
			select {
			case <-time.After(time.Millisecond):
				q.inputListCond.Signal()
			case <-shutdown:
				return
			}
		}
	}()

	q.wg.Wait()
	close(shutdown)

	// queueHead will be the items that will be pushed to the head of the
	// queue.
	var queueHead []T

	// First, we append leftOverItem1 since this task is the current head
	// of the queue.
	if q.leftOverItem1 != nil {
		queueHead = append(queueHead, *q.leftOverItem1)
	}

	// Next, drain the buffered queue.
	for {
		task, ok := <-q.memQueue
		if !ok {
			break
		}

		queueHead = append(queueHead, task)
	}

	// Then, any items held in leftOverItems2 would have been next to join
	// the memQueue. So those get added next.
	if len(q.leftOverItems2) != 0 {
		queueHead = append(queueHead, q.leftOverItems2...)
	}

	// Now, push these items to the head of the queue.
	err := q.db.PushHead(queueHead...)
	if err != nil {
		q.log.Errorf("Could not add tasks to queue head: %v", err)
	}

	// Next we handle any items that need to be added to the main disk
	// queue.
	var diskQueue []T

	// Any item in leftOverItem3 is the first item that should join the
	// disk queue.
	if q.leftOverItem3 != nil {
		diskQueue = append(diskQueue, *q.leftOverItem3)
	}

	// Lastly, drain any items in the unbounded input list.
	q.inputListCond.L.Lock()
	for q.inputList.Front() != nil {
		e := q.inputList.Front()

		//nolint:forcetypeassert
		task := q.inputList.Remove(e).(T)

		diskQueue = append(diskQueue, task)
	}
	q.inputListCond.L.Unlock()

	// Now persist these items to the main disk queue.
	err = q.db.Push(diskQueue...)
	if err != nil {
		q.log.Errorf("Could not add tasks to queue tail: %v", err)
	}

	return nil
}
// QueueBackupID adds a wtdb.BackupID to the queue. It will only return an
// error if the queue has been stopped. It is non-blocking.
func (q *DiskOverflowQueue[T]) QueueBackupID(item *wtdb.BackupID) error {
	// Return an error if the queue has been stopped.
	select {
	case <-q.quit:
		return ErrClientExiting
	default:
	}

	// Add the new item to the unbounded input list.
	q.inputListCond.L.Lock()
	q.inputList.PushBack(item)
	q.inputListCond.L.Unlock()

	// Signal that there is a new item in the input list.
	q.inputListCond.Signal()

	return nil
}

// NextBackupID can be used to read from the head of the DiskOverflowQueue.
func (q *DiskOverflowQueue[T]) NextBackupID() <-chan T {
	return q.outputChan
}
// drainInputList handles the input to the DiskOverflowQueue. It takes from the
// unbounded input list and then, depending on what mode the queue is in,
// either puts the new item straight onto the persisted disk queue or attempts
// to feed it into the memQueue. On exit, any unhandled task will be assigned
// to leftOverItem3.
func (q *DiskOverflowQueue[T]) drainInputList() {
	defer q.wg.Done()

	for {
		// Wait for the input list to not be empty.
		q.inputListCond.L.Lock()
		for q.inputList.Front() == nil {
			q.inputListCond.Wait()

			select {
			case <-q.quit:
				q.inputListCond.L.Unlock()
				return
			default:
			}
		}

		// Pop the first element from the queue.
		e := q.inputList.Front()

		//nolint:forcetypeassert
		task := q.inputList.Remove(e).(T)
		q.inputListCond.L.Unlock()

		// What we do with this new item depends on what the mode of
		// the queue currently is.
		for q.pushToActiveQueue(task) {
		}

		// If the above returned false because the quit channel was
		// closed, then we exit.
		select {
		case <-q.quit:
			return
		default:
		}
	}
}
// pushToActiveQueue handles the input of a new task to the queue. It returns
// true if the task should be retried and false if the task was handled or the
// quit channel fired.
func (q *DiskOverflowQueue[T]) pushToActiveQueue(task T) bool {
	// If the queue is in disk mode then any new items should be put
	// straight into the disk queue.
	if q.toDisk.Load() {
		err := q.db.Push(task)
		if err != nil {
			// Log and back off for a few seconds and then try
			// again with the same task.
			q.log.Errorf("could not persist %s to disk. "+
				"Retrying after backoff", task)

			select {
			// Back off for a bit and then re-check the mode and
			// try again to handle the task.
			case <-time.After(dbErrorBackoff):
				return true

			// If the queue is quit at this moment, then the
			// unhandled task is assigned to leftOverItem3 so that
			// it can be handled by the stop method.
			case <-q.quit:
				q.leftOverItem3 = &task

				return false
			}
		}

		// Send a signal that there is a new item in the main disk
		// queue.
		select {
		case q.newDiskItemSignal <- struct{}{}:
		case <-q.quit:

		// Because there might already be a signal in the
		// newDiskItemSignal channel, we can skip sending another
		// signal. The channel only has a buffer of one, so we would
		// block here if we didn't have a default case.
		default:
		}

		// If we got here, we were able to store the task in the disk
		// queue, so we can return false as no retry is necessary.
		return false
	}

	// If the mode is memory mode, then try to feed it to the feedMemQueue
	// handler via the unbuffered inputChan channel. We wrap it in an
	// internal task so that we can find out if feedMemQueue successfully
	// handled the item. If it did, we continue in memory mode and if not,
	// then we switch to disk mode so that we can persist the item to the
	// disk queue instead.
	it := newInternalTask(task)

	select {
	// Try to feed the task to the feedMemQueue handler. The handler, if
	// it does take the task, is guaranteed to respond via the success
	// channel of the task to indicate if the task was successfully added
	// to the in-mem queue. This is guaranteed even if the queue is being
	// stopped.
	case q.inputChan <- it:

	// If the queue is quit at this moment, then the unhandled task is
	// assigned to leftOverItem3 so that it can be handled by the stop
	// method.
	case <-q.quit:
		q.leftOverItem3 = &task

		return false

	default:
		// The task was not accepted. So maybe the mode changed.
		return true
	}

	// If we get here, it means that the feedMemQueue handler took the
	// task. It is guaranteed to respond via the success channel, so we
	// wait for that response here.
	s := <-it.success
	if s {
		return false
	}

	// If the task was not successfully handled by feedMemQueue, then we
	// switch to disk mode so that the task can be persisted in the disk
	// queue instead.
	q.toDisk.Store(true)

	return true
}
// feedMemQueue manages which items should be fed onto the buffered memQueue.
// If the queue is in disk mode, then the handler will read new tasks from the
// disk queue until it is empty. After that, it will switch between reading
// from the input channel or the disk queue depending on the queue mode.
func (q *DiskOverflowQueue[T]) feedMemQueue() {
	defer func() {
		close(q.memQueue)
		q.wg.Done()
	}()

	feedFromDisk := func() {
		select {
		case <-q.quit:
			return
		default:
		}

		for {
			// Ideally, we want to do batch reads from the DB. So
			// we check how much capacity there is in the memQueue
			// and fetch enough tasks to fill that capacity. If
			// there is no capacity, however, then we at least want
			// to fetch one task.
			numToPop := cap(q.memQueue) - len(q.memQueue)
			if numToPop == 0 {
				numToPop = 1
			}

			tasks, err := q.db.PopUpTo(numToPop)
			if errors.Is(err, wtdb.ErrEmptyQueue) {
				q.toDisk.Store(false)

				return
			} else if err != nil {
				q.log.Errorf("Could not load next task from " +
					"disk. Retrying.")

				select {
				case <-time.After(dbErrorBackoff):
					continue
				case <-q.quit:
					return
				}
			}

			for i, task := range tasks {
				select {
				case q.memQueue <- task:

				// If the queue is quit at this moment, then
				// the unhandled tasks are assigned to
				// leftOverItems2 so that they can be handled
				// by the stop method.
				case <-q.quit:
					q.leftOverItems2 = tasks[i:]
					return
				}
			}
		}
	}

	// If the queue is in disk mode, then the memQueue is fed with tasks
	// from the disk queue until it is empty.
	if q.toDisk.Load() {
		feedFromDisk()
	}

	// Now the queue enters its normal operation.
	for {
		select {
		case <-q.quit:
			return

		// If there is a signal that a new item has been added to disk
		// then we use the disk queue as the source of the next task
		// to feed into memQueue.
		case <-q.newDiskItemSignal:
			feedFromDisk()

		// If any items come through on the inputChan, then we try to
		// feed these directly into the memQueue. If there is space in
		// the memQueue then we respond with success to the producer,
		// otherwise we respond with failure so that the producer can
		// instead persist the task to disk. After the producer,
		// drainInputList, has pushed an item to inputChan, it is
		// guaranteed to await a response on the task's success channel
		// before quitting. Therefore, it is not required to listen on
		// the quit channel here.
		case task := <-q.inputChan:
			select {
			case q.memQueue <- task.task:
				task.success <- true
				continue
			default:
				task.success <- false
			}
		}
	}
}
// feedOutputChan will pop an item from the buffered memQueue and block until
// the item is taken from the unbuffered outputChan. This is done repeatedly
// for the lifetime of the DiskOverflowQueue. On shutdown of the queue, any
// item not consumed by the outputChan but held by this method is assigned to
// the leftOverItem1 member so that the Stop method can persist the item to
// disk and it can be reloaded on restart.
//
// NOTE: This must be run as a goroutine.
func (q *DiskOverflowQueue[T]) feedOutputChan() {
	defer func() {
		close(q.outputChan)
		q.wg.Done()
	}()

	for {
		select {
		case nextTask, ok := <-q.memQueue:
			// If the memQueue is closed, then the queue is
			// stopping.
			if !ok {
				return
			}

			select {
			case q.outputChan <- nextTask:
			case <-q.quit:
				q.leftOverItem1 = &nextTask
				return
			}

		case <-q.quit:
			return
		}
	}
}
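queue.go depends on a wtdb.Queue[T] backing store that is not part of this diff. From the calls made above (Len, Push, PushHead, PopUpTo and the wtdb.ErrEmptyQueue sentinel), its shape is roughly the following sketch; the name diskQueue and the exact signatures are inferred, not copied from the wtdb package:

// diskQueue is the assumed shape of the disk-backed queue used above.
type diskQueue[T any] interface {
	// Len returns the number of items currently persisted to disk.
	Len() (uint64, error)

	// Push adds items to the tail of the disk queue.
	Push(items ...T) error

	// PushHead pushes items back onto the head of the disk queue, ahead
	// of any existing items.
	PushHead(items ...T) error

	// PopUpTo pops at most n items from the head of the disk queue. It
	// returns an error matching wtdb.ErrEmptyQueue when the queue is
	// empty.
	PopUpTo(n int) ([]T, error)
}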
watchtower/wtclient/queue_test.go (new file, 435 lines)
@@ -0,0 +1,435 @@
package wtclient

import (
	"sync"
	"testing"
	"time"

	"github.com/btcsuite/btclog"
	"github.com/lightningnetwork/lnd/kvdb"
	"github.com/lightningnetwork/lnd/lntest/wait"
	"github.com/lightningnetwork/lnd/watchtower/wtdb"
	"github.com/lightningnetwork/lnd/watchtower/wtmock"
	"github.com/stretchr/testify/require"
)

const (
	maxInMemItems = 5
	waitTime      = time.Second * 2
)

type initQueue func(t *testing.T) wtdb.Queue[*wtdb.BackupID]
// TestDiskOverflowQueue tests that the DiskOverflowQueue behaves as expected.
func TestDiskOverflowQueue(t *testing.T) {
	t.Parallel()

	dbs := []struct {
		name string
		init initQueue
	}{
		{
			name: "kvdb",
			init: func(t *testing.T) wtdb.Queue[*wtdb.BackupID] {
				dbCfg := &kvdb.BoltConfig{
					DBTimeout: kvdb.DefaultDBTimeout,
				}

				bdb, err := wtdb.NewBoltBackendCreator(
					true, t.TempDir(), "wtclient.db",
				)(dbCfg)
				require.NoError(t, err)

				db, err := wtdb.OpenClientDB(bdb)
				require.NoError(t, err)

				t.Cleanup(func() {
					db.Close()
				})

				return db.GetDBQueue([]byte("test-namespace"))
			},
		},
		{
			name: "mock",
			init: func(t *testing.T) wtdb.Queue[*wtdb.BackupID] {
				db := wtmock.NewClientDB()

				return db.GetDBQueue([]byte("test-namespace"))
			},
		},
	}

	tests := []struct {
		name string
		run  func(*testing.T, initQueue)
	}{
		{
			name: "overflow to disk",
			run:  testOverflowToDisk,
		},
		{
			name: "startup with smaller buffer size",
			run:  testRestartWithSmallerBufferSize,
		},
		{
			name: "start stop queue",
			run:  testStartStopQueue,
		},
	}

	for _, database := range dbs {
		db := database
		t.Run(db.name, func(t *testing.T) {
			t.Parallel()

			for _, test := range tests {
				t.Run(test.name, func(t *testing.T) {
					test.run(t, db.init)
				})
			}
		})
	}
}
// testOverflowToDisk is a basic test that ensures that the queue correctly
// overflows items to disk and then correctly reloads them.
func testOverflowToDisk(t *testing.T, initQueue initQueue) {
	// Generate some backup IDs that we want to add to the queue.
	tasks := genBackupIDs(10)

	// Init the DB.
	db := initQueue(t)

	// New mock logger.
	log := newMockLogger(t.Logf)

	// Init the queue with the mock DB.
	q, err := NewDiskOverflowQueue[*wtdb.BackupID](
		db, maxInMemItems, log,
	)
	require.NoError(t, err)

	// Start the queue.
	require.NoError(t, q.Start())

	// Initially there should be no items on disk.
	assertNumDisk(t, db, 0)

	// Start filling up the queue.
	enqueue(t, q, tasks[0])
	enqueue(t, q, tasks[1])
	enqueue(t, q, tasks[2])
	enqueue(t, q, tasks[3])
	enqueue(t, q, tasks[4])

	// The queue should now be full, so any new items should be persisted
	// to disk.
	enqueue(t, q, tasks[5])
	waitForNumDisk(t, db, 1)

	// Now pop all items from the queue to ensure that the item from disk
	// is loaded in properly once there is space.
	require.Equal(t, tasks[0], getNext(t, q, 0))
	require.Equal(t, tasks[1], getNext(t, q, 1))
	require.Equal(t, tasks[2], getNext(t, q, 2))
	require.Equal(t, tasks[3], getNext(t, q, 3))
	require.Equal(t, tasks[4], getNext(t, q, 4))
	require.Equal(t, tasks[5], getNext(t, q, 5))

	// There should no longer be any items in the disk queue.
	assertNumDisk(t, db, 0)

	require.NoError(t, q.Stop())
}
// testRestartWithSmallerBufferSize tests that if the queue is restarted with
// a smaller in-memory buffer size than it was initially started with, then
// tasks are still loaded in the correct order.
func testRestartWithSmallerBufferSize(t *testing.T, newQueue initQueue) {
	const (
		firstMaxInMemItems  = 5
		secondMaxInMemItems = 2
	)

	// Generate some backup IDs that we want to add to the queue.
	tasks := genBackupIDs(10)

	// Create a db.
	db := newQueue(t)

	// New mock logger.
	log := newMockLogger(t.Logf)

	// Init the queue with the mock DB and an initial max in-mem items
	// number.
	q, err := NewDiskOverflowQueue[*wtdb.BackupID](
		db, firstMaxInMemItems, log,
	)
	require.NoError(t, err)
	require.NoError(t, q.Start())

	// Add 7 items to the queue. The first 5 will go into the in-mem
	// queue, the other 2 will be persisted to the main disk queue.
	enqueue(t, q, tasks[0])
	enqueue(t, q, tasks[1])
	enqueue(t, q, tasks[2])
	enqueue(t, q, tasks[3])
	enqueue(t, q, tasks[4])
	enqueue(t, q, tasks[5])
	enqueue(t, q, tasks[6])

	waitForNumDisk(t, db, 2)

	// Now stop the queue and re-initialise it with a smaller buffer
	// maximum.
	require.NoError(t, q.Stop())

	// Check that there are now 7 items in the disk queue.
	waitForNumDisk(t, db, 7)

	// Re-init the queue with a smaller max buffer size.
	q, err = NewDiskOverflowQueue[*wtdb.BackupID](
		db, secondMaxInMemItems, log,
	)
	require.NoError(t, err)
	require.NoError(t, q.Start())

	// Once more we shall repeat the above restart process just to ensure
	// that in-memory items are correctly re-written and read from the DB.
	waitForNumDisk(t, db, 5)
	require.NoError(t, q.Stop())
	waitForNumDisk(t, db, 7)
	q, err = NewDiskOverflowQueue[*wtdb.BackupID](
		db, secondMaxInMemItems, log,
	)
	require.NoError(t, err)
	require.NoError(t, q.Start())
	waitForNumDisk(t, db, 5)

	// Make sure that items are popped off the queue in the correct order.
	require.Equal(t, tasks[0], getNext(t, q, 0))
	require.Equal(t, tasks[1], getNext(t, q, 1))
	require.Equal(t, tasks[2], getNext(t, q, 2))
	require.Equal(t, tasks[3], getNext(t, q, 3))
	require.Equal(t, tasks[4], getNext(t, q, 4))
	require.Equal(t, tasks[5], getNext(t, q, 5))
	require.Equal(t, tasks[6], getNext(t, q, 6))

	require.NoError(t, q.Stop())
}
// testStartStopQueue is a stress test that pushes a large number of tasks
// through the queue while also restarting the queue a couple of times
// throughout.
func testStartStopQueue(t *testing.T, newQueue initQueue) {
	// Generate a lot of backup IDs that we want to add to the queue one
	// after the other.
	tasks := genBackupIDs(200_000)

	// Construct the ClientDB.
	db := newQueue(t)

	// New mock logger.
	log := newMockLogger(t.Logf)

	// Init the queue with the mock DB.
	q, err := NewDiskOverflowQueue[*wtdb.BackupID](
		db, DefaultMaxTasksInMemQueue, log,
	)
	require.NoError(t, err)

	// Start the queue.
	require.NoError(t, q.Start())

	// Initially there should be no items on disk.
	assertNumDisk(t, db, 0)

	// We need to guard the queue with a mutex since we will be stopping,
	// re-creating and starting the queue multiple times.
	var (
		queueMtx sync.RWMutex
		wg       sync.WaitGroup
		sendDone = make(chan struct{})
	)

	// This goroutine will constantly try to add new items to the queue,
	// even if the queue is stopped.
	wg.Add(1)
	go func() {
		defer wg.Done()

		for idx := range tasks {
			queueMtx.RLock()
			err := q.QueueBackupID(tasks[idx])
			require.NoError(t, err)
			queueMtx.RUnlock()
		}
	}()

	// This goroutine will repeatedly stop, re-create and start the queue
	// until we're done sending items.
	wg.Add(1)
	go func() {
		defer wg.Done()

		numRestarts := 0
		for {
			select {
			case <-sendDone:
				t.Logf("Restarted queue %d times",
					numRestarts)

				return
			case <-time.After(100 * time.Millisecond):
			}

			queueMtx.Lock()
			require.NoError(t, q.Stop())
			q, err = NewDiskOverflowQueue[*wtdb.BackupID](
				db, DefaultMaxTasksInMemQueue, log,
			)
			require.NoError(t, err)
			require.NoError(t, q.Start())
			queueMtx.Unlock()

			numRestarts++
		}
	}()

	// We should be able to read all items from the queue, not being
	// affected by restarts, other than needing to wait for the queue to
	// be started again.
	results := make([]*wtdb.BackupID, 0, len(tasks))
	for i := 0; i < len(tasks); i++ {
		queueMtx.RLock()
		task := getNext(t, q, i)
		queueMtx.RUnlock()

		results = append(results, task)
	}
	close(sendDone)
	require.Equal(t, tasks, results)

	require.NoError(t, q.Stop())
	wg.Wait()
}
func getNext(t *testing.T, q *DiskOverflowQueue[*wtdb.BackupID],
	i int) *wtdb.BackupID {

	var item *wtdb.BackupID
	select {
	case item = <-q.NextBackupID():
	case <-time.After(waitTime):
		t.Fatalf("task %d not received in time", i)
	}

	return item
}

func enqueue(t *testing.T, q *DiskOverflowQueue[*wtdb.BackupID],
	task *wtdb.BackupID) {

	err := q.QueueBackupID(task)
	require.NoError(t, err)
}

func waitForNumDisk(t *testing.T, db wtdb.Queue[*wtdb.BackupID], num int) {
	err := wait.Predicate(func() bool {
		n, err := db.Len()
		require.NoError(t, err)

		return n == uint64(num)
	}, waitTime)
	require.NoError(t, err)
}

func assertNumDisk(t *testing.T, db wtdb.Queue[*wtdb.BackupID], num int) {
	n, err := db.Len()
	require.NoError(t, err)
	require.EqualValues(t, num, n)
}

func genBackupIDs(num int) []*wtdb.BackupID {
	ids := make([]*wtdb.BackupID, num)
	for i := 0; i < num; i++ {
		ids[i] = newBackupID(i)
	}

	return ids
}

func newBackupID(id int) *wtdb.BackupID {
	return &wtdb.BackupID{CommitHeight: uint64(id)}
}
// BenchmarkDiskOverflowQueue benchmarks the performance of adding and removing
// items from the DiskOverflowQueue using an in-memory disk db.
func BenchmarkDiskOverflowQueue(b *testing.B) {
	enqueue := func(q *DiskOverflowQueue[*wtdb.BackupID],
		task *wtdb.BackupID) {

		err := q.QueueBackupID(task)
		require.NoError(b, err)
	}

	getNext := func(q *DiskOverflowQueue[*wtdb.BackupID],
		i int) *wtdb.BackupID {

		var item *wtdb.BackupID
		select {
		case item = <-q.NextBackupID():
		case <-time.After(time.Second * 2):
			b.Fatalf("task %d not received in time", i)
		}

		return item
	}

	// Generate some backup IDs that we want to add to the queue.
	tasks := genBackupIDs(b.N)

	// Create a mock db.
	db := wtmock.NewQueueDB[*wtdb.BackupID]()

	// New mock logger.
	log := newMockLogger(b.Logf)

	// Init the queue with the mock DB.
	q, err := NewDiskOverflowQueue[*wtdb.BackupID](db, 5, log)
	require.NoError(b, err)

	// Start the queue.
	require.NoError(b, q.Start())

	// Start filling up the queue.
	for n := 0; n < b.N; n++ {
		enqueue(q, tasks[n])
	}

	// Pop all the items off the queue.
	for n := 0; n < b.N; n++ {
		require.Equal(b, tasks[n], getNext(q, n))
	}

	require.NoError(b, q.Stop())
}
type mockLogger struct {
	log func(string, ...any)

	btclog.Logger
}

func newMockLogger(logger func(string, ...any)) *mockLogger {
	return &mockLogger{log: logger}
}

// Errorf formats message according to format specifier and writes to log.
//
// NOTE: this is part of the btclog.Logger interface.
func (l *mockLogger) Errorf(format string, params ...any) {
	l.log("[ERR]: "+format, params...)
}
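For reference, a minimal slice-backed stand-in for the disk queue, satisfying the diskQueue interface sketched after queue.go above. The type sliceQueue is hypothetical and illustrative only; it is not the wtmock implementation used by the tests:

// sliceQueue is a toy in-memory implementation of the assumed disk-queue
// interface. It assumes the surrounding package's sync and wtdb imports.
type sliceQueue[T any] struct {
	mu    sync.Mutex
	items []T
}

// Len returns the number of queued items.
func (s *sliceQueue[T]) Len() (uint64, error) {
	s.mu.Lock()
	defer s.mu.Unlock()

	return uint64(len(s.items)), nil
}

// Push appends items to the tail of the queue.
func (s *sliceQueue[T]) Push(items ...T) error {
	s.mu.Lock()
	defer s.mu.Unlock()

	s.items = append(s.items, items...)

	return nil
}

// PushHead prepends items to the head of the queue.
func (s *sliceQueue[T]) PushHead(items ...T) error {
	s.mu.Lock()
	defer s.mu.Unlock()

	s.items = append(append([]T{}, items...), s.items...)

	return nil
}

// PopUpTo pops at most n items from the head of the queue, returning
// wtdb.ErrEmptyQueue if there is nothing to pop.
func (s *sliceQueue[T]) PopUpTo(n int) ([]T, error) {
	s.mu.Lock()
	defer s.mu.Unlock()

	if len(s.items) == 0 {
		return nil, wtdb.ErrEmptyQueue
	}
	if n > len(s.items) {
		n = len(s.items)
	}

	popped := s.items[:n]
	s.items = s.items[n:]

	return popped, nil
}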