lnd/chanfitness/chanevent.go

package chanfitness

import (
	"fmt"
	"time"

	"github.com/btcsuite/btcd/wire"
	"github.com/lightningnetwork/lnd/clock"
)

type eventType int

const (
	peerOnlineEvent eventType = iota
	peerOfflineEvent
)

// String provides string representations of channel events.
func (e eventType) String() string {
	switch e {
	case peerOnlineEvent:
		return "peer_online"

	case peerOfflineEvent:
		return "peer_offline"
	}

	return "unknown"
}

type event struct {
	timestamp time.Time
	eventType eventType
}

// peerLog tracks events for a peer and its channels. If we currently have no
// channels with the peer, it will simply track its current online state. If we
// do have channels open with the peer, it will track the peer's online and
// offline events so that we can calculate uptime for our channels. A single
// event log is used for these online and offline events, and uptime for a
// channel is calculated by examining a subsection of this log.
type peerLog struct {
	// online stores whether the peer is currently online.
	online bool

	// onlineEvents is a log of timestamped events observed for the peer
	// that we have committed to allocating memory to.
	onlineEvents []*event

	// stagedEvent represents an event that is pending addition to the
	// events list. It has not yet been added because we rate limit the
	// frequency that we store events at. We need to store this value
	// in the log (rather than just ignore events) so that we can flush the
	// aggregate outcome to our event log once the rate limiting period has
	// ended.
	//
	// Take the following example:
	// - Peer online event recorded
	// - Peer offline event, not recorded due to rate limit
	// - No more events, we incorrectly believe our peer to be online
	// Instead of skipping events, we stage the most recent event during the
	// rate limited period so that we know what happened (on aggregate)
	// while we were rate limiting events.
	//
	// Note that we currently only store offline/online events so we can
	// use this field to track our online state. With the addition of other
	// event types, we need to only stage online/offline events, or split
	// them out.
	stagedEvent *event

	// flapCount is the number of times this peer has been observed as
	// going offline.
	flapCount int

	// lastFlap is the timestamp of the last flap we recorded for the peer.
	// This value will be nil if we have never recorded a flap for the peer.
	lastFlap *time.Time

	// clock allows creation of deterministic unit tests.
	clock clock.Clock

	// channels contains a set of currently open channels. Channels will be
	// added and removed from this map as they are opened and closed.
	channels map[wire.OutPoint]*channelInfo
}

// newPeerLog creates a log for a peer, taking its historical flap count and
// last flap time as parameters. These values may be zero/nil if we have no
// record of historical flap count for the peer.
func newPeerLog(clock clock.Clock, flapCount int,
	lastFlap *time.Time) *peerLog {

	return &peerLog{
		clock:     clock,
		flapCount: flapCount,
		lastFlap:  lastFlap,
		channels:  make(map[wire.OutPoint]*channelInfo),
	}
}

// channelInfo contains information about a channel.
type channelInfo struct {
	// openedAt tracks the first time this channel was seen. This is not
	// necessarily the time that it confirmed on chain because channel
	// events are not persisted at present.
	openedAt time.Time
}

func newChannelInfo(openedAt time.Time) *channelInfo {
	return &channelInfo{
		openedAt: openedAt,
	}
}

// onlineEvent records a peer online or offline event in the log and increments
// the peer's flap count.
func (p *peerLog) onlineEvent(online bool) {
	eventTime := p.clock.Now()

	// If we have a non-nil last flap time, potentially apply a cooldown
	// factor to the peer's flap count before we rate limit it. This allows
	// us to decrease the penalty for historical flaps over time, provided
	// the peer has not flapped for a while.
	if p.lastFlap != nil {
		p.flapCount = cooldownFlapCount(
			p.clock.Now(), p.flapCount, *p.lastFlap,
		)
	}

	// Record flap count information and online state regardless of whether
	// we have any channels open with this peer.
	p.flapCount++
	p.lastFlap = &eventTime
	p.online = online

	// If we have no channels currently open with the peer, we do not want
	// to commit resources to tracking their online state beyond a simple
	// online boolean, so we exit early.
	if p.channelCount() == 0 {
		return
	}

	p.addEvent(online, eventTime)
}

// addEvent records an online or offline event in our event log. and increments
// the peer's flap count.
func (p *peerLog) addEvent(online bool, time time.Time) {
	eventType := peerOnlineEvent
	if !online {
		eventType = peerOfflineEvent
	}

	event := &event{
		timestamp: time,
		eventType: eventType,
	}

	// If we have no staged events, we can just stage this event and return.
	if p.stagedEvent == nil {
		p.stagedEvent = event
		return
	}

	// We get the amount of time we require between events according to
	// peer flap count.
	aggregation := getRateLimit(p.flapCount)
	nextRecordTime := p.stagedEvent.timestamp.Add(aggregation)
	flushEvent := nextRecordTime.Before(event.timestamp)

	// If enough time has passed since our last staged event, we add our
	// event to our in-memory list.
	if flushEvent {
		p.onlineEvents = append(p.onlineEvents, p.stagedEvent)
	}

	// Finally, we replace our staged event with the new event we received.
	p.stagedEvent = event
}

// addChannel adds a channel to our log. If we have not tracked any online
// events for our peer yet, we create one with our peer's current online state
// so that we know the state that the peer had at channel start, which is
// required to calculate uptime over the channel's lifetime.
func (p *peerLog) addChannel(channelPoint wire.OutPoint) error {
	_, ok := p.channels[channelPoint]
	if ok {
		return fmt.Errorf("channel: %v already present", channelPoint)
	}

	openTime := p.clock.Now()
	p.channels[channelPoint] = newChannelInfo(openTime)

	// If we do not have any online events tracked for our peer (which is
	// the case when we have no other channels open with the peer), we add
	// an event with the peer's current online state so that we know that
	// starting state for this peer when a channel was connected (which
	// allows us to calculate uptime over the lifetime of the channel).
	if len(p.onlineEvents) == 0 {
		p.addEvent(p.online, openTime)
	}

	return nil
}

// removeChannel removes a channel from our log. If we have no more channels
// with the peer after removing this one, we clear our list of events.
func (p *peerLog) removeChannel(channelPoint wire.OutPoint) error {
	_, ok := p.channels[channelPoint]
	if !ok {
		return fmt.Errorf("channel: %v not present", channelPoint)
	}

	delete(p.channels, channelPoint)

	// If we have no more channels in our event log, we can discard all of
	// our online events in memory, since we don't need them anymore.
	// TODO(carla): this could be done on a per channel basis.
	if p.channelCount() == 0 {
		p.onlineEvents = nil
		p.stagedEvent = nil
	}

	return nil
}

// channelCount returns the number of channels that we currently have
// with the peer.
func (p *peerLog) channelCount() int {
	return len(p.channels)
}

// channelUptime looks up a channel and returns the amount of time that the
// channel has been monitored for and its uptime over this period.
func (p *peerLog) channelUptime(channelPoint wire.OutPoint) (time.Duration,
	time.Duration, error) {

	channel, ok := p.channels[channelPoint]
	if !ok {
		return 0, 0, ErrChannelNotFound
	}

	now := p.clock.Now()

	uptime, err := p.uptime(channel.openedAt, now)
	if err != nil {
		return 0, 0, err
	}

	return now.Sub(channel.openedAt), uptime, nil
}

// getFlapCount returns the peer's flap count and the timestamp that we last
// recorded a flap.
func (p *peerLog) getFlapCount() (int, *time.Time) {
	return p.flapCount, p.lastFlap
}

// listEvents returns all of the events that our event log has tracked,
// including events that are staged for addition to our set of events but have
// not yet been committed to (because we rate limit and store only the aggregate
// outcome over a period).
func (p *peerLog) listEvents() []*event {
	if p.stagedEvent == nil {
		return p.onlineEvents
	}

	return append(p.onlineEvents, p.stagedEvent)
}

// onlinePeriod represents a period of time over which a peer was online.
type onlinePeriod struct {
	start, end time.Time
}

// getOnlinePeriods returns a list of all the periods that the event log has
// recorded the remote peer as being online. In the unexpected case where there
// are no events, the function returns early. Online periods are defined as a
// peer online event which is terminated by a peer offline event. If the event
// log ends on a peer online event, it appends a final period which is
// calculated until the present. This function expects the event log provided
// to be ordered by ascending timestamp, and can tolerate multiple consecutive
// online or offline events.
func (p *peerLog) getOnlinePeriods() []*onlinePeriod {
	events := p.listEvents()

	// Return early if there are no events, there are no online periods.
	if len(events) == 0 {
		return nil
	}

	var (
		// lastEvent tracks the last event that we had that was of
		// a different type to our own. It is used to determine the
		// start time of our online periods when we experience an
		// offline event, and to track our last recorded state.
		lastEvent     *event
		onlinePeriods []*onlinePeriod
	)

	// Loop through all events to build a list of periods that the peer was
	// online. Online periods are added when they are terminated with a peer
	// offline event. If the log ends on an online event, the period between
	// the online event and the present is not tracked. The type of the most
	// recent event is tracked using the offline bool so that we can add a
	// final online period if necessary.
	for _, event := range events {
		switch event.eventType {
		case peerOnlineEvent:
			// If our previous event is nil, we just set it and
			// break out of the switch.
			if lastEvent == nil {
				lastEvent = event
				break
			}

			// If our previous event was an offline event, we update
			// it to this event. We do not do this if it was an
			// online event because duplicate online events would
			// progress our online timestamp forward (rather than
			// keep it at our earliest online event timestamp).
			if lastEvent.eventType == peerOfflineEvent {
				lastEvent = event
			}

		case peerOfflineEvent:
			// If our previous event is nil, we just set it and
			// break out of the switch since we cannot record an
			// online period from this single event.
			if lastEvent == nil {
				lastEvent = event
				break
			}

			// If the last event we saw was an online event, we
			// add an online period to our set and progress our
			// previous event to this offline event. We do not
			// do this if we have had duplicate offline events
			// because we would be tracking the most recent offline
			// event (rather than keep it at our earliest offline
			// event timestamp).
			if lastEvent.eventType == peerOnlineEvent {
				onlinePeriods = append(
					onlinePeriods, &onlinePeriod{
						start: lastEvent.timestamp,
						end:   event.timestamp,
					},
				)

				lastEvent = event
			}
		}
	}

	// If the last event was an peer offline event, we do not need to
	// calculate a final online period and can return online periods as is.
	if lastEvent.eventType == peerOfflineEvent {
		return onlinePeriods
	}

	// The log ended on an online event, so we need to add a final online
	// period which terminates at the present.
	finalEvent := &onlinePeriod{
		start: lastEvent.timestamp,
		end:   p.clock.Now(),
	}

	// Add the final online period to the set and return.
	return append(onlinePeriods, finalEvent)
}

// uptime calculates the total uptime we have recorded for a peer over the
// inclusive range specified. An error is returned if the end of the range is
// before the start or a zero end time is returned.
func (p *peerLog) uptime(start, end time.Time) (time.Duration, error) {
	// Error if we are provided with an invalid range to calculate uptime
	// for.
	if end.Before(start) {
		return 0, fmt.Errorf("end time: %v before start time: %v",
			end, start)
	}
	if end.IsZero() {
		return 0, fmt.Errorf("zero end time")
	}

	var uptime time.Duration

	for _, p := range p.getOnlinePeriods() {
		// The online period ends before the range we're looking at, so
		// we can skip over it.
		if p.end.Before(start) {
			continue
		}
		// The online period starts after the range we're looking at, so
		// can stop calculating uptime.
		if p.start.After(end) {
			break
		}

		// If the online period starts before our range, shift the start
		// time up so that we only calculate uptime from the start of
		// our range.
		if p.start.Before(start) {
			p.start = start
		}

		// If the online period ends after our range, shift the end
		// time forward so that we only calculate uptime until the end
		// of the range.
		if p.end.After(end) {
			p.end = end
		}

		uptime += p.end.Sub(p.start)
	}

	return uptime, nil
}