lnd/chanfitness/chanevent.go
carla 6cf66aea47
chanfitness: cool down flap count for rate limiting
Since we store all-time flap count for a peer, we add a cooldown factor
which will discount poor flap counts in the past. This is only applied
to peers that have not flapped for at least a cooldown period, so that
we do not downgrade our rate limiting for badly behaved peers.
2020-09-08 15:01:02 +02:00

419 lines
13 KiB
Go

package chanfitness
import (
"fmt"
"time"
"github.com/btcsuite/btcd/wire"
"github.com/lightningnetwork/lnd/clock"
)
type eventType int
const (
peerOnlineEvent eventType = iota
peerOfflineEvent
)
// String provides string representations of channel events.
func (e eventType) String() string {
switch e {
case peerOnlineEvent:
return "peer_online"
case peerOfflineEvent:
return "peer_offline"
}
return "unknown"
}
type event struct {
timestamp time.Time
eventType eventType
}
// peerLog tracks events for a peer and its channels. If we currently have no
// channels with the peer, it will simply track its current online state. If we
// do have channels open with the peer, it will track the peer's online and
// offline events so that we can calculate uptime for our channels. A single
// event log is used for these online and offline events, and uptime for a
// channel is calculated by examining a subsection of this log.
type peerLog struct {
// online stores whether the peer is currently online.
online bool
// onlineEvents is a log of timestamped events observed for the peer
// that we have committed to allocating memory to.
onlineEvents []*event
// stagedEvent represents an event that is pending addition to the
// events list. It has not yet been added because we rate limit the
// frequency that we store events at. We need to store this value
// in the log (rather than just ignore events) so that we can flush the
// aggregate outcome to our event log once the rate limiting period has
// ended.
//
// Take the following example:
// - Peer online event recorded
// - Peer offline event, not recorded due to rate limit
// - No more events, we incorrectly believe our peer to be online
// Instead of skipping events, we stage the most recent event during the
// rate limited period so that we know what happened (on aggregate)
// while we were rate limiting events.
//
// Note that we currently only store offline/online events so we can
// use this field to track our online state. With the addition of other
// event types, we need to only stage online/offline events, or split
// them out.
stagedEvent *event
// flapCount is the number of times this peer has been observed as
// going offline.
flapCount int
// lastFlap is the timestamp of the last flap we recorded for the peer.
// This value will be nil if we have never recorded a flap for the peer.
lastFlap *time.Time
// clock allows creation of deterministic unit tests.
clock clock.Clock
// channels contains a set of currently open channels. Channels will be
// added and removed from this map as they are opened and closed.
channels map[wire.OutPoint]*channelInfo
}
// newPeerLog creates a log for a peer, taking its historical flap count and
// last flap time as parameters. These values may be zero/nil if we have no
// record of historical flap count for the peer.
func newPeerLog(clock clock.Clock, flapCount int,
lastFlap *time.Time) *peerLog {
return &peerLog{
clock: clock,
flapCount: flapCount,
lastFlap: lastFlap,
channels: make(map[wire.OutPoint]*channelInfo),
}
}
// channelInfo contains information about a channel.
type channelInfo struct {
// openedAt tracks the first time this channel was seen. This is not
// necessarily the time that it confirmed on chain because channel
// events are not persisted at present.
openedAt time.Time
}
func newChannelInfo(openedAt time.Time) *channelInfo {
return &channelInfo{
openedAt: openedAt,
}
}
// onlineEvent records a peer online or offline event in the log and increments
// the peer's flap count.
func (p *peerLog) onlineEvent(online bool) {
eventTime := p.clock.Now()
// If we have a non-nil last flap time, potentially apply a cooldown
// factor to the peer's flap count before we rate limit it. This allows
// us to decrease the penalty for historical flaps over time, provided
// the peer has not flapped for a while.
if p.lastFlap != nil {
p.flapCount = cooldownFlapCount(
p.clock.Now(), p.flapCount, *p.lastFlap,
)
}
// Record flap count information and online state regardless of whether
// we have any channels open with this peer.
p.flapCount++
p.lastFlap = &eventTime
p.online = online
// If we have no channels currently open with the peer, we do not want
// to commit resources to tracking their online state beyond a simple
// online boolean, so we exit early.
if p.channelCount() == 0 {
return
}
p.addEvent(online, eventTime)
}
// addEvent records an online or offline event in our event log. and increments
// the peer's flap count.
func (p *peerLog) addEvent(online bool, time time.Time) {
eventType := peerOnlineEvent
if !online {
eventType = peerOfflineEvent
}
event := &event{
timestamp: time,
eventType: eventType,
}
// If we have no staged events, we can just stage this event and return.
if p.stagedEvent == nil {
p.stagedEvent = event
return
}
// We get the amount of time we require between events according to
// peer flap count.
aggregation := getRateLimit(p.flapCount)
nextRecordTime := p.stagedEvent.timestamp.Add(aggregation)
flushEvent := nextRecordTime.Before(event.timestamp)
// If enough time has passed since our last staged event, we add our
// event to our in-memory list.
if flushEvent {
p.onlineEvents = append(p.onlineEvents, p.stagedEvent)
}
// Finally, we replace our staged event with the new event we received.
p.stagedEvent = event
}
// addChannel adds a channel to our log. If we have not tracked any online
// events for our peer yet, we create one with our peer's current online state
// so that we know the state that the peer had at channel start, which is
// required to calculate uptime over the channel's lifetime.
func (p *peerLog) addChannel(channelPoint wire.OutPoint) error {
_, ok := p.channels[channelPoint]
if ok {
return fmt.Errorf("channel: %v already present", channelPoint)
}
openTime := p.clock.Now()
p.channels[channelPoint] = newChannelInfo(openTime)
// If we do not have any online events tracked for our peer (which is
// the case when we have no other channels open with the peer), we add
// an event with the peer's current online state so that we know that
// starting state for this peer when a channel was connected (which
// allows us to calculate uptime over the lifetime of the channel).
if len(p.onlineEvents) == 0 {
p.addEvent(p.online, openTime)
}
return nil
}
// removeChannel removes a channel from our log. If we have no more channels
// with the peer after removing this one, we clear our list of events.
func (p *peerLog) removeChannel(channelPoint wire.OutPoint) error {
_, ok := p.channels[channelPoint]
if !ok {
return fmt.Errorf("channel: %v not present", channelPoint)
}
delete(p.channels, channelPoint)
// If we have no more channels in our event log, we can discard all of
// our online events in memory, since we don't need them anymore.
// TODO(carla): this could be done on a per channel basis.
if p.channelCount() == 0 {
p.onlineEvents = nil
p.stagedEvent = nil
}
return nil
}
// channelCount returns the number of channels that we currently have
// with the peer.
func (p *peerLog) channelCount() int {
return len(p.channels)
}
// channelUptime looks up a channel and returns the amount of time that the
// channel has been monitored for and its uptime over this period.
func (p *peerLog) channelUptime(channelPoint wire.OutPoint) (time.Duration,
time.Duration, error) {
channel, ok := p.channels[channelPoint]
if !ok {
return 0, 0, ErrChannelNotFound
}
now := p.clock.Now()
uptime, err := p.uptime(channel.openedAt, now)
if err != nil {
return 0, 0, err
}
return now.Sub(channel.openedAt), uptime, nil
}
// getFlapCount returns the peer's flap count and the timestamp that we last
// recorded a flap.
func (p *peerLog) getFlapCount() (int, *time.Time) {
return p.flapCount, p.lastFlap
}
// listEvents returns all of the events that our event log has tracked,
// including events that are staged for addition to our set of events but have
// not yet been committed to (because we rate limit and store only the aggregate
// outcome over a period).
func (p *peerLog) listEvents() []*event {
if p.stagedEvent == nil {
return p.onlineEvents
}
return append(p.onlineEvents, p.stagedEvent)
}
// onlinePeriod represents a period of time over which a peer was online.
type onlinePeriod struct {
start, end time.Time
}
// getOnlinePeriods returns a list of all the periods that the event log has
// recorded the remote peer as being online. In the unexpected case where there
// are no events, the function returns early. Online periods are defined as a
// peer online event which is terminated by a peer offline event. If the event
// log ends on a peer online event, it appends a final period which is
// calculated until the present. This function expects the event log provided
// to be ordered by ascending timestamp, and can tolerate multiple consecutive
// online or offline events.
func (p *peerLog) getOnlinePeriods() []*onlinePeriod {
events := p.listEvents()
// Return early if there are no events, there are no online periods.
if len(events) == 0 {
return nil
}
var (
// lastEvent tracks the last event that we had that was of
// a different type to our own. It is used to determine the
// start time of our online periods when we experience an
// offline event, and to track our last recorded state.
lastEvent *event
onlinePeriods []*onlinePeriod
)
// Loop through all events to build a list of periods that the peer was
// online. Online periods are added when they are terminated with a peer
// offline event. If the log ends on an online event, the period between
// the online event and the present is not tracked. The type of the most
// recent event is tracked using the offline bool so that we can add a
// final online period if necessary.
for _, event := range events {
switch event.eventType {
case peerOnlineEvent:
// If our previous event is nil, we just set it and
// break out of the switch.
if lastEvent == nil {
lastEvent = event
break
}
// If our previous event was an offline event, we update
// it to this event. We do not do this if it was an
// online event because duplicate online events would
// progress our online timestamp forward (rather than
// keep it at our earliest online event timestamp).
if lastEvent.eventType == peerOfflineEvent {
lastEvent = event
}
case peerOfflineEvent:
// If our previous event is nil, we just set it and
// break out of the switch since we cannot record an
// online period from this single event.
if lastEvent == nil {
lastEvent = event
break
}
// If the last event we saw was an online event, we
// add an online period to our set and progress our
// previous event to this offline event. We do not
// do this if we have had duplicate offline events
// because we would be tracking the most recent offline
// event (rather than keep it at our earliest offline
// event timestamp).
if lastEvent.eventType == peerOnlineEvent {
onlinePeriods = append(
onlinePeriods, &onlinePeriod{
start: lastEvent.timestamp,
end: event.timestamp,
},
)
lastEvent = event
}
}
}
// If the last event was an peer offline event, we do not need to
// calculate a final online period and can return online periods as is.
if lastEvent.eventType == peerOfflineEvent {
return onlinePeriods
}
// The log ended on an online event, so we need to add a final online
// period which terminates at the present.
finalEvent := &onlinePeriod{
start: lastEvent.timestamp,
end: p.clock.Now(),
}
// Add the final online period to the set and return.
return append(onlinePeriods, finalEvent)
}
// uptime calculates the total uptime we have recorded for a peer over the
// inclusive range specified. An error is returned if the end of the range is
// before the start or a zero end time is returned.
func (p *peerLog) uptime(start, end time.Time) (time.Duration, error) {
// Error if we are provided with an invalid range to calculate uptime
// for.
if end.Before(start) {
return 0, fmt.Errorf("end time: %v before start time: %v",
end, start)
}
if end.IsZero() {
return 0, fmt.Errorf("zero end time")
}
var uptime time.Duration
for _, p := range p.getOnlinePeriods() {
// The online period ends before the range we're looking at, so
// we can skip over it.
if p.end.Before(start) {
continue
}
// The online period starts after the range we're looking at, so
// can stop calculating uptime.
if p.start.After(end) {
break
}
// If the online period starts before our range, shift the start
// time up so that we only calculate uptime from the start of
// our range.
if p.start.Before(start) {
p.start = start
}
// If the online period ends before our range, shift the end
// time forward so that we only calculate uptime until the end
// of the range.
if p.end.After(end) {
p.end = end
}
uptime += p.end.Sub(p.start)
}
return uptime, nil
}