consul/agent/grpc-external/services/peerstream/stream_tracker.go

// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: BUSL-1.1

package peerstream

import (
	"fmt"
	"sync"
	"time"

	"github.com/hashicorp/consul/agent/structs"
)

// Tracker contains a map of (PeerID -> MutableStatus).
// As streams are opened and closed we track details about their status.
type Tracker struct {
mu sync.RWMutex
streams map[string]*MutableStatus
// heartbeatTimeout is the max duration a connection is allowed to be
// disconnected before the stream health is reported as non-healthy
heartbeatTimeout time.Duration
// timeNow is a shim for testing.
timeNow func() time.Time
}
func NewTracker(heartbeatTimeout time.Duration) *Tracker {
if heartbeatTimeout == 0 {
heartbeatTimeout = defaultIncomingHeartbeatTimeout
}
return &Tracker{
streams: make(map[string]*MutableStatus),
timeNow: time.Now,
heartbeatTimeout: heartbeatTimeout,
}
}
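
// A minimal usage sketch, not taken from real callers: the peer ID and the
// timeout value below are hypothetical and only illustrate the expected
// lifecycle of a tracked stream.
//
//	tracker := NewTracker(2 * time.Minute)
//
//	// When a replication stream is established for a peer:
//	if _, err := tracker.Connected("peer-id"); err != nil {
//		// There is already an active stream for this peer.
//	}
//
//	// When the stream ends cleanly:
//	tracker.DisconnectedGracefully("peer-id")
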
// setClock is used for debugging purposes only.
func (t *Tracker) setClock(clock func() time.Time) {
if clock == nil {
t.timeNow = time.Now
} else {
t.timeNow = clock
}
}
// Register a stream for a given peer but do not mark it as connected.
func (t *Tracker) Register(id string) (*MutableStatus, error) {
t.mu.Lock()
defer t.mu.Unlock()
status, _, err := t.registerLocked(id, false)
return status, err
}
func (t *Tracker) registerLocked(id string, initAsConnected bool) (*MutableStatus, bool, error) {
status, ok := t.streams[id]
if !ok {
status = newMutableStatus(t.timeNow, initAsConnected)
t.streams[id] = status
return status, true, nil
}
return status, false, nil
}
// Connected registers a stream for a given peer, and marks it as connected.
// It also enforces that there is only one active stream for a peer.
func (t *Tracker) Connected(id string) (*MutableStatus, error) {
t.mu.Lock()
defer t.mu.Unlock()
return t.connectedLocked(id)
}
func (t *Tracker) connectedLocked(id string) (*MutableStatus, error) {
status, newlyRegistered, err := t.registerLocked(id, true)
if err != nil {
return nil, err
} else if newlyRegistered {
return status, nil
}
if status.IsConnected() {
return nil, fmt.Errorf("there is an active stream for the given PeerID %q", id)
}
status.TrackConnected()
return status, nil
}
// DisconnectedGracefully marks the peer id's stream status as disconnected gracefully.
func (t *Tracker) DisconnectedGracefully(id string) {
t.mu.Lock()
defer t.mu.Unlock()
if status, ok := t.streams[id]; ok {
status.TrackDisconnectedGracefully()
}
}
// DisconnectedDueToError marks the peer id's stream status as disconnected due to an error.
func (t *Tracker) DisconnectedDueToError(id string, error string) {
t.mu.Lock()
defer t.mu.Unlock()
if status, ok := t.streams[id]; ok {
status.TrackDisconnectedDueToError(error)
}
}
func (t *Tracker) StreamStatus(id string) (resp Status, found bool) {
t.mu.RLock()
defer t.mu.RUnlock()
s, ok := t.streams[id]
if !ok {
return Status{
NeverConnected: true,
}, false
}
return s.GetStatus(), true
}
func (t *Tracker) ConnectedStreams() map[string]chan struct{} {
t.mu.RLock()
defer t.mu.RUnlock()
resp := make(map[string]chan struct{})
for peer, status := range t.streams {
if status.IsConnected() {
resp[peer] = status.doneCh
}
}
return resp
}
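
// One plausible shutdown pattern, assuming the caller owns stream termination:
// closing a returned done channel asks the corresponding stream handler to send
// a termination message to the peer and exit gracefully.
//
//	for _, doneCh := range tracker.ConnectedStreams() {
//		close(doneCh)
//	}
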
func (t *Tracker) DeleteStatus(id string) {
t.mu.Lock()
defer t.mu.Unlock()
delete(t.streams, id)
}
// IsHealthy calculates the health of a peering status.
// We define a peering as unhealthy if its status has been in the following
// states for longer than the configured incomingHeartbeatTimeout.
// - If it is disconnected
// - If the last received Nack is newer than last received Ack
// - If the last received error is newer than last received success
//
// If none of these conditions apply, we call the peering healthy.
func (t *Tracker) IsHealthy(s Status) bool {
// If stream is in a disconnected state for longer than the configured
// heartbeat timeout, report as unhealthy.
if s.DisconnectTime != nil &&
t.timeNow().Sub(*s.DisconnectTime) > t.heartbeatTimeout {
return false
}
// If last Nack is after last Ack, it means the peer is unable to
// handle our replication message
if s.LastAck == nil {
s.LastAck = &time.Time{}
}
if s.LastNack != nil &&
s.LastNack.After(*s.LastAck) &&
t.timeNow().Sub(*s.LastAck) > t.heartbeatTimeout {
return false
}
// If last recv error is newer than last recv success, we were unable
// to handle the peer's replication message.
if s.LastRecvResourceSuccess == nil {
s.LastRecvResourceSuccess = &time.Time{}
}
if s.LastRecvError != nil &&
s.LastRecvError.After(*s.LastRecvResourceSuccess) &&
t.timeNow().Sub(*s.LastRecvError) > t.heartbeatTimeout {
return false
}
return true
}
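
// Illustrative sketch of the rules above (all values are hypothetical):
//
//	tracker := NewTracker(2 * time.Minute)
//	past := time.Now().Add(-3 * time.Minute)
//
//	tracker.IsHealthy(Status{DisconnectTime: &past})    // false: disconnected longer than the timeout
//	tracker.IsHealthy(Status{LastAck: ptr(time.Now())}) // true: no stale NACKs, errors, or disconnects
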
type MutableStatus struct {
mu sync.RWMutex
// timeNow is a shim for testing.
timeNow func() time.Time
// doneCh allows for shutting down a stream gracefully by sending a termination message
// to the peer before the stream's context is cancelled.
doneCh chan struct{}
Status
}
// Status contains information about the replication stream to a peer cluster.
// TODO(peering): There's a lot of fields here...
type Status struct {
// Connected is true when there is an open stream for the peer.
Connected bool
// NeverConnected is true for peerings that have never connected, false otherwise.
NeverConnected bool
// DisconnectErrorMessage tracks the error that caused the stream to disconnect non-gracefully.
// If the stream is connected or it disconnected gracefully it will be empty.
DisconnectErrorMessage string
// If the status is not connected, DisconnectTime tracks when the stream was closed. Else it's zero.
DisconnectTime *time.Time
// LastAck tracks the time we received the last ACK for a resource replicated TO the peer.
LastAck *time.Time
// LastNack tracks the time we received the last NACK for a resource replicated to the peer.
LastNack *time.Time
// LastNackMessage tracks the reported error message associated with the last NACK from a peer.
LastNackMessage string
// LastSendError tracks the time of the last error sending into the stream.
LastSendError *time.Time
// LastSendErrorMessage tracks the last error message when sending into the stream.
LastSendErrorMessage string
// LastSendSuccess tracks the time we last successfully sent a resource TO the peer.
LastSendSuccess *time.Time
// LastRecvHeartbeat tracks when we last received a heartbeat from our peer.
LastRecvHeartbeat *time.Time
// LastRecvResourceSuccess tracks the time we last successfully stored a resource replicated FROM the peer.
LastRecvResourceSuccess *time.Time
// LastRecvError tracks either:
// - The time we failed to store a resource replicated FROM the peer.
// - The time of the last error when receiving from the stream.
LastRecvError *time.Time
// LastRecvErrorMessage tracks the last error message when receiving from the stream.
LastRecvErrorMessage string
// TODO(peering): consider keeping track of imported and exported services thru raft
// ImportedServices keeps track of which service names are imported for the peer
ImportedServices []string
// ExportedServices keeps track of which service names a peer asks to export
ExportedServices []string
}
func (s *Status) GetImportedServicesCount() uint64 {
return uint64(len(s.ImportedServices))
}
func (s *Status) GetExportedServicesCount() uint64 {
return uint64(len(s.ExportedServices))
}
func newMutableStatus(now func() time.Time, connected bool) *MutableStatus {
return &MutableStatus{
Status: Status{
Connected: connected,
NeverConnected: !connected,
},
timeNow: now,
doneCh: make(chan struct{}),
}
}
func (s *MutableStatus) Done() <-chan struct{} {
return s.doneCh
}
func (s *MutableStatus) TrackAck() {
s.mu.Lock()
s.LastAck = ptr(s.timeNow().UTC())
s.mu.Unlock()
}
func (s *MutableStatus) TrackSendError(error string) {
s.mu.Lock()
s.LastSendError = ptr(s.timeNow().UTC())
s.LastSendErrorMessage = error
s.mu.Unlock()
}
func (s *MutableStatus) TrackSendSuccess() {
s.mu.Lock()
s.LastSendSuccess = ptr(s.timeNow().UTC())
s.mu.Unlock()
}
// TrackRecvResourceSuccess tracks receiving a replicated resource.
func (s *MutableStatus) TrackRecvResourceSuccess() {
s.mu.Lock()
s.LastRecvResourceSuccess = ptr(s.timeNow().UTC())
s.mu.Unlock()
}
// TrackRecvHeartbeat tracks receiving a heartbeat from our peer.
func (s *MutableStatus) TrackRecvHeartbeat() {
s.mu.Lock()
s.LastRecvHeartbeat = ptr(s.timeNow().UTC())
s.mu.Unlock()
}
func (s *MutableStatus) TrackRecvError(error string) {
s.mu.Lock()
s.LastRecvError = ptr(s.timeNow().UTC())
s.LastRecvErrorMessage = error
s.mu.Unlock()
}
func (s *MutableStatus) TrackNack(msg string) {
s.mu.Lock()
s.LastNack = ptr(s.timeNow().UTC())
s.LastNackMessage = msg
s.mu.Unlock()
}
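
// A hedged sketch of how a stream handler might record replication outcomes;
// the replicationFailed condition is hypothetical, not the actual handler logic:
//
//	if replicationFailed {
//		status.TrackNack("failed to apply resource")
//	} else {
//		status.TrackAck()
//	}
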
func (s *MutableStatus) TrackConnected() {
s.mu.Lock()
s.Connected = true
s.DisconnectTime = &time.Time{}
s.DisconnectErrorMessage = ""
s.mu.Unlock()
}
// TrackDisconnectedGracefully tracks when the stream was disconnected in a way we expected.
// For example, we got a terminated message, or we terminated the stream ourselves.
func (s *MutableStatus) TrackDisconnectedGracefully() {
s.mu.Lock()
s.Connected = false
s.DisconnectTime = ptr(s.timeNow().UTC())
s.DisconnectErrorMessage = ""
s.mu.Unlock()
}
// TrackDisconnectedDueToError tracks when the stream was disconnected due to an error.
// For example the heartbeat timed out, or we couldn't send into the stream.
func (s *MutableStatus) TrackDisconnectedDueToError(error string) {
s.mu.Lock()
s.Connected = false
s.DisconnectTime = ptr(s.timeNow().UTC())
s.DisconnectErrorMessage = error
s.mu.Unlock()
}
func (s *MutableStatus) IsConnected() bool {
var resp bool
s.mu.RLock()
resp = s.Connected
s.mu.RUnlock()
return resp
}
func (s *MutableStatus) GetStatus() Status {
s.mu.RLock()
copy := s.Status
s.mu.RUnlock()
return copy
}
func (s *MutableStatus) SetImportedServices(serviceNames []structs.ServiceName) {
s.mu.Lock()
defer s.mu.Unlock()
s.ImportedServices = make([]string, len(serviceNames))
for i, sn := range serviceNames {
s.ImportedServices[i] = sn.String()
}
}
func (s *MutableStatus) GetImportedServicesCount() int {
s.mu.RLock()
defer s.mu.RUnlock()
return len(s.ImportedServices)
}
func (s *MutableStatus) SetExportedServices(serviceNames []structs.ServiceName) {
s.mu.Lock()
defer s.mu.Unlock()
s.ExportedServices = make([]string, len(serviceNames))
for i, sn := range serviceNames {
s.ExportedServices[i] = sn.String()
}
}
func (s *MutableStatus) GetExportedServicesCount() int {
s.mu.RLock()
defer s.mu.RUnlock()
return len(s.ExportedServices)
}
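
// Illustrative sketch (the service name is hypothetical, and structs.NewServiceName
// is assumed to be the usual constructor for a structs.ServiceName):
//
//	status.SetImportedServices([]structs.ServiceName{
//		structs.NewServiceName("api", nil),
//	})
//	count := status.GetImportedServicesCount() // 1
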
func ptr[T any](x T) *T {
return &x
}