mirror of
https://github.com/status-im/consul.git
synced 2025-01-22 03:29:43 +00:00
5fb9df1640
* Adding explicit MPL license for sub-package This directory and its subdirectories (packages) contain files licensed with the MPLv2 `LICENSE` file in this directory and are intentionally licensed separately from the BSL `LICENSE` file at the root of this repository. * Adding explicit MPL license for sub-package This directory and its subdirectories (packages) contain files licensed with the MPLv2 `LICENSE` file in this directory and are intentionally licensed separately from the BSL `LICENSE` file at the root of this repository. * Updating the license from MPL to Business Source License Going forward, this project will be licensed under the Business Source License v1.1. Please see our blog post for more details at <Blog URL>, FAQ at www.hashicorp.com/licensing-faq, and details of the license at www.hashicorp.com/bsl. * add missing license headers * Update copyright file headers to BUSL-1.1 * Update copyright file headers to BUSL-1.1 * Update copyright file headers to BUSL-1.1 * Update copyright file headers to BUSL-1.1 * Update copyright file headers to BUSL-1.1 * Update copyright file headers to BUSL-1.1 * Update copyright file headers to BUSL-1.1 * Update copyright file headers to BUSL-1.1 * Update copyright file headers to BUSL-1.1 * Update copyright file headers to BUSL-1.1 * Update copyright file headers to BUSL-1.1 * Update copyright file headers to BUSL-1.1 * Update copyright file headers to BUSL-1.1 * Update copyright file headers to BUSL-1.1 * Update copyright file headers to BUSL-1.1 --------- Co-authored-by: hashicorp-copywrite[bot] <110428419+hashicorp-copywrite[bot]@users.noreply.github.com>
408 lines
10 KiB
Go
408 lines
10 KiB
Go
// Copyright (c) HashiCorp, Inc.
|
|
// SPDX-License-Identifier: BUSL-1.1
|
|
|
|
package peerstream
|
|
|
|
import (
|
|
"fmt"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/hashicorp/consul/agent/structs"
|
|
)
|
|
|
|
// Tracker contains a map of (PeerID -> MutableStatus).
|
|
// As streams are opened and closed we track details about their status.
|
|
type Tracker struct {
|
|
mu sync.RWMutex
|
|
streams map[string]*MutableStatus
|
|
|
|
// heartbeatTimeout is the max duration a connection is allowed to be
|
|
// disconnected before the stream health is reported as non-healthy
|
|
heartbeatTimeout time.Duration
|
|
|
|
// timeNow is a shim for testing.
|
|
timeNow func() time.Time
|
|
}
|
|
|
|
func NewTracker(heartbeatTimeout time.Duration) *Tracker {
|
|
if heartbeatTimeout == 0 {
|
|
heartbeatTimeout = defaultIncomingHeartbeatTimeout
|
|
}
|
|
return &Tracker{
|
|
streams: make(map[string]*MutableStatus),
|
|
timeNow: time.Now,
|
|
heartbeatTimeout: heartbeatTimeout,
|
|
}
|
|
}
|
|
|
|
// setClock is used for debugging purposes only.
|
|
func (t *Tracker) setClock(clock func() time.Time) {
|
|
if clock == nil {
|
|
t.timeNow = time.Now
|
|
} else {
|
|
t.timeNow = clock
|
|
}
|
|
}
|
|
|
|
// Register a stream for a given peer but do not mark it as connected.
|
|
func (t *Tracker) Register(id string) (*MutableStatus, error) {
|
|
t.mu.Lock()
|
|
defer t.mu.Unlock()
|
|
status, _, err := t.registerLocked(id, false)
|
|
return status, err
|
|
}
|
|
|
|
func (t *Tracker) registerLocked(id string, initAsConnected bool) (*MutableStatus, bool, error) {
|
|
status, ok := t.streams[id]
|
|
if !ok {
|
|
status = newMutableStatus(t.timeNow, initAsConnected)
|
|
t.streams[id] = status
|
|
return status, true, nil
|
|
}
|
|
return status, false, nil
|
|
}
|
|
|
|
// Connected registers a stream for a given peer, and marks it as connected.
|
|
// It also enforces that there is only one active stream for a peer.
|
|
func (t *Tracker) Connected(id string) (*MutableStatus, error) {
|
|
t.mu.Lock()
|
|
defer t.mu.Unlock()
|
|
return t.connectedLocked(id)
|
|
}
|
|
|
|
func (t *Tracker) connectedLocked(id string) (*MutableStatus, error) {
|
|
status, newlyRegistered, err := t.registerLocked(id, true)
|
|
if err != nil {
|
|
return nil, err
|
|
} else if newlyRegistered {
|
|
return status, nil
|
|
}
|
|
|
|
if status.IsConnected() {
|
|
return nil, fmt.Errorf("there is an active stream for the given PeerID %q", id)
|
|
}
|
|
status.TrackConnected()
|
|
|
|
return status, nil
|
|
}
|
|
|
|
// DisconnectedGracefully marks the peer id's stream status as disconnected gracefully.
|
|
func (t *Tracker) DisconnectedGracefully(id string) {
|
|
t.mu.Lock()
|
|
defer t.mu.Unlock()
|
|
|
|
if status, ok := t.streams[id]; ok {
|
|
status.TrackDisconnectedGracefully()
|
|
}
|
|
}
|
|
|
|
// DisconnectedDueToError marks the peer id's stream status as disconnected due to an error.
|
|
func (t *Tracker) DisconnectedDueToError(id string, error string) {
|
|
t.mu.Lock()
|
|
defer t.mu.Unlock()
|
|
|
|
if status, ok := t.streams[id]; ok {
|
|
status.TrackDisconnectedDueToError(error)
|
|
}
|
|
}
|
|
|
|
func (t *Tracker) StreamStatus(id string) (resp Status, found bool) {
|
|
t.mu.RLock()
|
|
defer t.mu.RUnlock()
|
|
|
|
s, ok := t.streams[id]
|
|
if !ok {
|
|
return Status{
|
|
NeverConnected: true,
|
|
}, false
|
|
}
|
|
return s.GetStatus(), true
|
|
}
|
|
|
|
func (t *Tracker) ConnectedStreams() map[string]chan struct{} {
|
|
t.mu.RLock()
|
|
defer t.mu.RUnlock()
|
|
|
|
resp := make(map[string]chan struct{})
|
|
for peer, status := range t.streams {
|
|
if status.IsConnected() {
|
|
resp[peer] = status.doneCh
|
|
}
|
|
}
|
|
return resp
|
|
}
|
|
|
|
func (t *Tracker) DeleteStatus(id string) {
|
|
t.mu.Lock()
|
|
defer t.mu.Unlock()
|
|
|
|
delete(t.streams, id)
|
|
}
|
|
|
|
// IsHealthy is a calculates the health of a peering status.
|
|
// We define a peering as unhealthy if its status has been in the following
|
|
// states for longer than the configured incomingHeartbeatTimeout.
|
|
// - If it is disconnected
|
|
// - If the last received Nack is newer than last received Ack
|
|
// - If the last received error is newer than last received success
|
|
//
|
|
// If none of these conditions apply, we call the peering healthy.
|
|
func (t *Tracker) IsHealthy(s Status) bool {
|
|
// If stream is in a disconnected state for longer than the configured
|
|
// heartbeat timeout, report as unhealthy.
|
|
if s.DisconnectTime != nil &&
|
|
t.timeNow().Sub(*s.DisconnectTime) > t.heartbeatTimeout {
|
|
return false
|
|
}
|
|
|
|
// If last Nack is after last Ack, it means the peer is unable to
|
|
// handle our replication message
|
|
if s.LastAck == nil {
|
|
s.LastAck = &time.Time{}
|
|
}
|
|
|
|
if s.LastNack != nil &&
|
|
s.LastNack.After(*s.LastAck) &&
|
|
t.timeNow().Sub(*s.LastAck) > t.heartbeatTimeout {
|
|
return false
|
|
}
|
|
|
|
// If last recv error is newer than last recv success, we were unable
|
|
// to handle the peer's replication message.
|
|
if s.LastRecvResourceSuccess == nil {
|
|
s.LastRecvResourceSuccess = &time.Time{}
|
|
}
|
|
|
|
if s.LastRecvError != nil &&
|
|
s.LastRecvError.After(*s.LastRecvResourceSuccess) &&
|
|
t.timeNow().Sub(*s.LastRecvError) > t.heartbeatTimeout {
|
|
return false
|
|
}
|
|
|
|
return true
|
|
}
|
|
|
|
type MutableStatus struct {
|
|
mu sync.RWMutex
|
|
|
|
// timeNow is a shim for testing.
|
|
timeNow func() time.Time
|
|
|
|
// doneCh allows for shutting down a stream gracefully by sending a termination message
|
|
// to the peer before the stream's context is cancelled.
|
|
doneCh chan struct{}
|
|
|
|
Status
|
|
}
|
|
|
|
// Status contains information about the replication stream to a peer cluster.
|
|
// TODO(peering): There's a lot of fields here...
|
|
type Status struct {
|
|
// Connected is true when there is an open stream for the peer.
|
|
Connected bool
|
|
|
|
// NeverConnected is true for peerings that have never connected, false otherwise.
|
|
NeverConnected bool
|
|
|
|
// DisconnectErrorMessage tracks the error that caused the stream to disconnect non-gracefully.
|
|
// If the stream is connected or it disconnected gracefully it will be empty.
|
|
DisconnectErrorMessage string
|
|
|
|
// If the status is not connected, DisconnectTime tracks when the stream was closed. Else it's zero.
|
|
DisconnectTime *time.Time
|
|
|
|
// LastAck tracks the time we received the last ACK for a resource replicated TO the peer.
|
|
LastAck *time.Time
|
|
|
|
// LastNack tracks the time we received the last NACK for a resource replicated to the peer.
|
|
LastNack *time.Time
|
|
|
|
// LastNackMessage tracks the reported error message associated with the last NACK from a peer.
|
|
LastNackMessage string
|
|
|
|
// LastSendError tracks the time of the last error sending into the stream.
|
|
LastSendError *time.Time
|
|
|
|
// LastSendErrorMessage tracks the last error message when sending into the stream.
|
|
LastSendErrorMessage string
|
|
|
|
// LastSendSuccess tracks the time we last successfully sent a resource TO the peer.
|
|
LastSendSuccess *time.Time
|
|
|
|
// LastRecvHeartbeat tracks when we last received a heartbeat from our peer.
|
|
LastRecvHeartbeat *time.Time
|
|
|
|
// LastRecvResourceSuccess tracks the time we last successfully stored a resource replicated FROM the peer.
|
|
LastRecvResourceSuccess *time.Time
|
|
|
|
// LastRecvError tracks either:
|
|
// - The time we failed to store a resource replicated FROM the peer.
|
|
// - The time of the last error when receiving from the stream.
|
|
LastRecvError *time.Time
|
|
|
|
// LastRecvErrorMessage tracks the last error message when receiving from the stream.
|
|
LastRecvErrorMessage string
|
|
|
|
// TODO(peering): consider keeping track of imported and exported services thru raft
|
|
// ImportedServices keeps track of which service names are imported for the peer
|
|
ImportedServices []string
|
|
// ExportedServices keeps track of which service names a peer asks to export
|
|
ExportedServices []string
|
|
}
|
|
|
|
func (s *Status) GetImportedServicesCount() uint64 {
|
|
return uint64(len(s.ImportedServices))
|
|
}
|
|
|
|
func (s *Status) GetExportedServicesCount() uint64 {
|
|
return uint64(len(s.ExportedServices))
|
|
}
|
|
|
|
func newMutableStatus(now func() time.Time, connected bool) *MutableStatus {
|
|
return &MutableStatus{
|
|
Status: Status{
|
|
Connected: connected,
|
|
NeverConnected: !connected,
|
|
},
|
|
timeNow: now,
|
|
doneCh: make(chan struct{}),
|
|
}
|
|
}
|
|
|
|
func (s *MutableStatus) Done() <-chan struct{} {
|
|
return s.doneCh
|
|
}
|
|
|
|
func (s *MutableStatus) TrackAck() {
|
|
s.mu.Lock()
|
|
s.LastAck = ptr(s.timeNow().UTC())
|
|
s.mu.Unlock()
|
|
}
|
|
|
|
func (s *MutableStatus) TrackSendError(error string) {
|
|
s.mu.Lock()
|
|
s.LastSendError = ptr(s.timeNow().UTC())
|
|
s.LastSendErrorMessage = error
|
|
s.mu.Unlock()
|
|
}
|
|
|
|
func (s *MutableStatus) TrackSendSuccess() {
|
|
s.mu.Lock()
|
|
s.LastSendSuccess = ptr(s.timeNow().UTC())
|
|
s.mu.Unlock()
|
|
}
|
|
|
|
// TrackRecvResourceSuccess tracks receiving a replicated resource.
|
|
func (s *MutableStatus) TrackRecvResourceSuccess() {
|
|
s.mu.Lock()
|
|
s.LastRecvResourceSuccess = ptr(s.timeNow().UTC())
|
|
s.mu.Unlock()
|
|
}
|
|
|
|
// TrackRecvHeartbeat tracks receiving a heartbeat from our peer.
|
|
func (s *MutableStatus) TrackRecvHeartbeat() {
|
|
s.mu.Lock()
|
|
s.LastRecvHeartbeat = ptr(s.timeNow().UTC())
|
|
s.mu.Unlock()
|
|
}
|
|
|
|
func (s *MutableStatus) TrackRecvError(error string) {
|
|
s.mu.Lock()
|
|
s.LastRecvError = ptr(s.timeNow().UTC())
|
|
s.LastRecvErrorMessage = error
|
|
s.mu.Unlock()
|
|
}
|
|
|
|
func (s *MutableStatus) TrackNack(msg string) {
|
|
s.mu.Lock()
|
|
s.LastNack = ptr(s.timeNow().UTC())
|
|
s.LastNackMessage = msg
|
|
s.mu.Unlock()
|
|
}
|
|
|
|
func (s *MutableStatus) TrackConnected() {
|
|
s.mu.Lock()
|
|
s.Connected = true
|
|
s.DisconnectTime = &time.Time{}
|
|
s.DisconnectErrorMessage = ""
|
|
s.mu.Unlock()
|
|
}
|
|
|
|
// TrackDisconnectedGracefully tracks when the stream was disconnected in a way we expected.
|
|
// For example, we got a terminated message, or we terminated the stream ourselves.
|
|
func (s *MutableStatus) TrackDisconnectedGracefully() {
|
|
s.mu.Lock()
|
|
s.Connected = false
|
|
s.DisconnectTime = ptr(s.timeNow().UTC())
|
|
s.DisconnectErrorMessage = ""
|
|
s.mu.Unlock()
|
|
}
|
|
|
|
// TrackDisconnectedDueToError tracks when the stream was disconnected due to an error.
|
|
// For example the heartbeat timed out, or we couldn't send into the stream.
|
|
func (s *MutableStatus) TrackDisconnectedDueToError(error string) {
|
|
s.mu.Lock()
|
|
s.Connected = false
|
|
s.DisconnectTime = ptr(s.timeNow().UTC())
|
|
s.DisconnectErrorMessage = error
|
|
s.mu.Unlock()
|
|
}
|
|
|
|
func (s *MutableStatus) IsConnected() bool {
|
|
var resp bool
|
|
|
|
s.mu.RLock()
|
|
resp = s.Connected
|
|
s.mu.RUnlock()
|
|
|
|
return resp
|
|
}
|
|
|
|
func (s *MutableStatus) GetStatus() Status {
|
|
s.mu.RLock()
|
|
copy := s.Status
|
|
s.mu.RUnlock()
|
|
|
|
return copy
|
|
}
|
|
|
|
func (s *MutableStatus) SetImportedServices(serviceNames []structs.ServiceName) {
|
|
s.mu.Lock()
|
|
defer s.mu.Unlock()
|
|
|
|
s.ImportedServices = make([]string, len(serviceNames))
|
|
for i, sn := range serviceNames {
|
|
s.ImportedServices[i] = sn.String()
|
|
}
|
|
}
|
|
|
|
func (s *MutableStatus) GetImportedServicesCount() int {
|
|
s.mu.RLock()
|
|
defer s.mu.RUnlock()
|
|
|
|
return len(s.ImportedServices)
|
|
}
|
|
|
|
func (s *MutableStatus) SetExportedServices(serviceNames []structs.ServiceName) {
|
|
s.mu.Lock()
|
|
defer s.mu.Unlock()
|
|
|
|
s.ExportedServices = make([]string, len(serviceNames))
|
|
|
|
for i, sn := range serviceNames {
|
|
s.ExportedServices[i] = sn.String()
|
|
}
|
|
}
|
|
|
|
func (s *MutableStatus) GetExportedServicesCount() int {
|
|
s.mu.RLock()
|
|
defer s.mu.RUnlock()
|
|
|
|
return len(s.ExportedServices)
|
|
}
|
|
|
|
func ptr[T any](x T) *T {
|
|
return &x
|
|
}
|