Derek Menteer 50ef6a697e
Fix issue with peer stream node cleanup. (#17235)
Fix issue with peer stream node cleanup.

This commit encompasses a few problems that are closely related due to their
proximity in the code.

1. The peerstream utilizes node IDs in several locations to determine which
nodes / services / checks should be cleaned up or created. While VM deployments
with agents will likely always have a node ID, agentless uses synthetic nodes
and does not populate the field. This means that for consul-k8s deployments, all
services were likely bundled together into the same synthetic node in some code
paths (but not all), resulting in strange behavior. The Node.Node field should
be used instead as a unique identifier, as it should always be populated.

2. The peerstream cleanup process for unused nodes uses an incorrect query for
node deregistration. This query is NOT namespace aware and results in the node
(and corresponding services) being deregistered prematurely whenever it has zero
default-namespace services and 1+ non-default-namespace services registered on
it. This issue is tricky to find due to the incorrect logic mentioned in #1,
combined with the fact that the affected services must be co-located on the same
node as the currently deregistering service for this to be encountered.

3. The stream tracker did not understand differences between services in
different namespaces and could therefore report incorrect numbers. It was
updated to utilize the full service name to avoid conflicts and return proper
results.
2023-05-08 13:13:25 -05:00

132 lines
5.1 KiB
Go

// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0
package peerstream
import (
"time"
"github.com/hashicorp/go-hclog"
"github.com/hashicorp/go-memdb"
"google.golang.org/grpc"
"github.com/hashicorp/consul/acl"
"github.com/hashicorp/consul/acl/resolver"
"github.com/hashicorp/consul/agent/consul/state"
"github.com/hashicorp/consul/agent/consul/stream"
"github.com/hashicorp/consul/agent/structs"
"github.com/hashicorp/consul/proto/private/pbpeering"
"github.com/hashicorp/consul/proto/private/pbpeerstream"
)
// TODO(peering): fix up these interfaces to be more testable now that they are
// extracted from private peering
const (
defaultOutgoingHeartbeatInterval = 15 * time.Second
defaultIncomingHeartbeatTimeout = 2 * time.Minute
)
type Server struct {
Config
Tracker *Tracker
}
type Config struct {
Backend Backend
GetStore func() StateStore
Logger hclog.Logger
ForwardRPC func(structs.RPCInfo, func(*grpc.ClientConn) error) (bool, error)
ACLResolver ACLResolver
// Datacenter of the Consul server this gRPC server is hosted on
Datacenter string
ConnectEnabled bool
// outgoingHeartbeatInterval is how often we send a heartbeat.
outgoingHeartbeatInterval time.Duration
// incomingHeartbeatTimeout is how long we'll wait between receiving heartbeats before we close the connection.
incomingHeartbeatTimeout time.Duration
}
//go:generate mockery --name ACLResolver --inpackage
type ACLResolver interface {
ResolveTokenAndDefaultMeta(string, *acl.EnterpriseMeta, *acl.AuthorizerContext) (resolver.Result, error)
}
func NewServer(cfg Config) *Server {
requireNotNil(cfg.Backend, "Backend")
requireNotNil(cfg.GetStore, "GetStore")
requireNotNil(cfg.Logger, "Logger")
// requireNotNil(cfg.ACLResolver, "ACLResolver") // TODO(peering): reenable check when ACLs are required
if cfg.Datacenter == "" {
panic("Datacenter is required")
}
if cfg.outgoingHeartbeatInterval == 0 {
cfg.outgoingHeartbeatInterval = defaultOutgoingHeartbeatInterval
}
if cfg.incomingHeartbeatTimeout == 0 {
cfg.incomingHeartbeatTimeout = defaultIncomingHeartbeatTimeout
}
return &Server{
Config: cfg,
Tracker: NewTracker(cfg.incomingHeartbeatTimeout),
}
}
func requireNotNil(v interface{}, name string) {
if v == nil {
panic(name + " is required")
}
}
var _ pbpeerstream.PeerStreamServiceServer = (*Server)(nil)
func (s *Server) Register(grpcServer *grpc.Server) {
pbpeerstream.RegisterPeerStreamServiceServer(grpcServer, s)
}
type Backend interface {
Subscribe(req *stream.SubscribeRequest) (*stream.Subscription, error)
// IsLeader indicates whether the consul server is in a leader state or not.
IsLeader() bool
// SetLeaderAddress is called on a raft.LeaderObservation in a go routine
// in the consul server; see trackLeaderChanges()
SetLeaderAddress(string)
// GetLeaderAddress provides the best hint for the current address of the
// leader. There is no guarantee that this is the actual address of the
// leader.
GetLeaderAddress() string
ValidateProposedPeeringSecret(id string) (bool, error)
PeeringSecretsWrite(req *pbpeering.SecretsWriteRequest) error
PeeringTerminateByID(req *pbpeering.PeeringTerminateByIDRequest) error
PeeringTrustBundleWrite(req *pbpeering.PeeringTrustBundleWriteRequest) error
CatalogRegister(req *structs.RegisterRequest) error
CatalogDeregister(req *structs.DeregisterRequest) error
PeeringWrite(req *pbpeering.PeeringWriteRequest) error
}
// StateStore provides a read-only interface for querying Peering data.
type StateStore interface {
PeeringRead(ws memdb.WatchSet, q state.Query) (uint64, *pbpeering.Peering, error)
PeeringReadByID(ws memdb.WatchSet, id string) (uint64, *pbpeering.Peering, error)
PeeringList(ws memdb.WatchSet, entMeta acl.EnterpriseMeta) (uint64, []*pbpeering.Peering, error)
PeeringTrustBundleRead(ws memdb.WatchSet, q state.Query) (uint64, *pbpeering.PeeringTrustBundle, error)
PeeringTrustBundleList(ws memdb.WatchSet, entMeta acl.EnterpriseMeta) (uint64, []*pbpeering.PeeringTrustBundle, error)
PeeringSecretsRead(ws memdb.WatchSet, peerID string) (*pbpeering.PeeringSecrets, error)
ExportedServicesForPeer(ws memdb.WatchSet, peerID, dc string) (uint64, *structs.ExportedServiceList, error)
ServiceDump(ws memdb.WatchSet, kind structs.ServiceKind, useKind bool, entMeta *acl.EnterpriseMeta, peerName string) (uint64, structs.CheckServiceNodes, error)
CheckServiceNodes(ws memdb.WatchSet, serviceName string, entMeta *acl.EnterpriseMeta, peerName string) (uint64, structs.CheckServiceNodes, error)
NodeServiceList(ws memdb.WatchSet, nodeNameOrID string, entMeta *acl.EnterpriseMeta, peerName string) (uint64, *structs.NodeServiceList, error)
CAConfig(ws memdb.WatchSet) (uint64, *structs.CAConfiguration, error)
TrustBundleListByService(ws memdb.WatchSet, service, dc string, entMeta acl.EnterpriseMeta) (uint64, []*pbpeering.PeeringTrustBundle, error)
ServiceList(ws memdb.WatchSet, entMeta *acl.EnterpriseMeta, peerName string) (uint64, structs.ServiceList, error)
ConfigEntry(ws memdb.WatchSet, kind, name string, entMeta *acl.EnterpriseMeta) (uint64, structs.ConfigEntry, error)
AbandonCh() <-chan struct{}
}