mirror of
https://github.com/status-im/consul.git
synced 2025-01-14 15:54:40 +00:00
3b9bb8d6f9
* Check for ACL write permissions on write
  Link eventually will be creating a token, so require acl:write.
* Convert Run to Start, only allow to start once
* Always initialize HCP components at startup
* Support for updating config and client
* Pass HCP manager to controller
* Start HCP manager in link resource
  Start as part of link creation rather than always starting. Update the HCP manager with values from the link before starting as well.
* Fix metrics sink leaked goroutine
* Remove the hardcoded disabled hostname prefix
  The HCP metrics sink will always be enabled, so the length of sinks will always be greater than zero. This also means that we will always default to prefixing metrics with the hostname, which is what our documentation states is the expected behavior anyway.
* Add changelog
* Check and set running status in one method
* Check for primary datacenter, add back test
* Clarify merge reasoning, fix timing issue in test
* Add comment about controller placement
* Expand on breaking change, fix typo in changelog
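The lifecycle change above means a caller constructs the manager once and calls Start; a later Start is a no-op, and UpdateConfig can swap in the client and cloud config supplied by a link while the manager is running. A rough caller-side sketch of that flow (the package name, wiring, and the hcpClient/cloudCfg/statusFn/logger inputs are illustrative assumptions, not code from this commit):

package caller // hypothetical call site, for illustration only

import (
	"context"

	"github.com/hashicorp/consul/agent/hcp"
	hcpclient "github.com/hashicorp/consul/agent/hcp/client"
	"github.com/hashicorp/consul/agent/hcp/config"
	"github.com/hashicorp/go-hclog"
)

// startHCPManager shows the intended lifecycle: construct once, Start once.
func startHCPManager(ctx context.Context, hcpClient hcpclient.Client,
	cloudCfg config.CloudConfig, statusFn hcp.StatusCallback, logger hclog.Logger) (*hcp.Manager, error) {
	mgr := hcp.NewManager(hcp.ManagerConfig{
		Client:      hcpClient,
		CloudConfig: cloudCfg,
		StatusFn:    statusFn,
		Logger:      logger,
	})

	// Start connects to HCP and begins periodic server status updates.
	// Calling Start again on the same manager is a no-op.
	if err := mgr.Start(ctx); err != nil {
		return nil, err
	}

	// When a link later supplies a new client or cloud config, the running
	// manager is updated in place and a fresh status update is triggered:
	//   mgr.UpdateConfig(newClient, newCloudCfg)
	return mgr, nil
}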
304 lines
7.2 KiB
Go
// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: BUSL-1.1

package hcp

import (
	"context"
	"sync"
	"time"

	hcpclient "github.com/hashicorp/consul/agent/hcp/client"
	"github.com/hashicorp/consul/agent/hcp/config"
	"github.com/hashicorp/consul/agent/hcp/scada"
	"github.com/hashicorp/consul/lib"
	"github.com/hashicorp/go-hclog"
)

var (
	defaultManagerMinInterval = 45 * time.Minute
	defaultManagerMaxInterval = 75 * time.Minute
)

// ManagerConfig holds the dependencies and settings used to run the HCP manager.
type ManagerConfig struct {
	Client            hcpclient.Client
	CloudConfig       config.CloudConfig
	SCADAProvider     scada.Provider
	TelemetryProvider *hcpProviderImpl

	StatusFn StatusCallback
	// Idempotent function to upsert the HCP management token. This will be called periodically in
	// the manager's main loop.
	ManagementTokenUpserterFn ManagementTokenUpserter
	MinInterval               time.Duration
	MaxInterval               time.Duration

	Logger hclog.Logger
}

// enabled reports whether the config has the client and status callback required
// to push server status to HCP.
func (cfg *ManagerConfig) enabled() bool {
	return cfg.Client != nil && cfg.StatusFn != nil
}

// nextHeartbeat returns a jittered delay between the configured (or default)
// minimum and maximum intervals for the next periodic status update.
func (cfg *ManagerConfig) nextHeartbeat() time.Duration {
	min := cfg.MinInterval
	if min == 0 {
		min = defaultManagerMinInterval
	}

	max := cfg.MaxInterval
	if max == 0 {
		max = defaultManagerMaxInterval
	}
	if max < min {
		max = min
	}
	return min + lib.RandomStagger(max-min)
}

// StatusCallback returns the current server status to be pushed to HCP.
type StatusCallback func(context.Context) (hcpclient.ServerStatus, error)

// ManagementTokenUpserter creates or updates the HCP management ACL token.
type ManagementTokenUpserter func(name, secretId string) error

// Manager starts the HCP-related subsystems (SCADA, telemetry) and pushes server
// status updates to HCP on a jittered interval.
type Manager struct {
	logger hclog.Logger

	running bool
	runLock sync.RWMutex

	cfg   ManagerConfig
	cfgMu sync.RWMutex

	updateCh chan struct{}

	// testUpdateSent is set by unit tests to signal when the manager's status update has triggered
	testUpdateSent chan struct{}
}

// NewManager returns a Manager initialized with the given configuration.
func NewManager(cfg ManagerConfig) *Manager {
	return &Manager{
		logger: cfg.Logger,
		cfg:    cfg,

		updateCh: make(chan struct{}, 1),
	}
}

// Start executes the logic for connecting to HCP and sending periodic server updates. If the
// manager has been previously started, it will not start again.
func (m *Manager) Start(ctx context.Context) error {
	// Check if the manager has already started
	changed := m.setRunning(true)
	if !changed {
		m.logger.Trace("HCP manager already started")
		return nil
	}

	var err error
	m.logger.Info("HCP manager starting")

	// Update and start the SCADA provider
	err = m.startSCADAProvider()
	if err != nil {
		m.logger.Error("failed to start scada provider", "error", err)
		m.setRunning(false)
		return err
	}

	// Update and start the telemetry provider to enable the HCP metrics sink
	if err := m.startTelemetryProvider(ctx); err != nil {
		m.logger.Error("failed to update telemetry config provider", "error", err)
		m.setRunning(false)
		return err
	}

	// immediately send initial update
	select {
	case <-ctx.Done():
		m.setRunning(false)
		return nil
	case <-m.updateCh: // empty the update chan if there is a queued update to prevent repeated update in main loop
		err = m.sendUpdate()
		if err != nil {
			m.setRunning(false)
			return err
		}
	default:
		err = m.sendUpdate()
		if err != nil {
			m.setRunning(false)
			return err
		}
	}

	// main loop
	go func() {
		for {
			m.cfgMu.RLock()
			cfg := m.cfg
			m.cfgMu.RUnlock()

			// Check for configured management token from HCP and upsert it if found
			if hcpManagement := cfg.CloudConfig.ManagementToken; len(hcpManagement) > 0 {
				if cfg.ManagementTokenUpserterFn != nil {
					upsertTokenErr := cfg.ManagementTokenUpserterFn("HCP Management Token", hcpManagement)
					if upsertTokenErr != nil {
						m.logger.Error("failed to upsert HCP management token", "err", upsertTokenErr)
					}
				}
			}

			nextUpdate := cfg.nextHeartbeat()
			if err != nil {
				m.logger.Error("failed to send server status to HCP", "err", err, "next_heartbeat", nextUpdate.String())
			}

			select {
			case <-ctx.Done():
				m.setRunning(false)
				return

			case <-m.updateCh:
				err = m.sendUpdate()

			case <-time.After(nextUpdate):
				err = m.sendUpdate()
			}
		}
	}()

	return err
}

func (m *Manager) startSCADAProvider() error {
	provider := m.cfg.SCADAProvider
	if provider == nil {
		return nil
	}

	// Update the SCADA provider configuration with HCP configurations
	m.logger.Debug("updating scada provider with HCP configuration")
	err := provider.UpdateHCPConfig(m.cfg.CloudConfig)
	if err != nil {
		m.logger.Error("failed to update scada provider with HCP configuration", "err", err)
		return err
	}

	// Update the SCADA provider metadata
	provider.UpdateMeta(map[string]string{
		"consul_server_id": string(m.cfg.CloudConfig.NodeID),
	})

	// Start the SCADA provider
	err = provider.Start()
	if err != nil {
		return err
	}
	return nil
}

func (m *Manager) startTelemetryProvider(ctx context.Context) error {
	if m.cfg.TelemetryProvider == nil {
		return nil
	}

	m.cfg.TelemetryProvider.Run(ctx, &HCPProviderCfg{
		HCPClient: m.cfg.Client,
		HCPConfig: &m.cfg.CloudConfig,
	})

	return nil
}

// GetCloudConfig returns the manager's current cloud configuration.
func (m *Manager) GetCloudConfig() config.CloudConfig {
	m.cfgMu.RLock()
	defer m.cfgMu.RUnlock()

	return m.cfg.CloudConfig
}

// UpdateConfig replaces the manager's HCP client and cloud configuration. If the
// manager is already running and either value changed, a status update is triggered.
func (m *Manager) UpdateConfig(client hcpclient.Client, cloudCfg config.CloudConfig) {
	m.cfgMu.Lock()
	// Save original values
	originalCfg := m.cfg.CloudConfig
	originalClient := m.cfg.Client

	// Update with new values
	m.cfg.Client = client
	m.cfg.CloudConfig = cloudCfg
	m.cfgMu.Unlock()

	// Send update if already running and values were updated
	if m.isRunning() && (originalClient != client || originalCfg != cloudCfg) {
		m.SendUpdate()
	}
}

// SendUpdate triggers a server status update on the manager's main loop without
// blocking. If an update is already queued, the extra trigger is dropped.
func (m *Manager) SendUpdate() {
	m.logger.Debug("HCP triggering status update")
	select {
	case m.updateCh <- struct{}{}:
		// trigger update
	default:
		// if chan is full then there is already an update triggered that will soon
		// be acted on so don't bother blocking.
	}
}

// TODO: we should have retried on failures here with backoff but take into
// account that if a new update is triggered while we are still retrying we
// should not start another retry loop. Something like have a "dirty" flag which
// we mark on first PushUpdate and then a retry timer as well as the interval
// and a "isRetrying" state or something so that we attempt to send update, but
// then fetch fresh info on each attempt to send so if we are already in a retry
// backoff a new push is a no-op.
func (m *Manager) sendUpdate() error {
	m.cfgMu.RLock()
	cfg := m.cfg
	m.cfgMu.RUnlock()

	if !cfg.enabled() {
		return nil
	}

	if m.testUpdateSent != nil {
		defer func() {
			select {
			case m.testUpdateSent <- struct{}{}:
			default:
			}
		}()
	}

	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()

	s, err := cfg.StatusFn(ctx)
	if err != nil {
		return err
	}

	return cfg.Client.PushServerStatus(ctx, &s)
}

func (m *Manager) isRunning() bool {
	m.runLock.RLock()
	defer m.runLock.RUnlock()
	return m.running
}

// setRunning sets the running status of the manager to the given value. If the
// given value is the same as the current running status, it returns false. If
// current status is updated to the given status, it returns true.
func (m *Manager) setRunning(r bool) bool {
	m.runLock.Lock()
	defer m.runLock.Unlock()

	if m.running == r {
		return false
	}

	m.running = r
	return true
}
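
The TODO above sendUpdate describes a retry scheme where a "dirty" flag coalesces pushes so that a trigger arriving during backoff does not start a second retry loop. A rough sketch of that idea, assuming it lived alongside Manager in this package; the function name, backoff policy, and omitted config locking are illustrative assumptions, not part of this file:

// runSendLoop is a hypothetical replacement for the main loop in Start: a
// single "dirty" flag marks that a push is owed, each attempt re-reads fresh
// status via sendUpdate, and a trigger that arrives while a retry is already
// pending is a no-op.
func (m *Manager) runSendLoop(ctx context.Context) {
	dirty := true                 // send an initial update right away
	var retryDelay time.Duration  // zero means "send as soon as the timer fires"

	for {
		var wait <-chan time.Time
		if dirty {
			wait = time.After(retryDelay)
		} else {
			wait = time.After(m.cfg.nextHeartbeat())
		}

		select {
		case <-ctx.Done():
			return

		case <-m.updateCh:
			// If a send or retry is already pending (dirty), this extra trigger
			// is a no-op; otherwise schedule a prompt send.
			if !dirty {
				dirty = true
				retryDelay = 0
			}

		case <-wait:
			if err := m.sendUpdate(); err != nil {
				// Stay dirty and retry after a jittered delay.
				dirty = true
				retryDelay = lib.RandomStagger(30 * time.Second)
				m.logger.Error("failed to send server status to HCP", "err", err)
			} else {
				dirty = false
			}
		}
	}
}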