mirror of
https://github.com/status-im/consul.git
synced 2025-01-23 03:59:18 +00:00
5fb6ab6a3a
* Add function to get update channel for watching HCP Link * Add MonitorHCPLink function This function can be called in a goroutine to manage the lifecycle of the HCP manager. * Update HCP Manager config in link monitor before starting This updates HCPMonitorLink so it updates the HCP manager with an HCP client and management token when a Link is upserted. * Let MonitorHCPManager handle lifecycle instead of link controller * Remove cleanup from Link controller and move it to MonitorHCPLink Previously, the Link Controller was responsible for cleaning up the HCP-related files on the file system. This change makes it so MonitorHCPLink handles this cleanup. As a result, we are able to remove the PlacementEachServer placement strategy for the Link controller because it no longer needs to do this per-node cleanup. * Remove HCP Manager dependency from Link Controller The Link controller does not need to have HCP Manager as a dependency anymore, so this removes that dependency in order to simplify the design. * Add Linked prefix to Linked status variables This is in preparation for adding a new status type to the Link resource. * Add new "validated" status type to link resource The link resource controller will now set a "validated" status in addition to the "linked" status. This is needed so that other components (eg the HCP manager) know when the Link is ready to link with HCP. * Fix tests * Handle new 'EndOfSnapshot' WatchList event * Fix watch test * Remove unnecessary config from TestAgent_scadaProvider Since the Scada provider is now started on agent startup regardless of whether a cloud config is provided, this removes the cloud config override from the relevant test. This change is not exactly related to the changes from this PR, but rather is something small and sort of related that was noticed while working on this PR. 
* Simplify link watch test and remove sleep from link watch This updates the link watch test so that it uses more mocks and does not require setting up the infrastructure for the HCP Link controller. This also removes the time.Sleep delay in the link watcher loop in favor of an error counter. When we receive 10 consecutive errors, we shut down the link watcher loop. * Add better logging for link validation. Remove EndOfSnapshot test. * Refactor link monitor test into a table test * Add some clarifying comments to link monitor * Simplify link watch test * Test a bunch more errors cases in link monitor test * Use exponential backoff instead of errorCounter in LinkWatch * Move link watch and link monitor into a single goroutine called from server.go * Refactor HCP link watcher to use single go-routine. Previously, if the WatchClient errored, we would've never recovered because we never retry to create the stream. With this change, we have a single goroutine that runs for the life of the server agent and if the WatchClient stream ever errors, we retry the creation of the stream with an exponential backoff.
482 lines
15 KiB
Go
482 lines
15 KiB
Go
// Copyright (c) HashiCorp, Inc.
|
|
// SPDX-License-Identifier: BUSL-1.1
|
|
|
|
// Package bootstrap handles bootstrapping an agent's config from HCP.
|
|
package bootstrap
|
|
|
|
import (
|
|
"bufio"
|
|
"context"
|
|
"crypto/tls"
|
|
"crypto/x509"
|
|
"encoding/json"
|
|
"encoding/pem"
|
|
"errors"
|
|
"fmt"
|
|
"os"
|
|
"path/filepath"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/hashicorp/go-hclog"
|
|
"github.com/hashicorp/go-uuid"
|
|
|
|
"github.com/hashicorp/consul/agent/connect"
|
|
"github.com/hashicorp/consul/agent/hcp/bootstrap/constants"
|
|
hcpclient "github.com/hashicorp/consul/agent/hcp/client"
|
|
"github.com/hashicorp/consul/lib"
|
|
"github.com/hashicorp/consul/lib/retry"
|
|
)
|
|
|
|
// Names of the files this package reads and writes inside the HCP bootstrap
// sub-directory of the agent's data dir.
const (
	// CAFileName is the file holding all CA certificates, concatenated as PEM.
	CAFileName = "server-tls-cas.pem"

	// CertFileName is the file holding the server certificate.
	CertFileName = "server-tls-cert.pem"

	// ConfigFileName is the file holding the processed bootstrap config JSON.
	ConfigFileName = "server-config.json"

	// KeyFileName is the file holding the server certificate's private key.
	KeyFileName = "server-tls-key.pem"

	// TokenFileName is the file holding the HCP management token.
	TokenFileName = "hcp-management-token"

	// SuccessFileName is an empty marker file written last, once everything
	// else has been persisted.
	SuccessFileName = "successful-bootstrap"
)
|
|
|
|
// UI is a shim that lets the agent command pass in its mitchellh/cli.UI so
// that useful messages can be shown to the user while bootstrapping. For
// example, when bootstrapping has to be retried several times, the agent
// should not appear to stall with no output, which is what would happen if
// intermediate warnings and errors were only returned to the caller.
type UI interface {
	Output(string)
	Info(string)
	Warn(string)
	Error(string)
}
|
|
|
|
// RawBootstrapConfig is the result of bootstrapping: the Consul config as a
// raw JSON string plus the HCP management token. Both values come either from
// previously persisted files or from the bootstrap endpoint.
type RawBootstrapConfig struct {
	// ConfigJSON is the Consul configuration, encoded as JSON.
	ConfigJSON string
	// ManagementToken is the HCP management token.
	ManagementToken string
}
|
|
|
|
// FetchBootstrapConfig will fetch bootstrap configuration from remote servers and persist it to disk.
|
|
// It will retry until successful or a terminal error condition is found (e.g. permission denied).
|
|
func FetchBootstrapConfig(ctx context.Context, client hcpclient.Client, dataDir string, ui UI) (*RawBootstrapConfig, error) {
|
|
w := retry.Waiter{
|
|
MinWait: 1 * time.Second,
|
|
MaxWait: 5 * time.Minute,
|
|
Jitter: retry.NewJitter(50),
|
|
}
|
|
|
|
for {
|
|
// Note we don't want to shadow `ctx` here since we need that for the Wait
|
|
// below.
|
|
reqCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
|
|
defer cancel()
|
|
|
|
cfg, err := fetchBootstrapConfig(reqCtx, client, dataDir)
|
|
if err != nil {
|
|
if errors.Is(err, hcpclient.ErrUnauthorized) || errors.Is(err, hcpclient.ErrForbidden) {
|
|
// Don't retry on terminal errors
|
|
return nil, err
|
|
}
|
|
ui.Error(fmt.Sprintf("Error: failed to fetch bootstrap config from HCP, will retry in %s: %s",
|
|
w.NextWait().Round(time.Second), err))
|
|
if err := w.Wait(ctx); err != nil {
|
|
return nil, err
|
|
}
|
|
// Finished waiting, restart loop
|
|
continue
|
|
}
|
|
return cfg, nil
|
|
}
|
|
}
|
|
|
|
// fetchBootstrapConfig will fetch the bootstrap configuration from remote servers and persist it to disk.
|
|
func fetchBootstrapConfig(ctx context.Context, client hcpclient.Client, dataDir string) (*RawBootstrapConfig, error) {
|
|
reqCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
|
|
defer cancel()
|
|
|
|
resp, err := client.FetchBootstrap(reqCtx)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to fetch bootstrap config from HCP: %w", err)
|
|
}
|
|
|
|
bsCfg := resp
|
|
devMode := dataDir == ""
|
|
cfgJSON, err := persistAndProcessConfig(dataDir, devMode, bsCfg)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to persist config for existing cluster: %w", err)
|
|
}
|
|
|
|
return &RawBootstrapConfig{
|
|
ConfigJSON: cfgJSON,
|
|
ManagementToken: bsCfg.ManagementToken,
|
|
}, nil
|
|
}
|
|
|
|
// persistAndProcessConfig is called when we receive data from CCM.
|
|
// We validate and persist everything that was received, then also update
|
|
// the JSON config as needed.
|
|
func persistAndProcessConfig(dataDir string, devMode bool, bsCfg *hcpclient.BootstrapConfig) (string, error) {
|
|
if devMode {
|
|
// Agent in dev mode, we still need somewhere to persist the certs
|
|
// temporarily though to be able to start up at all since we don't support
|
|
// inline certs right now. Use temp dir
|
|
tmp, err := os.MkdirTemp(os.TempDir(), "consul-dev-")
|
|
if err != nil {
|
|
return "", fmt.Errorf("failed to create temp dir for certificates: %w", err)
|
|
}
|
|
dataDir = tmp
|
|
}
|
|
|
|
// Create subdir if it's not already there.
|
|
dir := filepath.Join(dataDir, constants.SubDir)
|
|
if err := lib.EnsurePath(dir, true); err != nil {
|
|
return "", fmt.Errorf("failed to ensure directory %q: %w", dir, err)
|
|
}
|
|
|
|
// Parse just to a map for now as we only have to inject to a specific place
|
|
// and parsing whole Config struct is complicated...
|
|
var cfg map[string]any
|
|
|
|
if err := json.Unmarshal([]byte(bsCfg.ConsulConfig), &cfg); err != nil {
|
|
return "", fmt.Errorf("failed to unmarshal bootstrap config: %w", err)
|
|
}
|
|
|
|
// Avoid ever setting an initial_management token from HCP now that we can
|
|
// separately bootstrap an HCP management token with a distinct accessor ID.
|
|
//
|
|
// CCM will continue to return an initial_management token because previous versions of Consul
|
|
// cannot bootstrap an HCP management token distinct from the initial management token.
|
|
// This block can be deleted once CCM supports tailoring bootstrap config responses
|
|
// based on the version of Consul that requested it.
|
|
acls, aclsOK := cfg["acl"].(map[string]any)
|
|
if aclsOK {
|
|
tokens, tokensOK := acls["tokens"].(map[string]interface{})
|
|
if tokensOK {
|
|
delete(tokens, "initial_management")
|
|
}
|
|
}
|
|
|
|
var cfgJSON string
|
|
if bsCfg.TLSCert != "" {
|
|
if err := ValidateTLSCerts(bsCfg.TLSCert, bsCfg.TLSCertKey, bsCfg.TLSCAs); err != nil {
|
|
return "", fmt.Errorf("invalid certificates: %w", err)
|
|
}
|
|
|
|
// Persist the TLS cert files from the response since we need to refer to them
|
|
// as disk files either way.
|
|
if err := persistTLSCerts(dir, bsCfg.TLSCert, bsCfg.TLSCertKey, bsCfg.TLSCAs); err != nil {
|
|
return "", fmt.Errorf("failed to persist TLS certificates to dir %q: %w", dataDir, err)
|
|
}
|
|
|
|
// Store paths to the persisted TLS cert files.
|
|
cfg["ca_file"] = filepath.Join(dir, CAFileName)
|
|
cfg["cert_file"] = filepath.Join(dir, CertFileName)
|
|
cfg["key_file"] = filepath.Join(dir, KeyFileName)
|
|
|
|
// Convert the bootstrap config map back into a string
|
|
cfgJSONBytes, err := json.Marshal(cfg)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
cfgJSON = string(cfgJSONBytes)
|
|
}
|
|
|
|
if !devMode {
|
|
// Persist the final config we need to add so that it is available locally after a restart.
|
|
// Assuming the configured data dir wasn't a tmp dir to start with.
|
|
if err := persistBootstrapConfig(dir, cfgJSON); err != nil {
|
|
return "", fmt.Errorf("failed to persist bootstrap config: %w", err)
|
|
}
|
|
|
|
// HCP only returns the management token if it requires Consul to
|
|
// initialize it
|
|
if bsCfg.ManagementToken != "" {
|
|
if err := validateManagementToken(bsCfg.ManagementToken); err != nil {
|
|
return "", fmt.Errorf("invalid management token: %w", err)
|
|
}
|
|
if err := persistManagementToken(dir, bsCfg.ManagementToken); err != nil {
|
|
return "", fmt.Errorf("failed to persist HCP management token: %w", err)
|
|
}
|
|
}
|
|
|
|
if err := persistSuccessMarker(dir); err != nil {
|
|
return "", fmt.Errorf("failed to persist success marker: %w", err)
|
|
}
|
|
}
|
|
return cfgJSON, nil
|
|
}
|
|
|
|
func persistSuccessMarker(dir string) error {
|
|
name := filepath.Join(dir, SuccessFileName)
|
|
return os.WriteFile(name, []byte(""), 0600)
|
|
|
|
}
|
|
|
|
func persistTLSCerts(dir string, serverCert, serverKey string, caCerts []string) error {
|
|
if serverCert == "" || serverKey == "" {
|
|
return fmt.Errorf("unexpected bootstrap response from HCP: missing TLS information")
|
|
}
|
|
|
|
// Write out CA cert(s). We write them all to one file because Go's x509
|
|
// machinery will read as many certs as it finds from each PEM file provided
|
|
// and add them separaetly to the CertPool for validation
|
|
f, err := os.OpenFile(filepath.Join(dir, CAFileName), os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0600)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
bf := bufio.NewWriter(f)
|
|
for _, caPEM := range caCerts {
|
|
bf.WriteString(caPEM + "\n")
|
|
}
|
|
if err := bf.Flush(); err != nil {
|
|
return err
|
|
}
|
|
if err := f.Close(); err != nil {
|
|
return err
|
|
}
|
|
|
|
if err := os.WriteFile(filepath.Join(dir, CertFileName), []byte(serverCert), 0600); err != nil {
|
|
return err
|
|
}
|
|
|
|
if err := os.WriteFile(filepath.Join(dir, KeyFileName), []byte(serverKey), 0600); err != nil {
|
|
return err
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// Basic validation to ensure a UUID was loaded and assumes the token is non-empty
|
|
func validateManagementToken(token string) error {
|
|
// note: we assume that the token is not an empty string
|
|
if _, err := uuid.ParseUUID(token); err != nil {
|
|
return errors.New("management token is not a valid UUID")
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func persistManagementToken(dir, token string) error {
|
|
name := filepath.Join(dir, TokenFileName)
|
|
return os.WriteFile(name, []byte(token), 0600)
|
|
}
|
|
|
|
func persistBootstrapConfig(dir, cfgJSON string) error {
|
|
// Persist the important bits we got from bootstrapping. The TLS certs are
|
|
// already persisted, just need to persist the config we are going to add.
|
|
name := filepath.Join(dir, ConfigFileName)
|
|
return os.WriteFile(name, []byte(cfgJSON), 0600)
|
|
}
|
|
|
|
func LoadPersistedBootstrapConfig(dataDir string, ui UI) (*RawBootstrapConfig, bool) {
|
|
if dataDir == "" {
|
|
// There's no files to load when in dev mode.
|
|
return nil, false
|
|
}
|
|
|
|
dir := filepath.Join(dataDir, constants.SubDir)
|
|
|
|
_, err := os.Stat(filepath.Join(dir, SuccessFileName))
|
|
if os.IsNotExist(err) {
|
|
// Haven't bootstrapped from HCP.
|
|
return nil, false
|
|
}
|
|
if err != nil {
|
|
ui.Warn("failed to check for config on disk, re-fetching from HCP: " + err.Error())
|
|
return nil, false
|
|
}
|
|
|
|
if err := checkCerts(dir); err != nil {
|
|
ui.Warn("failed to validate certs on disk, re-fetching from HCP: " + err.Error())
|
|
return nil, false
|
|
}
|
|
|
|
configJSON, err := loadBootstrapConfigJSON(dataDir)
|
|
if err != nil {
|
|
ui.Warn("failed to load bootstrap config from disk, re-fetching from HCP: " + err.Error())
|
|
return nil, false
|
|
}
|
|
|
|
mgmtToken, err := loadManagementToken(dir)
|
|
if err != nil {
|
|
ui.Warn("failed to load HCP management token from disk, re-fetching from HCP: " + err.Error())
|
|
return nil, false
|
|
}
|
|
|
|
return &RawBootstrapConfig{
|
|
ConfigJSON: configJSON,
|
|
ManagementToken: mgmtToken,
|
|
}, true
|
|
}
|
|
|
|
func loadBootstrapConfigJSON(dataDir string) (string, error) {
|
|
filename := filepath.Join(dataDir, constants.SubDir, ConfigFileName)
|
|
|
|
_, err := os.Stat(filename)
|
|
if os.IsNotExist(err) {
|
|
return "", nil
|
|
}
|
|
if err != nil {
|
|
return "", fmt.Errorf("failed to check for bootstrap config: %w", err)
|
|
}
|
|
|
|
jsonBs, err := os.ReadFile(filename)
|
|
if err != nil {
|
|
return "", fmt.Errorf(fmt.Sprintf("failed to read local bootstrap config file: %s", err))
|
|
}
|
|
return strings.TrimSpace(string(jsonBs)), nil
|
|
}
|
|
|
|
func loadManagementToken(dir string) (string, error) {
|
|
name := filepath.Join(dir, TokenFileName)
|
|
bytes, err := os.ReadFile(name)
|
|
if os.IsNotExist(err) {
|
|
return "", errors.New("configuration files on disk are incomplete, missing: " + name)
|
|
}
|
|
if err != nil {
|
|
return "", fmt.Errorf("failed to read: %w", err)
|
|
}
|
|
|
|
token := string(bytes)
|
|
if err := validateManagementToken(token); err != nil {
|
|
return "", fmt.Errorf("invalid management token: %w", err)
|
|
}
|
|
|
|
return token, nil
|
|
}
|
|
|
|
func checkCerts(dir string) error {
|
|
files := []string{
|
|
filepath.Join(dir, CAFileName),
|
|
filepath.Join(dir, CertFileName),
|
|
filepath.Join(dir, KeyFileName),
|
|
}
|
|
|
|
missing := make([]string, 0)
|
|
for _, file := range files {
|
|
_, err := os.Stat(file)
|
|
if os.IsNotExist(err) {
|
|
missing = append(missing, file)
|
|
continue
|
|
}
|
|
if err != nil {
|
|
return err
|
|
}
|
|
}
|
|
|
|
// If all the TLS files are missing, assume this is intentional.
|
|
// Existing clusters do not receive any TLS certs.
|
|
if len(missing) == len(files) {
|
|
return nil
|
|
}
|
|
|
|
// If only some of the files are missing, something went wrong.
|
|
if len(missing) > 0 {
|
|
return fmt.Errorf("configuration files on disk are incomplete, missing: %v", missing)
|
|
}
|
|
|
|
cert, key, caCerts, err := LoadCerts(dir)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to load certs from disk: %w", err)
|
|
}
|
|
|
|
if err = ValidateTLSCerts(cert, key, caCerts); err != nil {
|
|
return fmt.Errorf("invalid certs on disk: %w", err)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func LoadCerts(dir string) (cert, key string, caCerts []string, err error) {
|
|
certPEMBlock, err := os.ReadFile(filepath.Join(dir, CertFileName))
|
|
if err != nil {
|
|
return "", "", nil, err
|
|
}
|
|
keyPEMBlock, err := os.ReadFile(filepath.Join(dir, KeyFileName))
|
|
if err != nil {
|
|
return "", "", nil, err
|
|
}
|
|
|
|
caPEMs, err := os.ReadFile(filepath.Join(dir, CAFileName))
|
|
if err != nil {
|
|
return "", "", nil, err
|
|
}
|
|
caCerts, err = splitCACerts(caPEMs)
|
|
if err != nil {
|
|
return "", "", nil, fmt.Errorf("failed to parse CA certs: %w", err)
|
|
}
|
|
|
|
return string(certPEMBlock), string(keyPEMBlock), caCerts, nil
|
|
}
|
|
|
|
// splitCACerts breaks a buffer of concatenated PEM blocks back into one
// string per block. CA certs are stored together in a single file but are
// validated one at a time, hence the split.
//
// Every block must be of type CERTIFICATE. An input that yields no blocks at
// all is rejected as an invalid CA certificate.
func splitCACerts(caPEMs []byte) ([]string, error) {
	var certs []string

	rest := caPEMs
	for block, tail := pem.Decode(rest); block != nil; block, tail = pem.Decode(rest) {
		if block.Type != "CERTIFICATE" {
			return nil, fmt.Errorf("PEM-block should be CERTIFICATE type")
		}

		// Keep the original PEM text for this block: everything in front of the
		// undecoded tail. block.Bytes is no use here since it is not PEM encoded.
		consumed := len(rest) - len(tail)
		certs = append(certs, string(rest[:consumed]))
		rest = tail
	}

	if len(certs) == 0 {
		return nil, errors.New("invalid CA certificate")
	}
	return certs, nil
}
|
|
|
|
// ValidateTLSCerts checks that the CA cert, server cert, and key on disk are structurally valid.
|
|
//
|
|
// OPTIMIZE: This could be improved by returning an error if certs are expired or close to expiration.
|
|
// However, that requires issuing new certs on bootstrap requests, since returning an error
|
|
// would trigger a re-fetch from HCP.
|
|
func ValidateTLSCerts(cert, key string, caCerts []string) error {
|
|
leaf, err := tls.X509KeyPair([]byte(cert), []byte(key))
|
|
if err != nil {
|
|
return errors.New("invalid server certificate or key")
|
|
}
|
|
_, err = x509.ParseCertificate(leaf.Certificate[0])
|
|
if err != nil {
|
|
return errors.New("invalid server certificate")
|
|
}
|
|
|
|
for _, caCert := range caCerts {
|
|
_, err = connect.ParseCert(caCert)
|
|
if err != nil {
|
|
return errors.New("invalid CA certificate")
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// LoadManagementToken returns the management token, either by loading it from the persisted
|
|
// token config file or by fetching it from HCP if the token file does not exist.
|
|
func LoadManagementToken(ctx context.Context, logger hclog.Logger, client hcpclient.Client, dataDir string) (string, error) {
|
|
hcpCfgDir := filepath.Join(dataDir, constants.SubDir)
|
|
token, err := loadManagementToken(hcpCfgDir)
|
|
|
|
if err != nil {
|
|
logger.Debug("failed to load management token from local disk, fetching configuration from HCP", "error", err)
|
|
var err error
|
|
cfg, err := fetchBootstrapConfig(ctx, client, dataDir)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
logger.Debug("configuration fetched from HCP and saved on local disk")
|
|
token = cfg.ManagementToken
|
|
} else {
|
|
logger.Trace("loaded HCP configuration from local disk")
|
|
}
|
|
|
|
return token, nil
|
|
}
|