consul/agent/hcp/bootstrap/bootstrap.go
Nick Cellino 5fb6ab6a3a
Move HCP Manager lifecycle management out of Link controller (#20401)
* Add function to get update channel for watching HCP Link

* Add MonitorHCPLink function

This function can be called in a goroutine to manage the lifecycle
of the HCP manager.

* Update HCP Manager config in link monitor before starting

This updates MonitorHCPLink so that it updates the HCP manager
with an HCP client and management token when a Link is upserted.

* Let MonitorHCPLink handle lifecycle instead of link controller

* Remove cleanup from Link controller and move it to MonitorHCPLink

Previously, the Link Controller was responsible for cleaning up the
HCP-related files on the file system. This change makes it so
MonitorHCPLink handles this cleanup. As a result, we are able to remove
the PlacementEachServer placement strategy for the Link controller
because it no longer needs to do this per-node cleanup.

* Remove HCP Manager dependency from Link Controller

The Link controller does not need to have HCP Manager
as a dependency anymore, so this removes that dependency
in order to simplify the design.

* Add Linked prefix to Linked status variables

This is in preparation for adding a new status type to the
Link resource.

* Add new "validated" status type to link resource

The link resource controller will now set a "validated" status
in addition to the "linked" status. This is needed so that other
components (e.g. the HCP manager) know when the Link is ready to link
with HCP.

* Fix tests

* Handle new 'EndOfSnapshot' WatchList event

* Fix watch test

* Remove unnecessary config from TestAgent_scadaProvider

Since the SCADA provider is now started on agent startup
regardless of whether a cloud config is provided, this removes
the cloud config override from the relevant test.

This change is not strictly related to the rest of this PR; it is a small,
adjacent cleanup that was noticed while working on it.

* Simplify link watch test and remove sleep from link watch

This updates the link watch test so that it uses more mocks
and does not require setting up the infrastructure for the HCP Link
controller.

This also removes the time.Sleep delay in the link watcher loop in favor
of an error counter. When we receive 10 consecutive errors, we shut down
the link watcher loop.

* Add better logging for link validation. Remove EndOfSnapshot test.

* Refactor link monitor test into a table test

* Add some clarifying comments to link monitor

* Simplify link watch test

* Test a bunch more errors cases in link monitor test

* Use exponential backoff instead of errorCounter in LinkWatch

* Move link watch and link monitor into a single goroutine called from server.go

* Refactor HCP link watcher to use a single goroutine.

Previously, if the WatchClient errored, we would never have recovered
because we never retried creating the stream. With this change,
a single goroutine runs for the life of the server agent, and if the
WatchClient stream ever errors, we retry creating the stream with an
exponential backoff.
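
A minimal sketch of that retry loop (hypothetical names and signature; the
real watcher lives elsewhere in the agent/hcp package):

// Hypothetical sketch only: RunHCPLinkWatcher and watchFn are illustrative
// names, not the actual Consul API.
func RunHCPLinkWatcher(ctx context.Context, logger hclog.Logger, watchFn func(context.Context) error) {
	w := retry.Waiter{
		MinWait: 1 * time.Second,
		MaxWait: 1 * time.Minute,
		Jitter:  retry.NewJitter(25),
	}
	for ctx.Err() == nil {
		// (Re)create the stream and consume events until it errors out.
		if err := watchFn(ctx); err != nil {
			logger.Error("link watch stream failed, recreating", "error", err)
			if w.Wait(ctx) != nil {
				return // context canceled; shut down
			}
			continue
		}
		w.Reset() // stream was healthy; reset the backoff
	}
}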
2024-02-12 10:48:23 -05:00

// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: BUSL-1.1

// Package bootstrap handles bootstrapping an agent's config from HCP.
package bootstrap

import (
"bufio"
"context"
"crypto/tls"
"crypto/x509"
"encoding/json"
"encoding/pem"
"errors"
"fmt"
"os"
"path/filepath"
"strings"
"time"
"github.com/hashicorp/go-hclog"
"github.com/hashicorp/go-uuid"
"github.com/hashicorp/consul/agent/connect"
"github.com/hashicorp/consul/agent/hcp/bootstrap/constants"
hcpclient "github.com/hashicorp/consul/agent/hcp/client"
"github.com/hashicorp/consul/lib"
"github.com/hashicorp/consul/lib/retry"
)

const (
CAFileName = "server-tls-cas.pem"
CertFileName = "server-tls-cert.pem"
ConfigFileName = "server-config.json"
KeyFileName = "server-tls-key.pem"
TokenFileName = "hcp-management-token"
SuccessFileName = "successful-bootstrap"
)

// UI is a shim that allows the agent command to pass in its mitchellh/cli.UI
// so we can output useful messages to the user during bootstrapping. For
// example, if we have to retry several times to bootstrap, we don't want the
// agent to stall with no output, which would be the case if we only returned
// all intermediate warnings or errors.
type UI interface {
Output(string)
Warn(string)
Info(string)
Error(string)
}

// RawBootstrapConfig contains the Consul config as a raw JSON string and the
// management token, which were either loaded from persisted files or fetched
// from the HCP bootstrap endpoint.
type RawBootstrapConfig struct {
ConfigJSON string
ManagementToken string
}

// FetchBootstrapConfig will fetch bootstrap configuration from remote servers and persist it to disk.
// It will retry until successful or a terminal error condition is found (e.g. permission denied).
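//
// A typical call site (illustrative, not a fixed API contract) tries the
// persisted config first and only fetches when nothing usable is on disk:
//
//	if cfg, ok := LoadPersistedBootstrapConfig(dataDir, ui); ok {
//		return cfg, nil
//	}
//	return FetchBootstrapConfig(ctx, client, dataDir, ui)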
func FetchBootstrapConfig(ctx context.Context, client hcpclient.Client, dataDir string, ui UI) (*RawBootstrapConfig, error) {
w := retry.Waiter{
MinWait: 1 * time.Second,
MaxWait: 5 * time.Minute,
Jitter: retry.NewJitter(50),
}
for {
// Note we don't want to shadow `ctx` here since we need that for the Wait
// below.
reqCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
cfg, err := fetchBootstrapConfig(reqCtx, client, dataDir)
// Cancel now rather than deferring, so that contexts don't accumulate
// across retries of this loop.
cancel()
if err != nil {
if errors.Is(err, hcpclient.ErrUnauthorized) || errors.Is(err, hcpclient.ErrForbidden) {
// Don't retry on terminal errors
return nil, err
}
ui.Error(fmt.Sprintf("Error: failed to fetch bootstrap config from HCP, will retry in %s: %s",
w.NextWait().Round(time.Second), err))
if err := w.Wait(ctx); err != nil {
return nil, err
}
// Finished waiting, restart loop
continue
}
return cfg, nil
}
}

// fetchBootstrapConfig will fetch the bootstrap configuration from remote servers and persist it to disk.
func fetchBootstrapConfig(ctx context.Context, client hcpclient.Client, dataDir string) (*RawBootstrapConfig, error) {
reqCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
defer cancel()
resp, err := client.FetchBootstrap(reqCtx)
if err != nil {
return nil, fmt.Errorf("failed to fetch bootstrap config from HCP: %w", err)
}
bsCfg := resp
devMode := dataDir == ""
cfgJSON, err := persistAndProcessConfig(dataDir, devMode, bsCfg)
if err != nil {
return nil, fmt.Errorf("failed to persist config for existing cluster: %w", err)
}
return &RawBootstrapConfig{
ConfigJSON: cfgJSON,
ManagementToken: bsCfg.ManagementToken,
}, nil
}

// persistAndProcessConfig is called when we receive data from CCM.
// We validate and persist everything that was received, then also update
// the JSON config as needed.
func persistAndProcessConfig(dataDir string, devMode bool, bsCfg *hcpclient.BootstrapConfig) (string, error) {
if devMode {
// The agent is in dev mode, but we still need somewhere to temporarily
// persist the certs so that it can start up at all, since we don't support
// inline certs right now. Use a temp dir.
tmp, err := os.MkdirTemp(os.TempDir(), "consul-dev-")
if err != nil {
return "", fmt.Errorf("failed to create temp dir for certificates: %w", err)
}
dataDir = tmp
}
// Create subdir if it's not already there.
dir := filepath.Join(dataDir, constants.SubDir)
if err := lib.EnsurePath(dir, true); err != nil {
return "", fmt.Errorf("failed to ensure directory %q: %w", dir, err)
}
// Parse just to a map for now, since we only need to inject values into a
// specific place and parsing the whole Config struct is complicated.
var cfg map[string]any
if err := json.Unmarshal([]byte(bsCfg.ConsulConfig), &cfg); err != nil {
return "", fmt.Errorf("failed to unmarshal bootstrap config: %w", err)
}
// Avoid ever setting an initial_management token from HCP now that we can
// separately bootstrap an HCP management token with a distinct accessor ID.
//
// CCM will continue to return an initial_management token because previous versions of Consul
// cannot bootstrap an HCP management token distinct from the initial management token.
// This block can be deleted once CCM supports tailoring bootstrap config responses
// based on the version of Consul that requested it.
acls, aclsOK := cfg["acl"].(map[string]any)
if aclsOK {
tokens, tokensOK := acls["tokens"].(map[string]any)
if tokensOK {
delete(tokens, "initial_management")
}
}
var cfgJSON string
if bsCfg.TLSCert != "" {
if err := ValidateTLSCerts(bsCfg.TLSCert, bsCfg.TLSCertKey, bsCfg.TLSCAs); err != nil {
return "", fmt.Errorf("invalid certificates: %w", err)
}
// Persist the TLS cert files from the response since we need to refer to them
// as disk files either way.
if err := persistTLSCerts(dir, bsCfg.TLSCert, bsCfg.TLSCertKey, bsCfg.TLSCAs); err != nil {
return "", fmt.Errorf("failed to persist TLS certificates to dir %q: %w", dataDir, err)
}
// Store paths to the persisted TLS cert files.
cfg["ca_file"] = filepath.Join(dir, CAFileName)
cfg["cert_file"] = filepath.Join(dir, CertFileName)
cfg["key_file"] = filepath.Join(dir, KeyFileName)
// Convert the bootstrap config map back into a string
cfgJSONBytes, err := json.Marshal(cfg)
if err != nil {
return "", err
}
cfgJSON = string(cfgJSONBytes)
}
if !devMode {
// Persist the final config we need to add so that it is available locally after a restart.
// Assuming the configured data dir wasn't a tmp dir to start with.
if err := persistBootstrapConfig(dir, cfgJSON); err != nil {
return "", fmt.Errorf("failed to persist bootstrap config: %w", err)
}
// HCP only returns the management token if it requires Consul to
// initialize it
if bsCfg.ManagementToken != "" {
if err := validateManagementToken(bsCfg.ManagementToken); err != nil {
return "", fmt.Errorf("invalid management token: %w", err)
}
if err := persistManagementToken(dir, bsCfg.ManagementToken); err != nil {
return "", fmt.Errorf("failed to persist HCP management token: %w", err)
}
}
if err := persistSuccessMarker(dir); err != nil {
return "", fmt.Errorf("failed to persist success marker: %w", err)
}
}
return cfgJSON, nil
}
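
// persistSuccessMarker writes an empty sentinel file whose presence signals
// that a previous bootstrap from HCP completed successfully.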
func persistSuccessMarker(dir string) error {
name := filepath.Join(dir, SuccessFileName)
return os.WriteFile(name, []byte(""), 0600)
}

func persistTLSCerts(dir string, serverCert, serverKey string, caCerts []string) error {
if serverCert == "" || serverKey == "" {
return fmt.Errorf("unexpected bootstrap response from HCP: missing TLS information")
}
// Write out CA cert(s). We write them all to one file because Go's x509
// machinery will read as many certs as it finds from each PEM file provided
// and add them separately to the CertPool for validation.
f, err := os.OpenFile(filepath.Join(dir, CAFileName), os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0600)
if err != nil {
return err
}
bf := bufio.NewWriter(f)
for _, caPEM := range caCerts {
bf.WriteString(caPEM + "\n")
}
if err := bf.Flush(); err != nil {
return err
}
if err := f.Close(); err != nil {
return err
}
if err := os.WriteFile(filepath.Join(dir, CertFileName), []byte(serverCert), 0600); err != nil {
return err
}
if err := os.WriteFile(filepath.Join(dir, KeyFileName), []byte(serverKey), 0600); err != nil {
return err
}
return nil
}

// validateManagementToken does basic validation to ensure the token is a
// well-formed UUID. It assumes the token is non-empty.
func validateManagementToken(token string) error {
if _, err := uuid.ParseUUID(token); err != nil {
return errors.New("management token is not a valid UUID")
}
return nil
}
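
// persistManagementToken writes the HCP management token to disk with
// owner-only file permissions.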
func persistManagementToken(dir, token string) error {
name := filepath.Join(dir, TokenFileName)
return os.WriteFile(name, []byte(token), 0600)
}

func persistBootstrapConfig(dir, cfgJSON string) error {
// Persist the important bits we got from bootstrapping. The TLS certs are
// already persisted, just need to persist the config we are going to add.
name := filepath.Join(dir, ConfigFileName)
return os.WriteFile(name, []byte(cfgJSON), 0600)
}
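
// LoadPersistedBootstrapConfig attempts to load a previously persisted
// bootstrap config and management token from dataDir. It returns false if
// the files are missing or fail validation, in which case the caller should
// re-fetch the configuration from HCP.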
func LoadPersistedBootstrapConfig(dataDir string, ui UI) (*RawBootstrapConfig, bool) {
if dataDir == "" {
// There are no files to load when in dev mode.
return nil, false
}
dir := filepath.Join(dataDir, constants.SubDir)
_, err := os.Stat(filepath.Join(dir, SuccessFileName))
if os.IsNotExist(err) {
// Haven't bootstrapped from HCP.
return nil, false
}
if err != nil {
ui.Warn("failed to check for config on disk, re-fetching from HCP: " + err.Error())
return nil, false
}
if err := checkCerts(dir); err != nil {
ui.Warn("failed to validate certs on disk, re-fetching from HCP: " + err.Error())
return nil, false
}
configJSON, err := loadBootstrapConfigJSON(dataDir)
if err != nil {
ui.Warn("failed to load bootstrap config from disk, re-fetching from HCP: " + err.Error())
return nil, false
}
mgmtToken, err := loadManagementToken(dir)
if err != nil {
ui.Warn("failed to load HCP management token from disk, re-fetching from HCP: " + err.Error())
return nil, false
}
return &RawBootstrapConfig{
ConfigJSON: configJSON,
ManagementToken: mgmtToken,
}, true
}
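
// loadBootstrapConfigJSON reads the persisted bootstrap config JSON from the
// data directory, returning an empty string if the file does not exist.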
func loadBootstrapConfigJSON(dataDir string) (string, error) {
filename := filepath.Join(dataDir, constants.SubDir, ConfigFileName)
_, err := os.Stat(filename)
if os.IsNotExist(err) {
return "", nil
}
if err != nil {
return "", fmt.Errorf("failed to check for bootstrap config: %w", err)
}
jsonBs, err := os.ReadFile(filename)
if err != nil {
return "", fmt.Errorf(fmt.Sprintf("failed to read local bootstrap config file: %s", err))
}
return strings.TrimSpace(string(jsonBs)), nil
}
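
// loadManagementToken reads the persisted HCP management token from dir and
// validates that it is a well-formed UUID.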
func loadManagementToken(dir string) (string, error) {
name := filepath.Join(dir, TokenFileName)
bytes, err := os.ReadFile(name)
if os.IsNotExist(err) {
return "", errors.New("configuration files on disk are incomplete, missing: " + name)
}
if err != nil {
return "", fmt.Errorf("failed to read: %w", err)
}
token := string(bytes)
if err := validateManagementToken(token); err != nil {
return "", fmt.Errorf("invalid management token: %w", err)
}
return token, nil
}
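
// checkCerts verifies that the persisted TLS files are either all present and
// valid or all absent; existing clusters do not receive any TLS certs, so a
// complete absence is treated as intentional.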
func checkCerts(dir string) error {
files := []string{
filepath.Join(dir, CAFileName),
filepath.Join(dir, CertFileName),
filepath.Join(dir, KeyFileName),
}
missing := make([]string, 0)
for _, file := range files {
_, err := os.Stat(file)
if os.IsNotExist(err) {
missing = append(missing, file)
continue
}
if err != nil {
return err
}
}
// If all the TLS files are missing, assume this is intentional.
// Existing clusters do not receive any TLS certs.
if len(missing) == len(files) {
return nil
}
// If only some of the files are missing, something went wrong.
if len(missing) > 0 {
return fmt.Errorf("configuration files on disk are incomplete, missing: %v", missing)
}
cert, key, caCerts, err := LoadCerts(dir)
if err != nil {
return fmt.Errorf("failed to load certs from disk: %w", err)
}
if err = ValidateTLSCerts(cert, key, caCerts); err != nil {
return fmt.Errorf("invalid certs on disk: %w", err)
}
return nil
}
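
// LoadCerts reads the persisted server certificate, private key, and CA
// bundle from dir, splitting the CA bundle back into individual PEM blocks.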
func LoadCerts(dir string) (cert, key string, caCerts []string, err error) {
certPEMBlock, err := os.ReadFile(filepath.Join(dir, CertFileName))
if err != nil {
return "", "", nil, err
}
keyPEMBlock, err := os.ReadFile(filepath.Join(dir, KeyFileName))
if err != nil {
return "", "", nil, err
}
caPEMs, err := os.ReadFile(filepath.Join(dir, CAFileName))
if err != nil {
return "", "", nil, err
}
caCerts, err = splitCACerts(caPEMs)
if err != nil {
return "", "", nil, fmt.Errorf("failed to parse CA certs: %w", err)
}
return string(certPEMBlock), string(keyPEMBlock), caCerts, nil
}

// splitCACerts takes concatenated PEM blocks and splits them back up into
// individual strings. This is needed because the CA certs are written to a
// single file but are validated individually.
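//
// For example (illustrative): a CA file holding two concatenated CERTIFICATE
// blocks yields a two-element slice, each element one complete PEM block
// including its BEGIN/END markers.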
func splitCACerts(caPEMs []byte) ([]string, error) {
var out []string
for {
nextBlock, remaining := pem.Decode(caPEMs)
if nextBlock == nil {
break
}
if nextBlock.Type != "CERTIFICATE" {
return nil, fmt.Errorf("PEM-block should be CERTIFICATE type")
}
// Collect up to the start of the remaining bytes.
// We don't grab nextBlock.Bytes because it's not PEM encoded.
out = append(out, string(caPEMs[:len(caPEMs)-len(remaining)]))
caPEMs = remaining
}
if len(out) == 0 {
return nil, errors.New("invalid CA certificate")
}
return out, nil
}

// ValidateTLSCerts checks that the CA cert, server cert, and key on disk are structurally valid.
//
// OPTIMIZE: This could be improved by returning an error if certs are expired or close to expiration.
// However, that requires issuing new certs on bootstrap requests, since returning an error
// would trigger a re-fetch from HCP.
func ValidateTLSCerts(cert, key string, caCerts []string) error {
leaf, err := tls.X509KeyPair([]byte(cert), []byte(key))
if err != nil {
return errors.New("invalid server certificate or key")
}
_, err = x509.ParseCertificate(leaf.Certificate[0])
if err != nil {
return errors.New("invalid server certificate")
}
for _, caCert := range caCerts {
_, err = connect.ParseCert(caCert)
if err != nil {
return errors.New("invalid CA certificate")
}
}
return nil
}

// LoadManagementToken returns the management token, either by loading it from the persisted
// token config file or by fetching it from HCP if the token file does not exist.
func LoadManagementToken(ctx context.Context, logger hclog.Logger, client hcpclient.Client, dataDir string) (string, error) {
hcpCfgDir := filepath.Join(dataDir, constants.SubDir)
token, err := loadManagementToken(hcpCfgDir)
if err != nil {
logger.Debug("failed to load management token from local disk, fetching configuration from HCP", "error", err)
cfg, err := fetchBootstrapConfig(ctx, client, dataDir)
if err != nil {
return "", err
}
logger.Debug("configuration fetched from HCP and saved on local disk")
token = cfg.ManagementToken
} else {
logger.Trace("loaded HCP configuration from local disk")
}
return token, nil
}