consul/agent/testagent.go
Derek Menteer b8b8ad46fc
Various race condition and test fixes. (#20212)
* Increase timeouts for flakey peering test.

* Various test fixes.

* Fix race condition in reconcilePeering.

This resolves an issue where a peering object in the state store was
incorrectly mutated by a function, resulting in the test being flagged as
failing when the -race flag was used.
2024-01-16 08:57:43 -06:00

747 lines
20 KiB
Go

// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: BUSL-1.1
package agent
import (
"bytes"
"context"
"crypto/x509"
"fmt"
"io"
"net"
"net/http/httptest"
"path/filepath"
"strconv"
"strings"
"testing"
"text/template"
"time"
"github.com/armon/go-metrics"
"github.com/hashicorp/go-hclog"
"github.com/hashicorp/go-uuid"
"github.com/stretchr/testify/require"
"github.com/hashicorp/consul/acl"
"github.com/hashicorp/consul/agent/config"
"github.com/hashicorp/consul/agent/connect"
"github.com/hashicorp/consul/agent/consul"
"github.com/hashicorp/consul/agent/structs"
"github.com/hashicorp/consul/api"
"github.com/hashicorp/consul/lib"
"github.com/hashicorp/consul/sdk/freeport"
"github.com/hashicorp/consul/sdk/testutil"
"github.com/hashicorp/consul/sdk/testutil/retry"
"github.com/hashicorp/consul/tlsutil"
)
// TestAgent encapsulates an Agent with a default configuration and
// startup procedure suitable for testing. Start reports startup problems
// as errors; StartTestAgent and NewTestAgent fail the test instead, and a
// few accessors (HTTPAddr, Client, consulConfig) panic on error.
// When DataDir is empty, Start creates a data directory through
// testutil.TempDir.
type TestAgent struct {
	// Name is an optional name of the agent. Start falls back to "TestAgent"
	// when it is empty; the name is used for the logger and the data
	// directory prefix.
	Name string
	// configFiles is handed to the config loader as LoadOpts.ConfigFiles.
	configFiles []string
	// HCL is extra configuration for the agent. Start passes it to the config
	// loader after the built-in test configuration and the port allocation,
	// and before the data_dir setting.
	HCL string
	// Config is the agent configuration. Start replaces it with the runtime
	// configuration it loads. A Config supplied before Start is only
	// consulted for AutoReloadConfigCoalesceInterval, which is carried over
	// when the loaded value is zero.
	Config *config.RuntimeConfig
	// LogOutput is the sink for the logs. If nil, Start uses a buffer obtained
	// from testutil.NewLogBuffer.
	// The io.Writer must allow concurrent reads and writes. Note that
	// bytes.Buffer is not safe for concurrent reads and writes.
	LogOutput io.Writer
	// LogLevel is the logger level. Start substitutes testutil.TestLogLevel
	// when it is left at the zero value.
	LogLevel hclog.Level
	// DataDir may be set to a directory which exists. If it is not set,
	// TestAgent.Start will create one and set DataDir to the directory path.
	// In all cases the agent will be configured to use this path as the data directory.
	// NOTE(review): removal of the directory is not visible in this file;
	// presumably testutil.TempDir handles it for directories it creates.
	DataDir string
	// UseHTTPS, if true, will disable the HTTP port and enable the HTTPS
	// one.
	UseHTTPS bool
	// UseGRPCTLS, if true, will disable the GRPC port and enable the GRPC+TLS
	// one.
	// NOTE(review): nothing in the visible part of this file reads this
	// field; confirm it is still honored.
	UseGRPCTLS bool
	// dns is a reference to the first started DNS endpoint.
	// It is valid after Start().
	dns dnsServer
	// srv is an HTTPHandlers that may be used to test http endpoints.
	srv *HTTPHandlers
	// Overrides is an hcl config source to use to override otherwise
	// non-user settable configurations
	Overrides string
	// OverrideDeps allows the BaseDeps to be modified before starting the
	// embedded agent. Start calls it, when non-nil, right before New.
	OverrideDeps func(deps *BaseDeps)
	// Skips asserting that the ACL bootstrap has occurred. This may be required
	// for various tests where multiple servers are joined later.
	disableACLBootstrapCheck bool
	// Agent is the embedded consul agent.
	// It is valid after Start().
	*Agent
}
// TestAgentOpts carries the optional settings accepted by NewTestAgent.
type TestAgentOpts struct {
	// Skips asserting that the ACL bootstrap has occurred. This may be required
	// for various tests where multiple servers are joined later.
	DisableACLBootstrapCheck bool
}
// NewTestAgent starts an agent configured with the given HCL and returns it.
// The test is failed if the agent cannot be started. A cleanup function that
// shuts the agent down is registered on t, so callers do not need to call
// Shutdown themselves.
//
// At most one TestAgentOpts value may be supplied.
func NewTestAgent(t *testing.T, hcl string, opts ...TestAgentOpts) *TestAgent {
	// Options are variadic purely so that existing `NewTestAgent()` call sites
	// keep compiling when new optional settings are introduced.
	require.LessOrEqual(t, len(opts), 1, "NewTestAgent cannot accept more than one opts argument")

	spec := TestAgent{HCL: hcl}
	// The assertion above guarantees this loop runs zero or one time.
	for _, o := range opts {
		spec.disableACLBootstrapCheck = o.DisableACLBootstrapCheck
	}

	started := StartTestAgent(t, spec)
	t.Cleanup(func() { started.Shutdown() })
	return started
}
// NewTestAgentWithConfigFile starts an agent configured with the given HCL
// plus the listed configuration files and returns it. The test is failed if
// the agent cannot be started. A cleanup function that shuts the agent down
// is registered on t.
func NewTestAgentWithConfigFile(t *testing.T, hcl string, configFiles []string) *TestAgent {
	spec := TestAgent{
		HCL:         hcl,
		configFiles: configFiles,
	}
	started := StartTestAgent(t, spec)
	t.Cleanup(func() { started.Shutdown() })
	return started
}
// StartTestAgent starts the given agent and waits for it to become available,
// retrying with retry.ThreeTimes. If the agent still fails to start the test
// is marked failed and execution stops.
//
// The agent is received by value; the returned pointer refers to that copy.
//
// The caller is responsible for calling Shutdown() to stop the agent and remove
// temporary directories.
func StartTestAgent(t *testing.T, a TestAgent) *TestAgent {
	t.Helper()

	started := &a
	attempt := func(r *retry.R) {
		r.Helper()
		err := started.Start(r)
		if err != nil {
			r.Fatal(err)
		}
	}
	retry.RunWith(retry.ThreeTimes(), t, attempt)

	return started
}
func TestConfigHCL(nodeID string) string {
return fmt.Sprintf(`
bind_addr = "127.0.0.1"
advertise_addr = "127.0.0.1"
datacenter = "dc1"
bootstrap = true
server = true
node_id = "%[1]s"
node_name = "Node-%[1]s"
connect {
enabled = true
ca_config {
cluster_id = "%[2]s"
}
}
performance {
raft_multiplier = 1
}
peering {
enabled = true
}`, nodeID, connect.TestClusterID,
)
}
// Start starts a test agent. It returns an error if the agent could not be started.
// If no error is returned, the caller must call Shutdown() when finished.
//
// Start fills in DataDir and LogLevel when they are unset, allocates ports,
// loads the runtime configuration (built-in test HCL, ports, a.HCL and the
// data_dir setting, in that order), stores the result in a.Config, then
// creates and starts the embedded agent and waits for it to become usable.
// Calling Start on an already started TestAgent returns an error.
func (a *TestAgent) Start(t testutil.TestingTB) error {
	t.Helper()
	if a.Agent != nil {
		return fmt.Errorf("TestAgent already started")
	}
	name := a.Name
	if name == "" {
		name = "TestAgent"
	}
	if a.DataDir == "" {
		dirname := name + "-agent"
		a.DataDir = testutil.TempDir(t, dirname)
	}
	// Convert windows style path to posix style path to avoid illegal char escape
	// error when hcl parsing.
	d := filepath.ToSlash(a.DataDir)
	hclDataDir := fmt.Sprintf(`data_dir = "%s"`, d)
	logOutput := a.LogOutput
	if logOutput == nil {
		logOutput = testutil.NewLogBuffer(t)
	}
	if a.LogLevel == 0 {
		a.LogLevel = testutil.TestLogLevel
	}
	logger := hclog.NewInterceptLogger(&hclog.LoggerOptions{
		Level:      a.LogLevel,
		Output:     logOutput,
		TimeFormat: "04:05.000",
		Name:       name,
	})
	portsConfig := randomPortsSource(t, a.UseHTTPS)
	// Create NodeID outside the closure, so that it does not change
	testHCLConfig := TestConfigHCL(NodeID())
	loader := func(source config.Source) (config.LoadResult, error) {
		opts := config.LoadOpts{
			DefaultConfig: source,
			HCL:           []string{testHCLConfig, portsConfig, a.HCL, hclDataDir},
			Overrides: []config.Source{
				config.FileSource{
					Name:   "test-overrides",
					Format: "hcl",
					Data:   a.Overrides},
				config.DefaultConsulSource(),
				config.DevConsulSource(),
			},
			ConfigFiles: a.configFiles,
		}
		result, err := config.Load(opts)
		if result.RuntimeConfig != nil {
			// If prom metrics need to be enabled, do not disable telemetry
			if result.RuntimeConfig.Telemetry.PrometheusOpts.Expiration > 0 {
				result.RuntimeConfig.Telemetry.Disable = false
			} else {
				result.RuntimeConfig.Telemetry.Disable = true
			}
			// Lower the resync interval for tests.
			result.RuntimeConfig.LocalProxyConfigResyncInterval = 250 * time.Millisecond
		}
		return result, err
	}
	bd, err := NewBaseDeps(loader, logOutput, logger)
	if err != nil {
		return fmt.Errorf("failed to create base deps: %w", err)
	}
	bd.Logger = logger
	// if we are not testing telemetry things, let's use a "mock" sink for metrics
	if bd.RuntimeConfig.Telemetry.Disable {
		bd.MetricsConfig = &lib.MetricsConfig{
			Handler: metrics.NewInmemSink(1*time.Second, time.Minute),
		}
	}
	// A caller-supplied Config only contributes its coalesce interval, and
	// only when the loaded configuration did not set one.
	if a.Config != nil && bd.RuntimeConfig.AutoReloadConfigCoalesceInterval == 0 {
		bd.RuntimeConfig.AutoReloadConfigCoalesceInterval = a.Config.AutoReloadConfigCoalesceInterval
	}
	a.Config = bd.RuntimeConfig
	if a.OverrideDeps != nil {
		a.OverrideDeps(&bd)
	}
	agent, err := New(bd)
	if err != nil {
		return fmt.Errorf("Error creating agent: %w", err)
	}
	id := string(a.Config.NodeID)
	if err := agent.Start(context.Background()); err != nil {
		// Tear down whatever was started before reporting the failure.
		agent.ShutdownAgent()
		agent.ShutdownEndpoints()
		return fmt.Errorf("%s %s Error starting agent: %w", id, name, err)
	}
	a.Agent = agent
	// Start the anti-entropy syncer
	a.Agent.StartSync()
	a.srv = a.Agent.httpHandlers
	if err := a.waitForUp(); err != nil {
		a.Shutdown()
		// Reset so that a subsequent Start (e.g. from a retry) is not rejected
		// as "already started".
		a.Agent = nil
		return fmt.Errorf("error waiting for test agent to start: %w", err)
	}
	// The agent may be running without any DNS listener. Leave a.dns nil in
	// that case (DNSAddr then reports "") rather than indexing an empty list.
	if len(a.dnsServers) > 0 {
		a.dns = a.dnsServers[0]
	}
	return nil
}
// waitForUp waits for leader election, or waits for the agent HTTP
// endpoint to start responding, depending on the agent config.
//
// It polls using the timeout and wait interval of retry.TwoSeconds. For a
// bootstrapping server it (optionally) checks that ACLs are bootstrapped and
// then requires a known leader; in every other mode it probes
// /v1/agent/self. It returns nil on success, or an error carrying the last
// failure once the deadline has passed.
func (a *TestAgent) waitForUp() error {
	timer := retry.TwoSeconds()
	deadline := time.Now().Add(timer.Timeout)
	var retErr error
	// out is declared outside the loop so that its Index feeds MinQueryIndex
	// on the next Catalog.ListNodes attempt.
	var out structs.IndexedNodes
	for ; !time.Now().After(deadline); time.Sleep(timer.Wait) {
		if len(a.apiServers.servers) == 0 {
			retErr = fmt.Errorf("waiting for server")
			continue // fail, try again
		}
		if a.Config.Bootstrap && a.Config.ServerMode {
			if !a.disableACLBootstrapCheck {
				if ok, err := a.isACLBootstrapped(); err != nil {
					retErr = fmt.Errorf("error checking for acl bootstrap: %w", err)
					continue // fail, try again
				} else if !ok {
					retErr = fmt.Errorf("acl system not bootstrapped yet")
					continue // fail, try again
				}
			}
			if a.baseDeps.UseV2Resources() {
				// Ask for the leader of the agent's own datacenter, the same
				// datacenter the catalog query below targets.
				args := structs.DCSpecificRequest{
					Datacenter: a.Config.Datacenter,
				}
				var leader string
				if err := a.RPC(context.Background(), "Status.Leader", args, &leader); err != nil {
					retErr = fmt.Errorf("Status.Leader failed: %v", err)
					continue // fail, try again
				}
				if leader == "" {
					retErr = fmt.Errorf("No leader")
					continue // fail, try again
				}
				return nil // success
			}
			// Ensure we have a leader and a node registration.
			args := &structs.DCSpecificRequest{
				Datacenter: a.Config.Datacenter,
				QueryOptions: structs.QueryOptions{
					MinQueryIndex: out.Index,
					MaxQueryTime:  25 * time.Millisecond,
				},
			}
			if err := a.RPC(context.Background(), "Catalog.ListNodes", args, &out); err != nil {
				retErr = fmt.Errorf("Catalog.ListNodes failed: %v", err)
				continue // fail, try again
			}
			if !out.QueryMeta.KnownLeader {
				retErr = fmt.Errorf("No leader")
				continue // fail, try again
			}
			if out.Index == 0 {
				retErr = fmt.Errorf("Consul index is 0")
				continue // fail, try again
			}
			return nil // success
		} else {
			req := httptest.NewRequest("GET", "/v1/agent/self", nil)
			resp := httptest.NewRecorder()
			_, err := a.srv.AgentSelf(resp, req)
			if acl.IsErrPermissionDenied(err) || resp.Code == 403 {
				// permission denied is enough to show that the client is
				// connected to the servers as it would get a 503 if
				// it couldn't connect to them.
			} else if err != nil && resp.Code != 200 {
				// NOTE(review): this retries only when BOTH an error was
				// returned and a non-200 code was recorded; an error returned
				// without a status being written is treated as success.
				// Confirm whether "||" was intended.
				retErr = fmt.Errorf("failed OK response: %v", err)
				continue
			}
			return nil // success
		}
	}
	return fmt.Errorf("unavailable. last error: %v", retErr)
}
// isACLBootstrapped reports whether the global-management ACL policy can be
// read with the configured initial management token.
//
// When no initial management token is configured the check cannot be made;
// a warning is logged and true is returned. "ACL not found" (403) and
// not-yet-bootstrapped errors yield (false, nil); any other error is returned.
func (a *TestAgent) isACLBootstrapped() (bool, error) {
	token := a.config.ACLInitialManagementToken
	if token == "" {
		a.Agent.logger.Named("test").Warn("Skipping check for ACL bootstrapping")
		return true, nil // We lie because we can't check.
	}

	const policyName = structs.ACLPolicyGlobalManagementName
	request := httptest.NewRequest("GET", "/v1/acl/policy/name/"+policyName, nil)
	request.Header.Add("X-Consul-Token", token)
	recorder := httptest.NewRecorder()

	raw, err := a.srv.ACLPolicyReadByName(recorder, request)
	switch {
	case err == nil:
		// fall through to inspecting the payload below
	case strings.Contains(err.Error(), "Unexpected response code: 403 (ACL not found)"):
		return false, nil
	case isACLNotBootstrapped(err):
		return false, nil
	default:
		return false, err
	}

	switch policy := raw.(type) {
	case nil:
		return false, nil
	case *structs.ACLPolicy:
		// A typed nil pointer still means "no policy".
		return policy != nil, nil
	default:
		return false, fmt.Errorf("expected ACLPolicy got %T", raw)
	}
}
// isACLNotBootstrapped reports whether err's message indicates that the ACL
// system is not ready yet: either it has not been bootstrapped or it is
// still in legacy mode. err must be non-nil.
func isACLNotBootstrapped(err error) bool {
	notReadyMarkers := [...]string{
		"ACL system must be bootstrapped before making any requests that require authorization",
		"The ACL system is currently in legacy mode",
	}
	msg := err.Error()
	for _, marker := range notReadyMarkers {
		if strings.Contains(msg, marker) {
			return true
		}
	}
	return false
}
// Shutdown stops the embedded agent and then its endpoints. It is a no-op
// when the agent was never started. If stopping the agent fails, that error
// is returned (the endpoints are still shut down); otherwise Shutdown blocks
// until the agent's shutdown channel is signalled.
func (a *TestAgent) Shutdown() error {
	agent := a.Agent
	if agent == nil {
		return nil
	}

	// Endpoints go down last, after the agent itself.
	defer agent.ShutdownEndpoints()

	err := agent.ShutdownAgent()
	if err != nil {
		return err
	}
	<-agent.ShutdownCh()
	return nil
}
// DNSAddr returns the address of the DNS endpoint recorded by Start, or the
// empty string when none has been recorded.
func (a *TestAgent) DNSAddr() string {
	if server := a.dns; server != nil {
		return server.GetAddr()
	}
	return ""
}
// HTTPAddr returns the address of the first API server registered with the
// "http" protocol. It panics when there is no such server.
func (a *TestAgent) HTTPAddr() string {
	if addr, err := firstAddr(a.Agent.apiServers, "http"); err == nil {
		return addr.String()
	}
	// TODO: t.Fatal instead of panic
	panic("no http server registered")
}
// firstAddr is a test helper that scans the registered API servers and
// returns the address of the first one using the given protocol. An error is
// returned when no server matches.
func firstAddr(s *apiServers, protocol string) (net.Addr, error) {
	for _, candidate := range s.servers {
		if candidate.Protocol != protocol {
			continue
		}
		return candidate.Addr, nil
	}
	return nil, fmt.Errorf("no server registered with protocol %v", protocol)
}
// SegmentAddr returns the LAN segment address for the named segment when the
// agent's delegate is a *consul.Server, and the empty string otherwise.
func (a *TestAgent) SegmentAddr(name string) string {
	server, isServer := a.Agent.delegate.(*consul.Server)
	if !isServer {
		return ""
	}
	return server.LANSegmentAddr(name)
}
// Client returns an API client built from api.DefaultConfig with its address
// pointed at this agent's HTTP endpoint. It panics if the client cannot be
// created.
func (a *TestAgent) Client() *api.Client {
	cfg := api.DefaultConfig()
	cfg.Address = a.HTTPAddr()

	client, err := api.NewClient(cfg)
	if err == nil {
		return client
	}
	panic(fmt.Sprintf("Error creating consul API client: %s", err))
}
// DNSDisableCompression sets the DNSDisableCompression flag in the agent
// config to b and reloads every started DNS server with it. With no DNS
// servers running the config is left untouched.
func (a *TestAgent) DNSDisableCompression(b bool) {
	for _, server := range a.dnsServers {
		a.config.DNSDisableCompression = b
		server.ReloadConfig(a.config)
	}
}
// consulConfig builds a consul.Config from the embedded agent's config and
// logger. It panics if the conversion fails.
//
// FIXME: this should t.Fatal on error, not panic.
// TODO: rename to newConsulConfig
// TODO: remove TestAgent receiver, accept a.Agent.config as an arg
func (a *TestAgent) consulConfig() *consul.Config {
	cfg, err := newConsulConfig(a.Agent.config, a.Agent.logger)
	if err == nil {
		return cfg
	}
	panic(err)
}
// retryShim wraps a *retry.R and adds a Name method. It exists because
// sdk/freeport cannot be used with *retry.R without changing function
// signatures; the shim avoids having to sync sdk submodule updates.
type retryShim struct {
	*retry.R
	name string
}

// Name returns the name stored in the shim.
func (s *retryShim) Name() string {
	return s.name
}
// randomPortsSource returns an HCL "ports" block populated with seven ports
// obtained from freeport. Ports are taken from fixed size random blocks.
// This does not eliminate the chance for port conflict but
// reduces it significantly with little overhead. Furthermore, asking
// the kernel for a random port by binding to port 0 prolongs the test
// execution (in our case +20sec) while also not fully eliminating the
// chance of port conflicts for concurrently executed test binaries.
// Instead of relying on one set of ports to be sufficient we retry
// starting the agent with different ports on port conflict.
//
// When useHTTPS is true the http port is disabled (-1) and https is
// assigned; otherwise http is assigned and https is disabled.
func randomPortsSource(t testutil.TestingTB, useHTTPS bool) string {
	var ports []int
	retry.RunWith(retry.TwoSeconds(), t, func(r *retry.R) {
		ports = freeport.GetN(r, 7)
	})

	// Exactly one of http/https receives ports[1]; the other is disabled.
	httpPort, httpsPort := ports[1], -1
	if useHTTPS {
		httpPort, httpsPort = -1, ports[1]
	}

	itoa := strconv.Itoa
	return `
ports = {
dns = ` + itoa(ports[0]) + `
http = ` + itoa(httpPort) + `
https = ` + itoa(httpsPort) + `
serf_lan = ` + itoa(ports[2]) + `
serf_wan = ` + itoa(ports[3]) + `
server = ` + itoa(ports[4]) + `
grpc = ` + itoa(ports[5]) + `
grpc_tls = ` + itoa(ports[6]) + `
}
`
}
// NodeID returns a freshly generated UUID string for use as a node ID. It
// panics if the UUID cannot be generated.
func NodeID() string {
	id, err := uuid.GenerateUUID()
	if err == nil {
		return id
	}
	panic(err)
}
// TestConfig returns a unique default configuration for testing an agent.
// A fresh node ID is generated for every call, the given sources are applied
// as overrides, and any load warnings are sent to logger. It panics if the
// configuration cannot be loaded.
func TestConfig(logger hclog.Logger, sources ...config.Source) *config.RuntimeConfig {
	nodeID := NodeID()
	baseHCL := `
bind_addr = "127.0.0.1"
advertise_addr = "127.0.0.1"
datacenter = "dc1"
bootstrap = true
server = true
node_id = "` + nodeID + `"
node_name = "Node-` + nodeID + `"
connect {
enabled = true
ca_config {
cluster_id = "` + connect.TestClusterID + `"
}
}
performance {
raft_multiplier = 1
}
`

	loaded, err := config.Load(config.LoadOpts{
		DefaultConfig: config.FileSource{
			Name:   "test",
			Format: "hcl",
			Data:   baseHCL,
		},
		Overrides: sources,
	})
	if err != nil {
		panic("config.Load failed: " + err.Error())
	}
	for _, warning := range loaded.Warnings {
		logger.Warn(warning)
	}

	cfg := loaded.RuntimeConfig
	// Effectively disables the delay after root rotation before requesting CSRs
	// to make tests deterministic. 0 results in default jitter being applied but a
	// tiny delay is effectively the same.
	cfg.ConnectTestCALeafRootChangeSpread = 1 * time.Nanosecond
	// allows registering objects with the PeerName
	cfg.PeeringTestAllowPeerRegistrations = true
	return cfg
}
// TestACLConfig returns a default HCL configuration for testing an agent
// with ACLs: primary datacenter "dc1", default policy "deny", and fixed
// initial-management, agent and agent-recovery tokens.
func TestACLConfig() string {
	const aclHCL = `
primary_datacenter = "dc1"
acl {
enabled = true
default_policy = "deny"
tokens {
initial_management = "root"
agent = "root"
agent_recovery = "towel"
}
}
`
	return aclHCL
}
// Default ACL token values used by DefaultTestACLConfigParams. Both are
// plain 36-character UUID strings.
const (
	TestDefaultInitialManagementToken = "d9f05e83-a7ae-47ce-839e-c0d53a68c00a"
	// The value previously carried a stray trailing single quote inside the
	// string literal.
	TestDefaultAgentRecoveryToken = "bca580d4-db07-4074-b766-48acc9676955"
)
// TestACLConfigParams holds the values substituted into the ACL
// configuration template by TestACLConfigWithParams. Empty string fields
// are omitted from the generated configuration.
type TestACLConfigParams struct {
	// PrimaryDatacenter is rendered as primary_datacenter.
	PrimaryDatacenter string
	// DefaultPolicy is rendered as acl.default_policy.
	DefaultPolicy string
	// InitialManagementToken is rendered as acl.tokens.initial_management.
	InitialManagementToken string
	// AgentToken is rendered as acl.tokens.agent.
	AgentToken string
	// DefaultToken is rendered as acl.tokens.default.
	DefaultToken string
	// AgentRecoveryToken is rendered as acl.tokens.agent_recovery.
	AgentRecoveryToken string
	// ReplicationToken is rendered as acl.tokens.replication.
	ReplicationToken string
	// DNSToken is the DNS token; it is one of the tokens counted by
	// HasConfiguredTokens.
	DNSToken string
	// EnableTokenReplication is rendered as acl.enable_token_replication.
	EnableTokenReplication bool
}
// DefaultTestACLConfigParams returns a new parameter set with primary
// datacenter "dc1", default policy "deny", the default initial management
// token used for both the management and agent tokens, and the default agent
// recovery token.
func DefaultTestACLConfigParams() *TestACLConfigParams {
	params := new(TestACLConfigParams)
	params.PrimaryDatacenter = "dc1"
	params.DefaultPolicy = "deny"
	params.InitialManagementToken = TestDefaultInitialManagementToken
	params.AgentToken = TestDefaultInitialManagementToken
	params.AgentRecoveryToken = TestDefaultAgentRecoveryToken
	return params
}
// HasConfiguredTokens reports whether at least one of the token fields
// (initial management, agent, default, agent recovery, replication, DNS) is
// non-empty.
func (p *TestACLConfigParams) HasConfiguredTokens() bool {
	tokens := [...]string{
		p.InitialManagementToken,
		p.AgentToken,
		p.DefaultToken,
		p.AgentRecoveryToken,
		p.ReplicationToken,
		p.DNSToken,
	}
	for _, token := range tokens {
		if token != "" {
			return true
		}
	}
	return false
}
func TestACLConfigNew() string {
return TestACLConfigWithParams(&TestACLConfigParams{
PrimaryDatacenter: "dc1",
DefaultPolicy: "deny",
InitialManagementToken: "root",
AgentToken: "root",
AgentRecoveryToken: "towel",
DNSToken: "dns",
})
}
// aclConfigTpl renders an HCL ACL configuration from a parameter value that
// exposes the fields PrimaryDatacenter, DefaultPolicy,
// EnableTokenReplication, the individual token fields, and a
// HasConfiguredTokens method. Empty string fields are omitted, and the
// tokens block is emitted only when HasConfiguredTokens reports true.
// Every token field, including DNSToken, has a matching entry in the tokens
// block.
var aclConfigTpl = template.Must(template.New("ACL Config").Parse(`
{{- if ne .PrimaryDatacenter "" -}}
primary_datacenter = "{{ .PrimaryDatacenter }}"
{{end -}}
acl {
enabled = true
{{- if ne .DefaultPolicy ""}}
default_policy = "{{ .DefaultPolicy }}"
{{- end}}
enable_token_replication = {{printf "%t" .EnableTokenReplication }}
{{- if .HasConfiguredTokens}}
tokens {
{{- if ne .InitialManagementToken ""}}
initial_management = "{{ .InitialManagementToken }}"
{{- end}}
{{- if ne .AgentToken ""}}
agent = "{{ .AgentToken }}"
{{- end}}
{{- if ne .AgentRecoveryToken "" }}
agent_recovery = "{{ .AgentRecoveryToken }}"
{{- end}}
{{- if ne .DefaultToken "" }}
default = "{{ .DefaultToken }}"
{{- end}}
{{- if ne .ReplicationToken "" }}
replication = "{{ .ReplicationToken }}"
{{- end}}
{{- if ne .DNSToken "" }}
dns = "{{ .DNSToken }}"
{{- end}}
}
{{- end}}
}
`))
// TestACLConfigWithParams renders the ACL configuration template with the
// given parameters and returns the resulting HCL. A nil params is replaced
// by DefaultTestACLConfigParams(). It panics if the template fails to
// execute.
func TestACLConfigWithParams(params *TestACLConfigParams) string {
	if params == nil {
		params = DefaultTestACLConfigParams()
	}

	var rendered bytes.Buffer
	if err := aclConfigTpl.Execute(&rendered, &params); err != nil {
		panic(fmt.Sprintf("Failed to generate test ACL config: %v", err))
	}
	return rendered.String()
}
// testTLSCertificates generates a private key, a CA signed with it, and a
// certificate/key pair for serverName issued by that CA (valid 365 days,
// usable for both server and client auth). The certificate, its key and the
// CA are returned as produced by tlsutil; on any failure all three strings
// are empty and err is set.
func testTLSCertificates(serverName string) (cert string, key string, cacert string, err error) {
	signer, _, err := tlsutil.GeneratePrivateKey()
	if err != nil {
		return "", "", "", err
	}

	cacert, _, err = tlsutil.GenerateCA(tlsutil.CAOpts{Signer: signer})
	if err != nil {
		return "", "", "", err
	}

	certOpts := tlsutil.CertOpts{
		Signer:   signer,
		CA:       cacert,
		Name:     "Test Cert Name",
		Days:     365,
		DNSNames: []string{serverName},
		ExtKeyUsage: []x509.ExtKeyUsage{
			x509.ExtKeyUsageServerAuth,
			x509.ExtKeyUsageClientAuth,
		},
	}
	cert, key, err = tlsutil.GenerateCert(certOpts)
	if err != nil {
		return "", "", "", err
	}
	return cert, key, cacert, nil
}