R.B. Boyer b2979f6edf
testing/deployer: rename various terms to better align with v2 and avoid confusion (#19600)
Conceptually renaming the following topology terms to avoid confusion with v2 and to better align with it:

- ServiceID -> ID
- Service -> Workload
- Upstream -> Destination
2023-11-10 13:22:06 -06:00

536 lines
13 KiB
Go

// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: BUSL-1.1
package sprawl
import (
"bufio"
"bytes"
"context"
"fmt"
"io"
"net/http"
"os"
"path/filepath"
"strings"
"time"
"github.com/hashicorp/consul/api"
"github.com/hashicorp/consul/proto-public/pbresource"
"github.com/hashicorp/go-hclog"
"github.com/hashicorp/go-multierror"
"github.com/mitchellh/copystructure"
"google.golang.org/grpc"
"github.com/hashicorp/consul/testing/deployer/sprawl/internal/runner"
"github.com/hashicorp/consul/testing/deployer/sprawl/internal/secrets"
"github.com/hashicorp/consul/testing/deployer/sprawl/internal/tfgen"
"github.com/hashicorp/consul/testing/deployer/topology"
"github.com/hashicorp/consul/testing/deployer/util"
)
// TODO: manage workdir externally without chdir
// Sprawl is the definition of a complete running Consul deployment topology.
type Sprawl struct {
logger hclog.Logger
// set after initial Launch is complete
launchLogger hclog.Logger
runner *runner.Runner
license string
secrets secrets.Store
workdir string
// set during Run
config *topology.Config
topology *topology.Topology
generator *tfgen.Generator
clients map[string]*api.Client // one per cluster
grpcConns map[string]*grpc.ClientConn // one per cluster (when v2 enabled)
grpcConnCancel map[string]func() // one per cluster (when v2 enabled)
}
// Topology allows access to the topology that defines the resources. Do not
// write to any of these fields.
func (s *Sprawl) Topology() *topology.Topology {
return s.topology
}
func (s *Sprawl) Config() *topology.Config {
c2, err := copyConfig(s.config)
if err != nil {
panic(err)
}
return c2
}
// ResourceServiceClientForCluster returns a shared common client that defaults
// to using the management token for this cluster.
func (s *Sprawl) ResourceServiceClientForCluster(clusterName string) pbresource.ResourceServiceClient {
return pbresource.NewResourceServiceClient(s.grpcConns[clusterName])
}
func (s *Sprawl) HTTPClientForCluster(clusterName string) (*http.Client, error) {
cluster, ok := s.topology.Clusters[clusterName]
if !ok {
return nil, fmt.Errorf("no such cluster: %s", clusterName)
}
// grab the local network for the cluster
network, ok := s.topology.Networks[cluster.NetworkName]
if !ok {
return nil, fmt.Errorf("no such network: %s", cluster.NetworkName)
}
transport, err := util.ProxyHTTPTransport(network.ProxyPort)
if err != nil {
return nil, err
}
return &http.Client{Transport: transport}, nil
}
// APIClientForNode gets a pooled api.Client connected to the agent running on
// the provided node.
//
// Passing an empty token will assume the bootstrap token. If you want to
// actually use the anonymous token say "-".
func (s *Sprawl) APIClientForNode(clusterName string, nid topology.NodeID, token string) (*api.Client, error) {
cluster, ok := s.topology.Clusters[clusterName]
if !ok {
return nil, fmt.Errorf("no such cluster: %s", clusterName)
}
nid.Normalize()
node := cluster.NodeByID(nid)
if !node.IsAgent() {
return nil, fmt.Errorf("node is not an agent")
}
switch token {
case "":
token = s.secrets.ReadGeneric(clusterName, secrets.BootstrapToken)
case "-":
token = ""
}
return util.ProxyAPIClient(
node.LocalProxyPort(),
node.LocalAddress(),
8500,
token,
)
}
// APIClientForCluster is a convenience wrapper for APIClientForNode that returns
// an API client for an agent node in the cluster, preferring clients, then servers
func (s *Sprawl) APIClientForCluster(clusterName, token string) (*api.Client, error) {
clu := s.topology.Clusters[clusterName]
// TODO: this always goes to the first client, but we might want to balance this
firstAgent := clu.FirstClient()
if firstAgent == nil {
firstAgent = clu.FirstServer()
}
if firstAgent == nil {
return nil, fmt.Errorf("failed to find agent in cluster %s", clusterName)
}
return s.APIClientForNode(clusterName, firstAgent.ID(), token)
}
func copyConfig(cfg *topology.Config) (*topology.Config, error) {
dup, err := copystructure.Copy(cfg)
if err != nil {
return nil, err
}
return dup.(*topology.Config), nil
}
// Launch will create the topology defined by the provided configuration and
// bring up all of the relevant clusters. Once created the Stop method must be
// called to destroy everything.
func Launch(
logger hclog.Logger,
workdir string,
cfg *topology.Config,
) (*Sprawl, error) {
if logger == nil {
panic("logger is required")
}
if workdir == "" {
panic("workdir is required")
}
if err := os.MkdirAll(filepath.Join(workdir, "terraform"), 0755); err != nil {
return nil, err
}
runner, err := runner.Load(logger)
if err != nil {
return nil, err
}
// Copy this to avoid leakage.
cfg, err = copyConfig(cfg)
if err != nil {
return nil, err
}
s := &Sprawl{
logger: logger,
runner: runner,
workdir: workdir,
clients: make(map[string]*api.Client),
grpcConns: make(map[string]*grpc.ClientConn),
grpcConnCancel: make(map[string]func()),
}
if err := s.ensureLicense(); err != nil {
return nil, err
}
// Copy this AGAIN, BEFORE compiling so we capture the original definition, without denorms.
s.config, err = copyConfig(cfg)
if err != nil {
return nil, err
}
s.topology, err = topology.Compile(logger.Named("compile"), cfg)
if err != nil {
return nil, fmt.Errorf("topology.Compile: %w", err)
}
s.logger.Debug("compiled topology", "ct", jd(s.topology)) // TODO
start := time.Now()
if err := s.launch(); err != nil {
return nil, err
}
s.logger.Info("topology is ready for use", "elapsed", time.Since(start))
if err := s.PrintDetails(); err != nil {
return nil, fmt.Errorf("error gathering diagnostic details: %w", err)
}
s.launchLogger = s.logger
return s, nil
}
func (s *Sprawl) Relaunch(
cfg *topology.Config,
) error {
return s.RelaunchWithPhase(cfg, "")
}
func (s *Sprawl) RelaunchWithPhase(
cfg *topology.Config,
phase string,
) error {
// Copy this BEFORE compiling so we capture the original definition, without denorms.
var err error
s.config, err = copyConfig(cfg)
if err != nil {
return err
}
if phase != "" {
s.logger = s.launchLogger.Named(phase)
}
newTopology, err := topology.Recompile(s.logger.Named("recompile"), cfg, s.topology)
if err != nil {
return fmt.Errorf("topology.Compile: %w", err)
}
s.topology = newTopology
s.logger.Debug("compiled replacement topology", "ct", jd(s.topology)) // TODO
start := time.Now()
if err := s.relaunch(); err != nil {
return err
}
s.logger.Info("topology is ready for use", "elapsed", time.Since(start))
if err := s.PrintDetails(); err != nil {
return fmt.Errorf("error gathering diagnostic details: %w", err)
}
return nil
}
// SnapshotSave saves a snapshot of a cluster and restore with the snapshot
func (s *Sprawl) SnapshotSave(clusterName string) error {
cluster, ok := s.topology.Clusters[clusterName]
if !ok {
return fmt.Errorf("no such cluster: %s", clusterName)
}
var (
client = s.clients[cluster.Name]
)
snapshot := client.Snapshot()
snap, _, err := snapshot.Save(nil)
if err != nil {
return fmt.Errorf("error saving snapshot: %w", err)
}
s.logger.Info("snapshot saved")
time.Sleep(3 * time.Second)
defer snap.Close()
// Restore the snapshot.
if err := snapshot.Restore(nil, snap); err != nil {
return fmt.Errorf("error restoring snapshot: %w", err)
}
s.logger.Info("snapshot restored")
return nil
}
// Leader returns the cluster leader agent, or an error if no leader is
// available.
func (s *Sprawl) Leader(clusterName string) (*topology.Node, error) {
cluster, ok := s.topology.Clusters[clusterName]
if !ok {
return nil, fmt.Errorf("no such cluster: %s", clusterName)
}
var (
client = s.clients[cluster.Name]
// logger = s.logger.With("cluster", cluster.Name)
)
leaderAddr, err := getLeader(client)
if err != nil {
return nil, err
}
for _, node := range cluster.Nodes {
if !node.IsServer() || node.Disabled {
continue
}
if strings.HasPrefix(leaderAddr, node.LocalAddress()+":") {
return node, nil
}
}
return nil, fmt.Errorf("leader not found")
}
// Followers returns the cluster following servers.
func (s *Sprawl) Followers(clusterName string) ([]*topology.Node, error) {
cluster, ok := s.topology.Clusters[clusterName]
if !ok {
return nil, fmt.Errorf("no such cluster: %s", clusterName)
}
leaderNode, err := s.Leader(clusterName)
if err != nil {
return nil, fmt.Errorf("could not determine leader: %w", err)
}
var followers []*topology.Node
for _, node := range cluster.Nodes {
if !node.IsServer() || node.Disabled {
continue
}
if node.ID() != leaderNode.ID() {
followers = append(followers, node)
}
}
return followers, nil
}
func (s *Sprawl) DisabledServers(clusterName string) ([]*topology.Node, error) {
cluster, ok := s.topology.Clusters[clusterName]
if !ok {
return nil, fmt.Errorf("no such cluster: %s", clusterName)
}
var servers []*topology.Node
for _, node := range cluster.Nodes {
if !node.IsServer() || !node.Disabled {
continue
}
servers = append(servers, node)
}
return servers, nil
}
func (s *Sprawl) StopContainer(ctx context.Context, containerName string) error {
return s.runner.DockerExec(ctx, []string{"stop", containerName}, nil, nil)
}
func (s *Sprawl) SnapshotEnvoy(ctx context.Context) error {
snapDir := filepath.Join(s.workdir, "envoy-snapshots")
if err := os.MkdirAll(snapDir, 0755); err != nil {
return fmt.Errorf("could not create envoy snapshot output dir %s: %w", snapDir, err)
}
targets := map[string]string{
"config_dump.json": "config_dump",
"clusters.json": "clusters?format=json",
"stats.txt": "stats",
"stats_prometheus.txt": "stats/prometheus",
}
var merr error
for _, c := range s.topology.Clusters {
client, err := s.HTTPClientForCluster(c.Name)
if err != nil {
return fmt.Errorf("could not get http client for cluster %q: %w", c.Name, err)
}
for _, n := range c.Nodes {
if n.Disabled {
continue
}
for _, wrk := range n.Workloads {
if wrk.Disabled || wrk.EnvoyAdminPort <= 0 {
continue
}
prefix := fmt.Sprintf("http://%s:%d", n.LocalAddress(), wrk.EnvoyAdminPort)
for fn, target := range targets {
u := prefix + "/" + target
body, err := scrapeURL(client, u)
if err != nil {
merr = multierror.Append(merr, fmt.Errorf("could not scrape %q for %s on %s: %w",
target, wrk.ID.String(), n.ID().String(), err,
))
continue
}
outFn := filepath.Join(snapDir, n.DockerName()+"--"+wrk.ID.TFString()+"."+fn)
if err := os.WriteFile(outFn+".tmp", body, 0644); err != nil {
merr = multierror.Append(merr, fmt.Errorf("could not write output %q for %s on %s: %w",
target, wrk.ID.String(), n.ID().String(), err,
))
continue
}
if err := os.Rename(outFn+".tmp", outFn); err != nil {
merr = multierror.Append(merr, fmt.Errorf("could not write output %q for %s on %s: %w",
target, wrk.ID.String(), n.ID().String(), err,
))
continue
}
}
}
}
}
return merr
}
func scrapeURL(client *http.Client, url string) ([]byte, error) {
res, err := client.Get(url)
if err != nil {
return nil, err
}
defer res.Body.Close()
body, err := io.ReadAll(res.Body)
if err != nil {
return nil, err
}
return body, nil
}
func (s *Sprawl) CaptureLogs(ctx context.Context) error {
logDir := filepath.Join(s.workdir, "logs")
if err := os.MkdirAll(logDir, 0755); err != nil {
return fmt.Errorf("could not create log output dir %s: %w", logDir, err)
}
containers, err := s.listContainers(ctx)
if err != nil {
return err
}
s.logger.Debug("Capturing logs")
var merr error
for _, container := range containers {
if err := s.dumpContainerLogs(ctx, container, logDir); err != nil {
merr = multierror.Append(merr, fmt.Errorf("could not dump logs for container %s: %w", container, err))
}
}
return merr
}
// Dump known containers out of terraform state file.
func (s *Sprawl) listContainers(ctx context.Context) ([]string, error) {
tfdir := filepath.Join(s.workdir, "terraform")
var buf bytes.Buffer
if err := s.runner.TerraformExec(ctx, []string{"state", "list"}, &buf, tfdir); err != nil {
return nil, fmt.Errorf("error listing containers in terraform state file: %w", err)
}
var (
scan = bufio.NewScanner(&buf)
containers []string
)
for scan.Scan() {
line := strings.TrimSpace(scan.Text())
name := strings.TrimPrefix(line, "docker_container.")
if name != line {
containers = append(containers, name)
continue
}
}
if err := scan.Err(); err != nil {
return nil, err
}
return containers, nil
}
func (s *Sprawl) dumpContainerLogs(ctx context.Context, containerName, outputRoot string) error {
path := filepath.Join(outputRoot, containerName+".log")
f, err := os.Create(path + ".tmp")
if err != nil {
return err
}
keep := false
defer func() {
_ = f.Close()
if !keep {
_ = os.Remove(path + ".tmp")
_ = os.Remove(path)
}
}()
err = s.runner.DockerExecWithStderr(
ctx,
[]string{"logs", containerName},
f,
f,
nil,
)
if err != nil {
return err
}
if err := f.Close(); err != nil {
return err
}
if err := os.Rename(path+".tmp", path); err != nil {
return err
}
keep = true
return nil
}