// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: BUSL-1.1

package sprawl

import (
	"bufio"
	"bytes"
	"context"
	"crypto/rand"
	"fmt"
	"io"
	"net/http"
	"os"
	"path/filepath"
	"strings"
	"time"

	retry "github.com/avast/retry-go"
	"github.com/mitchellh/copystructure"
	"google.golang.org/grpc"

	"github.com/hashicorp/go-hclog"
	"github.com/hashicorp/go-multierror"

	"github.com/hashicorp/consul/api"
	"github.com/hashicorp/consul/proto-public/pbresource"

	"github.com/hashicorp/consul/testing/deployer/sprawl/internal/runner"
	"github.com/hashicorp/consul/testing/deployer/sprawl/internal/secrets"
	"github.com/hashicorp/consul/testing/deployer/sprawl/internal/tfgen"
	"github.com/hashicorp/consul/testing/deployer/topology"
	"github.com/hashicorp/consul/testing/deployer/util"
)

// TODO: manage workdir externally without chdir

// Sprawl is the definition of a complete running Consul deployment topology.
type Sprawl struct {
	logger hclog.Logger
	// set after initial Launch is complete
	launchLogger hclog.Logger
	runner       *runner.Runner
	license      string
	secrets      secrets.Store

	workdir string

	// set during Run
	config    *topology.Config
	topology  *topology.Topology
	generator *tfgen.Generator

	clients        map[string]*api.Client      // one per cluster
	grpcConns      map[string]*grpc.ClientConn // one per cluster (when v2 enabled)
	grpcConnCancel map[string]func()           // one per cluster (when v2 enabled)
}

const (
	UpgradeTypeStandard  = "standard"
	UpgradeTypeAutopilot = "autopilot"
)

// Topology allows access to the topology that defines the resources. Do not
// write to any of these fields.
func (s *Sprawl) Topology() *topology.Topology {
	return s.topology
}

func (s *Sprawl) Config() *topology.Config {
	c2, err := copyConfig(s.config)
	if err != nil {
		panic(err)
	}
	return c2
}

// ResourceServiceClientForCluster returns a shared common client that defaults
// to using the management token for this cluster.
func (s *Sprawl) ResourceServiceClientForCluster(clusterName string) pbresource.ResourceServiceClient {
	return pbresource.NewResourceServiceClient(s.grpcConns[clusterName])
}

func (s *Sprawl) HTTPClientForCluster(clusterName string) (*http.Client, error) {
	cluster, ok := s.topology.Clusters[clusterName]
	if !ok {
		return nil, fmt.Errorf("no such cluster: %s", clusterName)
	}

	// grab the local network for the cluster
	network, ok := s.topology.Networks[cluster.NetworkName]
	if !ok {
		return nil, fmt.Errorf("no such network: %s", cluster.NetworkName)
	}

	transport, err := util.ProxyHTTPTransport(network.ProxyPort)
	if err != nil {
		return nil, err
	}

	return &http.Client{Transport: transport}, nil
}

// LocalAddressForNode returns the local address for the given node in the cluster.
func (s *Sprawl) LocalAddressForNode(clusterName string, nid topology.NodeID) (string, error) {
	cluster, ok := s.topology.Clusters[clusterName]
	if !ok {
		return "", fmt.Errorf("no such cluster: %s", clusterName)
	}
	node := cluster.NodeByID(nid)
	if !node.IsAgent() {
		return "", fmt.Errorf("node is not an agent")
	}
	return node.LocalAddress(), nil
}

// APIClientForNode gets a pooled api.Client connected to the agent running on
// the provided node.
//
// Passing an empty token will assume the bootstrap token. If you want to
// actually use the anonymous token, say "-".
func (s *Sprawl) APIClientForNode(clusterName string, nid topology.NodeID, token string) (*api.Client, error) {
	cluster, ok := s.topology.Clusters[clusterName]
	if !ok {
		return nil, fmt.Errorf("no such cluster: %s", clusterName)
	}

	nid.Normalize()

	node := cluster.NodeByID(nid)
	if !node.IsAgent() {
		return nil, fmt.Errorf("node is not an agent")
	}

	switch token {
	case "":
		token = s.secrets.ReadGeneric(clusterName, secrets.BootstrapToken)
	case "-":
		token = ""
	}

	return util.ProxyAPIClient(
		node.LocalProxyPort(),
		node.LocalAddress(),
		8500,
		token,
	)
}

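// For illustration only (not part of the package API): the token argument to
// APIClientForNode follows the convention described above. The cluster name
// "dc1", the nid value, and someToken are hypothetical placeholders.
//
//	bootstrapClient, _ := s.APIClientForNode("dc1", nid, "")  // bootstrap/management token
//	anonClient, _ := s.APIClientForNode("dc1", nid, "-")      // anonymous token
//	customClient, _ := s.APIClientForNode("dc1", nid, someToken)
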
// APIClientForCluster is a convenience wrapper for APIClientForNode that returns
// an API client for an agent node in the cluster, preferring clients, then servers.
func (s *Sprawl) APIClientForCluster(clusterName, token string) (*api.Client, error) {
	clu := s.topology.Clusters[clusterName]
	// TODO: this always goes to the first client, but we might want to balance this
	firstAgent := clu.FirstClient("")
	if firstAgent == nil {
		firstAgent = clu.FirstServer()
	}
	if firstAgent == nil {
		return nil, fmt.Errorf("failed to find agent in cluster %s", clusterName)
	}
	return s.APIClientForNode(clusterName, firstAgent.ID(), token)
}

func copyConfig(cfg *topology.Config) (*topology.Config, error) {
	dup, err := copystructure.Copy(cfg)
	if err != nil {
		return nil, err
	}
	return dup.(*topology.Config), nil
}

// Launch will create the topology defined by the provided configuration and
// bring up all of the relevant clusters. Once created, the Stop method must be
// called to destroy everything.
func Launch(
	logger hclog.Logger,
	workdir string,
	cfg *topology.Config,
) (*Sprawl, error) {
	if logger == nil {
		panic("logger is required")
	}
	if workdir == "" {
		panic("workdir is required")
	}

	if err := os.MkdirAll(filepath.Join(workdir, "terraform"), 0755); err != nil {
		return nil, err
	}

	runner, err := runner.Load(logger)
	if err != nil {
		return nil, err
	}

	// Copy this to avoid leakage.
	cfg, err = copyConfig(cfg)
	if err != nil {
		return nil, err
	}

	s := &Sprawl{
		logger:         logger,
		runner:         runner,
		workdir:        workdir,
		clients:        make(map[string]*api.Client),
		grpcConns:      make(map[string]*grpc.ClientConn),
		grpcConnCancel: make(map[string]func()),
	}

	if err := s.ensureLicense(); err != nil {
		return nil, err
	}

	// Copy this AGAIN, BEFORE compiling, so we capture the original definition, without denorms.
	s.config, err = copyConfig(cfg)
	if err != nil {
		return nil, err
	}

	s.topology, err = topology.Compile(logger.Named("compile"), cfg)
	if err != nil {
		return nil, fmt.Errorf("topology.Compile: %w", err)
	}

	s.logger.Debug("compiled topology", "ct", jd(s.topology)) // TODO

	start := time.Now()
	if err := s.launch(); err != nil {
		return nil, err
	}
	s.logger.Info("topology is ready for use", "elapsed", time.Since(start))

	if err := s.PrintDetails(); err != nil {
		return nil, fmt.Errorf("error gathering diagnostic details: %w", err)
	}

	s.launchLogger = s.logger

	return s, nil
}

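// For illustration only (not part of the package API): a caller such as a test
// harness might drive Launch roughly as follows. The logger options, workdir
// path, cluster name "dc1", and the cfg value are hypothetical; Stop is the
// teardown method referenced in the Launch doc comment above.
//
//	logger := hclog.New(&hclog.LoggerOptions{Name: "sprawl"})
//	sp, err := sprawl.Launch(logger, "/tmp/sprawl-workdir", cfg)
//	if err != nil {
//		// handle launch failure
//	}
//	defer sp.Stop()
//
//	client, err := sp.APIClientForCluster("dc1", "")
//	// ... use the Consul api.Client ...
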
func (s *Sprawl) Relaunch(
	cfg *topology.Config,
) error {
	return s.RelaunchWithPhase(cfg, LaunchPhaseRegular)
}

// Upgrade upgrades the cluster to the targetImages version.
// Parameters:
// - clusterName: the cluster to upgrade
// - upgradeType: the type of upgrade, standard or autopilot
// - targetImages: the target version to upgrade to
// - newServersInTopology: the indexes of the new server nodes to join; used by autopilot upgrades only
// - validationFunc: the validation function to run during the upgrade
func (s *Sprawl) Upgrade(
	cfg *topology.Config,
	clusterName string,
	upgradeType string,
	targetImages topology.Images,
	newServersInTopology []int,
	validationFunc func() error,
) error {
	cluster := cfg.Cluster(clusterName)
	if cluster == nil {
		return fmt.Errorf("cluster %s not found in topology", clusterName)
	}

	leader, err := s.Leader(cluster.Name)
	if err != nil {
		return fmt.Errorf("error getting leader: %w", err)
	}
	s.logger.Info("Upgrade cluster", "cluster", cluster.Name, "type", upgradeType, "leader", leader.Name)

	switch upgradeType {
	case UpgradeTypeAutopilot:
		err = s.autopilotUpgrade(cfg, cluster, newServersInTopology, validationFunc)
	case UpgradeTypeStandard:
		err = s.standardUpgrade(cluster, targetImages, validationFunc)
	default:
		err = fmt.Errorf("unsupported upgrade type %s", upgradeType)
	}
	if err != nil {
		return fmt.Errorf("error upgrading cluster: %w", err)
	}

	s.logger.Info("After upgrade", "server_nodes", cluster.ServerNodes())
	return nil
}

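// For illustration only (not part of the package API): a standard (rolling)
// upgrade might be invoked roughly as follows. The cluster name "dc1" and the
// targetImages value are hypothetical placeholders; newServersInTopology is
// nil because it only applies to autopilot upgrades.
//
//	// targetImages is a topology.Images value describing the Consul version
//	// to upgrade to, constructed by the caller.
//	err := sp.Upgrade(cfg, "dc1", UpgradeTypeStandard, targetImages, nil, func() error {
//		// optional validation run after each agent is upgraded
//		return nil
//	})
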
// standardUpgrade upgrades the server and client agents in the cluster to the
// targetImages, one node at a time.
func (s *Sprawl) standardUpgrade(cluster *topology.Cluster,
	targetImages topology.Images, validationFunc func() error) error {
	upgradeFn := func(nodeID topology.NodeID) error {
		cfgUpgrade := s.Config()
		clusterCopy := cfgUpgrade.Cluster(cluster.Name)

		// update the node's image to the target version
		node := clusterCopy.NodeByID(nodeID)
		node.Images = targetImages
		s.logger.Info("Upgrading", "node", nodeID.Name, "to_version", node.Images)
		err := s.RelaunchWithPhase(cfgUpgrade, LaunchPhaseUpgrade)
		if err != nil {
			return fmt.Errorf("error relaunching for upgrade: %w", err)
		}
		s.logger.Info("Relaunch completed", "node", node.Name)
		return nil
	}

	s.logger.Info("Upgrade to", "version", targetImages)

	// upgrade servers one at a time
	for _, node := range cluster.Nodes {
		if node.Kind != topology.NodeKindServer {
			s.logger.Info("Skip non-server node", "node", node.Name)
			continue
		}
		if err := upgradeFn(node.ID()); err != nil {
			return fmt.Errorf("error upgrading node %s: %w", node.Name, err)
		}

		// run the validation function after upgrading each server agent
		if validationFunc != nil {
			if err := validationFunc(); err != nil {
				return fmt.Errorf("error validating cluster: %w", err)
			}
		}
	}

	// upgrade client agents one at a time
	for _, node := range cluster.Nodes {
		if node.Kind != topology.NodeKindClient {
			s.logger.Info("Skip non-client node", "node", node.Name)
			continue
		}
		if err := upgradeFn(node.ID()); err != nil {
			return fmt.Errorf("error upgrading node %s: %w", node.Name, err)
		}

		// run the validation function after upgrading each client agent
		if validationFunc != nil {
			if err := validationFunc(); err != nil {
				return fmt.Errorf("error validating cluster: %w", err)
			}
		}
	}

	return nil
}

// autopilotUpgrade upgrades server agents by joining new servers running a
// higher version. After the upgrade completes, the number of server agents
// is doubled.
func (s *Sprawl) autopilotUpgrade(cfg *topology.Config, cluster *topology.Cluster, newServersInTopology []int, validationFunc func() error) error {
	leader, err := s.Leader(cluster.Name)
	if err != nil {
		return fmt.Errorf("error getting leader: %w", err)
	}

	// sanity check for autopilot upgrade
	if len(newServersInTopology) < len(cluster.ServerNodes()) {
		return fmt.Errorf("insufficient new nodes for autopilot upgrade, expect %d, got %d",
			len(cluster.ServerNodes()), len(newServersInTopology))
	}

	for _, nodeIdx := range newServersInTopology {
		node := cluster.Nodes[nodeIdx]
		if node.Kind != topology.NodeKindServer {
			return fmt.Errorf("node %s kind is not server", node.Name)
		}

		if !node.Disabled {
			return fmt.Errorf("node %s is already enabled", node.Name)
		}

		node.Disabled = false
		node.IsNewServer = true

		s.logger.Info("Joining new server", "node", node.Name)
	}

	err = s.RelaunchWithPhase(cfg, LaunchPhaseUpgrade)
	if err != nil {
		return fmt.Errorf("error relaunching for upgrade: %w", err)
	}
	s.logger.Info("Relaunch completed for autopilot upgrade")

	// Verify that leadership has transferred away from the old leader.
	s.logger.Info("Waiting for leader transfer")
	time.Sleep(20 * time.Second)
	err = retry.Do(
		func() error {
			newLeader, err := s.Leader(cluster.Name)
			if err != nil {
				return fmt.Errorf("error getting new leader: %w", err)
			}
			s.logger.Info("New leader", "addr", newLeader)

			if newLeader.Name == leader.Name {
				return fmt.Errorf("waiting for leader transfer")
			}

			return nil
		},
		retry.MaxDelay(5*time.Second),
		retry.Attempts(20),
	)
	if err != nil {
		return fmt.Errorf("leader transfer failed: %w", err)
	}

	// The new nodes have joined the cluster, so clear the new-server flag.
	for _, node := range cluster.Nodes {
		node.IsNewServer = false
	}

	// Run the validation function, if provided.
	if validationFunc != nil {
		if err := validationFunc(); err != nil {
			return fmt.Errorf("error validating cluster: %w", err)
		}
	}

	return nil
}

// RelaunchWithPhase relaunches the topology with the given phase
// and waits for the cluster to be ready (i.e., leadership is established).
func (s *Sprawl) RelaunchWithPhase(
	cfg *topology.Config,
	launchPhase LaunchPhase,
) error {
	// Copy this BEFORE compiling so we capture the original definition, without denorms.
	var err error
	s.config, err = copyConfig(cfg)
	if err != nil {
		return err
	}

	s.logger = s.launchLogger.Named(launchPhase.String())

	newTopology, err := topology.Recompile(s.logger.Named("recompile"), cfg, s.topology)
	if err != nil {
		return fmt.Errorf("topology.Recompile: %w", err)
	}

	s.topology = newTopology

	s.logger.Debug("compiled replacement topology", "ct", jd(s.topology)) // TODO

	start := time.Now()
	if err := s.relaunch(launchPhase); err != nil {
		return err
	}
	s.logger.Info("topology is ready for use", "elapsed", time.Since(start))

	if err := s.PrintDetails(); err != nil {
		return fmt.Errorf("error gathering diagnostic details: %w", err)
	}

	return nil
}

// SnapshotSaveAndRestore saves a snapshot of a cluster and then restores the snapshot.
func (s *Sprawl) SnapshotSaveAndRestore(clusterName string) error {
	cluster, ok := s.topology.Clusters[clusterName]
	if !ok {
		return fmt.Errorf("no such cluster: %s", clusterName)
	}
	var (
		client = s.clients[cluster.Name]
	)
	snapshot := client.Snapshot()
	snap, _, err := snapshot.Save(nil)
	if err != nil {
		return fmt.Errorf("error saving snapshot: %w", err)
	}
	s.logger.Info("snapshot saved")
	time.Sleep(3 * time.Second)
	defer snap.Close()

	// Restore the snapshot.
	if err := snapshot.Restore(nil, snap); err != nil {
		return fmt.Errorf("error restoring snapshot: %w", err)
	}
	s.logger.Info("snapshot restored")
	return nil
}

func (s *Sprawl) GetKV(cluster string, key string, queryOpts *api.QueryOptions) ([]byte, error) {
	client := s.clients[cluster]
	kvClient := client.KV()

	data, _, err := kvClient.Get(key, queryOpts)
	if err != nil {
		return nil, fmt.Errorf("error getting key: %w", err)
	}
	if data == nil {
		return nil, fmt.Errorf("key not found: %s", key)
	}
	return data.Value, nil
}

func (s *Sprawl) LoadKVDataToCluster(cluster string, numberOfKeys int, writeOpts *api.WriteOptions) error {
	client := s.clients[cluster]
	kvClient := client.KV()

	for i := 0; i <= numberOfKeys; i++ {
		p := &api.KVPair{
			Key: fmt.Sprintf("key-%d", i),
		}
		value := make([]byte, 131072) // 128KiB value
		if _, err := rand.Read(value); err != nil {
			return fmt.Errorf("error generating random value: %w", err)
		}
		p.Value = value
		_, err := kvClient.Put(p, writeOpts)
		if err != nil {
			return fmt.Errorf("error writing kv: %w", err)
		}
	}
	return nil
}

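// For illustration only (not part of the package API): LoadKVDataToCluster and
// GetKV can be combined to round-trip data through a cluster's KV store. The
// cluster name "dc1" is a hypothetical placeholder; keys are named "key-<n>"
// as written by LoadKVDataToCluster above.
//
//	if err := sp.LoadKVDataToCluster("dc1", 10, nil); err != nil {
//		// handle write failure
//	}
//	val, err := sp.GetKV("dc1", "key-3", nil)
//	if err != nil {
//		// handle read failure
//	}
//	_ = val // 128KiB of random bytes
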
// Leader returns the cluster leader agent, or an error if no leader is
// available.
func (s *Sprawl) Leader(clusterName string) (*topology.Node, error) {
	cluster, ok := s.topology.Clusters[clusterName]
	if !ok {
		return nil, fmt.Errorf("no such cluster: %s", clusterName)
	}

	var (
		client = s.clients[cluster.Name]
		// logger = s.logger.With("cluster", cluster.Name)
	)

	leaderAddr, err := getLeader(client)
	if err != nil {
		return nil, err
	}

	for _, node := range cluster.Nodes {
		if !node.IsServer() || node.Disabled {
			continue
		}
		if strings.HasPrefix(leaderAddr, node.LocalAddress()+":") {
			return node, nil
		}
	}

	return nil, fmt.Errorf("leader not found")
}

// Followers returns the cluster's follower servers.
func (s *Sprawl) Followers(clusterName string) ([]*topology.Node, error) {
	cluster, ok := s.topology.Clusters[clusterName]
	if !ok {
		return nil, fmt.Errorf("no such cluster: %s", clusterName)
	}

	leaderNode, err := s.Leader(clusterName)
	if err != nil {
		return nil, fmt.Errorf("could not determine leader: %w", err)
	}

	var followers []*topology.Node

	for _, node := range cluster.Nodes {
		if !node.IsServer() || node.Disabled {
			continue
		}
		if node.ID() != leaderNode.ID() {
			followers = append(followers, node)
		}
	}

	return followers, nil
}

func (s *Sprawl) DisabledServers(clusterName string) ([]*topology.Node, error) {
	cluster, ok := s.topology.Clusters[clusterName]
	if !ok {
		return nil, fmt.Errorf("no such cluster: %s", clusterName)
	}

	var servers []*topology.Node

	for _, node := range cluster.Nodes {
		if !node.IsServer() || !node.Disabled {
			continue
		}
		servers = append(servers, node)
	}

	return servers, nil
}

func (s *Sprawl) StopContainer(ctx context.Context, containerName string) error {
	return s.runner.DockerExec(ctx, []string{"stop", containerName}, nil, nil)
}

func (s *Sprawl) SnapshotEnvoy(ctx context.Context) error {
	snapDir := filepath.Join(s.workdir, "envoy-snapshots")
	if err := os.MkdirAll(snapDir, 0755); err != nil {
		return fmt.Errorf("could not create envoy snapshot output dir %s: %w", snapDir, err)
	}

	targets := map[string]string{
		"config_dump.json":     "config_dump",
		"clusters.json":        "clusters?format=json",
		"stats.txt":            "stats",
		"stats_prometheus.txt": "stats/prometheus",
	}

	var merr error
	for _, c := range s.topology.Clusters {
		client, err := s.HTTPClientForCluster(c.Name)
		if err != nil {
			return fmt.Errorf("could not get http client for cluster %q: %w", c.Name, err)
		}

		for _, n := range c.Nodes {
			if n.Disabled {
				continue
			}
			for _, wrk := range n.Workloads {
				if wrk.Disabled || wrk.EnvoyAdminPort <= 0 {
					continue
				}
				prefix := fmt.Sprintf("http://%s:%d", n.LocalAddress(), wrk.EnvoyAdminPort)

				for fn, target := range targets {
					u := prefix + "/" + target

					body, err := scrapeURL(client, u)
					if err != nil {
						merr = multierror.Append(merr, fmt.Errorf("could not scrape %q for %s on %s: %w",
							target, wrk.ID.String(), n.ID().String(), err,
						))
						continue
					}

					outFn := filepath.Join(snapDir, n.DockerName()+"--"+wrk.ID.TFString()+"."+fn)

					if err := os.WriteFile(outFn+".tmp", body, 0644); err != nil {
						merr = multierror.Append(merr, fmt.Errorf("could not write output %q for %s on %s: %w",
							target, wrk.ID.String(), n.ID().String(), err,
						))
						continue
					}

					if err := os.Rename(outFn+".tmp", outFn); err != nil {
						merr = multierror.Append(merr, fmt.Errorf("could not write output %q for %s on %s: %w",
							target, wrk.ID.String(), n.ID().String(), err,
						))
						continue
					}
				}
			}
		}
	}
	return merr
}

func scrapeURL(client *http.Client, url string) ([]byte, error) {
	res, err := client.Get(url)
	if err != nil {
		return nil, err
	}
	defer res.Body.Close()

	body, err := io.ReadAll(res.Body)
	if err != nil {
		return nil, err
	}
	return body, nil
}

func (s *Sprawl) CaptureLogs(ctx context.Context) error {
	logDir := filepath.Join(s.workdir, "logs")
	if err := os.MkdirAll(logDir, 0755); err != nil {
		return fmt.Errorf("could not create log output dir %s: %w", logDir, err)
	}

	containers, err := s.listContainers(ctx)
	if err != nil {
		return err
	}

	s.logger.Debug("Capturing logs")

	var merr error
	for _, container := range containers {
		if err := s.dumpContainerLogs(ctx, container, logDir); err != nil {
			merr = multierror.Append(merr, fmt.Errorf("could not dump logs for container %s: %w", container, err))
		}
	}

	return merr
}

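// For illustration only (not part of the package API): diagnostic capture is
// typically done at teardown time, after the topology has been exercised.
//
//	ctx := context.Background()
//	if err := sp.CaptureLogs(ctx); err != nil {
//		// handle log capture failure
//	}
//	if err := sp.SnapshotEnvoy(ctx); err != nil {
//		// handle envoy snapshot failure
//	}
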
// listContainers dumps the known containers out of the terraform state file.
func (s *Sprawl) listContainers(ctx context.Context) ([]string, error) {
	tfdir := filepath.Join(s.workdir, "terraform")

	var buf bytes.Buffer
	if err := s.runner.TerraformExec(ctx, []string{"state", "list"}, &buf, tfdir); err != nil {
		return nil, fmt.Errorf("error listing containers in terraform state file: %w", err)
	}

	var (
		scan       = bufio.NewScanner(&buf)
		containers []string
	)
	for scan.Scan() {
		line := strings.TrimSpace(scan.Text())

		name := strings.TrimPrefix(line, "docker_container.")
		if name != line {
			containers = append(containers, name)
			continue
		}
	}
	if err := scan.Err(); err != nil {
		return nil, err
	}

	return containers, nil
}

func (s *Sprawl) dumpContainerLogs(ctx context.Context, containerName, outputRoot string) error {
	path := filepath.Join(outputRoot, containerName+".log")

	f, err := os.Create(path + ".tmp")
	if err != nil {
		return err
	}
	keep := false
	defer func() {
		_ = f.Close()
		if !keep {
			_ = os.Remove(path + ".tmp")
			_ = os.Remove(path)
		}
	}()

	err = s.runner.DockerExecWithStderr(
		ctx,
		[]string{"logs", containerName},
		f,
		f,
		nil,
	)
	if err != nil {
		return err
	}

	if err := f.Close(); err != nil {
		return err
	}

	if err := os.Rename(path+".tmp", path); err != nil {
		return err
	}

	keep = true
	return nil
}