consul/sdk/testutil/server.go

// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0

package testutil

// TestServer is a test helper. It uses a fork/exec model to create
// a test Consul server instance in the background and initialize it
// with some data and/or services. The test server can then be used
// to run a unit test, and offers an easy API to tear itself down
// when the test has completed. The only prerequisite is to have a consul
// binary available on the $PATH.
//
// This package does not use Consul's official API client. This is
// because we use TestServer to test the API client, which would
// otherwise cause an import cycle.

import (
	"bytes"
	"context"
	"encoding/json"
	"fmt"
	"io"
	"net"
	"net/http"
	"os"
	"os/exec"
	"path/filepath"
	"runtime"
	"strings"
	"syscall"
	"testing"
	"time"

	"github.com/hashicorp/go-cleanhttp"
	"github.com/hashicorp/go-uuid"
	"github.com/hashicorp/go-version"
	"github.com/pkg/errors"

	"github.com/hashicorp/consul/sdk/freeport"
	"github.com/hashicorp/consul/sdk/testutil/retry"
)

// TestPerformanceConfig configures the performance parameters.
//
// TestServerConfig serializes it under the "performance" key.
// RaftMultiplier is written as "raft_multiplier" and is left out of the
// JSON when it is zero; defaultServerConfig sets it to 1.
type TestPerformanceConfig struct {
	RaftMultiplier uint `json:"raft_multiplier,omitempty"`
}

// TestPortConfig configures the various ports used for services
// provided by the Consul server.
//
// Every field is tagged omitempty, so a port left at zero is not written
// to the generated config. defaultServerConfig fills DNS, HTTP, HTTPS,
// SerfLan, SerfWan, Server and GRPC from freeport, and sets GRPCTLS only
// when the detected Consul version is 1.14 or newer. ProxyMinPort and
// ProxyMaxPort are not set by the defaults.
type TestPortConfig struct {
	DNS          int `json:"dns,omitempty"`
	HTTP         int `json:"http,omitempty"`
	HTTPS        int `json:"https,omitempty"`
	SerfLan      int `json:"serf_lan,omitempty"`
	SerfWan      int `json:"serf_wan,omitempty"`
	Server       int `json:"server,omitempty"`
	GRPC         int `json:"grpc,omitempty"`
	GRPCTLS      int `json:"grpc_tls,omitempty"`
	ProxyMinPort int `json:"proxy_min_port,omitempty"`
	ProxyMaxPort int `json:"proxy_max_port,omitempty"`
}

// TestAddressConfig contains the bind addresses for various
// components of the Consul server.
//
// When HTTP starts with "unix://", NewTestServerConfigT uses it as the
// server's HTTP address and builds an HTTP client that dials that unix
// socket instead of 127.0.0.1:<http port>.
type TestAddressConfig struct {
	HTTP string `json:"http,omitempty"`
}

// TestNetworkSegment contains the configuration for a network segment.
//
// None of the fields use omitempty, so all four keys are always written
// for every segment listed in TestServerConfig.Segments.
type TestNetworkSegment struct {
	Name      string `json:"name"`
	Bind      string `json:"bind"`
	Port      int    `json:"port"`
	Advertise string `json:"advertise"`
}

// TestAuditConfig contains the configuration for Audit.
//
// TestServerConfig serializes it under the "audit" key; Enabled is left
// out of the JSON when false.
type TestAuditConfig struct {
	Enabled bool `json:"enabled,omitempty"`
}

// Locality is used as the TestServerConfig's Locality.
//
// It is serialized under the "locality" key of the server config. Both
// Region and Zone are always written, even when empty.
type Locality struct {
	Region string `json:"region"`
	Zone   string `json:"zone"`
}

// TestAutopilotConfig contains the configuration for autopilot.
//
// ServerStabilizationTime is passed through as a string under
// "server_stabilization_time" and omitted when empty. The doc comment on
// WaitForVoting suggests lowering it to make a server become a voter
// sooner.
type TestAutopilotConfig struct {
	ServerStabilizationTime string `json:"server_stabilization_time,omitempty"`
}

// TestServerConfig is the main server configuration struct.
//
// NewTestServerConfigT marshals it to JSON and writes the result to the
// config.json file handed to `consul agent -config-file`. Fields tagged
// `json:"-"` are not part of that file and only steer the test harness:
//
//   - Stdout and Stderr are attached to the spawned consul process (and to
//     the `consul snapshot save` command run by Stop).
//   - Args is appended to the `consul agent` command line.
//   - StopTimeout bounds how long Stop waits for the process to exit after
//     the initial signal.
//   - ReadyTimeout and ReturnPorts are not read anywhere in this file.
//     NOTE(review): presumably consumed by callers elsewhere; confirm.
//
// Segments, DisableCheckpoint and SkipLeaveOnInt have no omitempty option
// and are therefore always written, even at their zero value.
type TestServerConfig struct {
	NodeName            string                 `json:"node_name"`
	NodeID              string                 `json:"node_id"`
	NodeMeta            map[string]string      `json:"node_meta,omitempty"`
	NodeLocality        *Locality              `json:"locality,omitempty"`
	Performance         *TestPerformanceConfig `json:"performance,omitempty"`
	Bootstrap           bool                   `json:"bootstrap,omitempty"`
	Server              bool                   `json:"server,omitempty"`
	Partition           string                 `json:"partition,omitempty"`
	RetryJoin           []string               `json:"retry_join,omitempty"`
	DataDir             string                 `json:"data_dir,omitempty"`
	Datacenter          string                 `json:"datacenter,omitempty"`
	Segments            []TestNetworkSegment   `json:"segments"`
	DisableCheckpoint   bool                   `json:"disable_update_check"`
	LogLevel            string                 `json:"log_level,omitempty"`
	Bind                string                 `json:"bind_addr,omitempty"`
	Addresses           *TestAddressConfig     `json:"addresses,omitempty"`
	Ports               *TestPortConfig        `json:"ports,omitempty"`
	RaftProtocol        int                    `json:"raft_protocol,omitempty"`
	ACLDatacenter       string                 `json:"acl_datacenter,omitempty"`
	PrimaryDatacenter   string                 `json:"primary_datacenter,omitempty"`
	ACLDefaultPolicy    string                 `json:"acl_default_policy,omitempty"`
	ACL                 TestACLs               `json:"acl,omitempty"`
	Encrypt             string                 `json:"encrypt,omitempty"`
	CAFile              string                 `json:"ca_file,omitempty"`
	CertFile            string                 `json:"cert_file,omitempty"`
	KeyFile             string                 `json:"key_file,omitempty"`
	VerifyIncoming      bool                   `json:"verify_incoming,omitempty"`
	VerifyIncomingRPC   bool                   `json:"verify_incoming_rpc,omitempty"`
	VerifyIncomingHTTPS bool                   `json:"verify_incoming_https,omitempty"`
	VerifyOutgoing      bool                   `json:"verify_outgoing,omitempty"`
	EnableScriptChecks  bool                   `json:"enable_script_checks,omitempty"`
	Connect             map[string]interface{} `json:"connect,omitempty"`
	EnableDebug         bool                   `json:"enable_debug,omitempty"`
	SkipLeaveOnInt      bool                   `json:"skip_leave_on_interrupt"`
	Peering             *TestPeeringConfig     `json:"peering,omitempty"`
	Autopilot           *TestAutopilotConfig   `json:"autopilot,omitempty"`
	ReadyTimeout        time.Duration          `json:"-"`
	StopTimeout         time.Duration          `json:"-"`
	Stdout              io.Writer              `json:"-"`
	Stderr              io.Writer              `json:"-"`
	Args                []string               `json:"-"`
	ReturnPorts         func()                 `json:"-"`
	Audit               *TestAuditConfig       `json:"audit,omitempty"`
	Version             string                 `json:"version,omitempty"`
	Experiments         []string               `json:"experiments,omitempty"`
}

// TestACLs is the ACL section of TestServerConfig, serialized under the
// "acl" key. All fields are tagged omitempty; TTL values are carried as
// plain strings. Tokens holds the ACL tokens given to the agent, and
// Tokens.InitialManagement is also what the privileged HTTP helpers in this
// file send as the x-consul-token header.
type TestACLs struct {
	Enabled             bool       `json:"enabled,omitempty"`
	TokenReplication    bool       `json:"enable_token_replication,omitempty"`
	PolicyTTL           string     `json:"policy_ttl,omitempty"`
	TokenTTL            string     `json:"token_ttl,omitempty"`
	DownPolicy          string     `json:"down_policy,omitempty"`
	DefaultPolicy       string     `json:"default_policy,omitempty"`
	EnableKeyListPolicy bool       `json:"enable_key_list_policy,omitempty"`
	Tokens              TestTokens `json:"tokens,omitempty"`
	DisabledTTL         string     `json:"disabled_ttl,omitempty"`
}

// TestTokens holds the ACL tokens serialized under "acl.tokens" in the
// generated config. Each token is omitted from the JSON when empty.
type TestTokens struct {
	Replication string `json:"replication,omitempty"`
	Default     string `json:"default,omitempty"`
	Agent       string `json:"agent,omitempty"`

	// Note: this field is marshaled as master for compatibility with
	// versions of Consul prior to 1.11.
	InitialManagement string `json:"master,omitempty"`

	// Note: this field is marshaled as agent_master for compatibility with
	// versions of Consul prior to 1.11.
	AgentRecovery string `json:"agent_master,omitempty"`
}

// TestPeeringConfig is the "peering" section of TestServerConfig.
// defaultServerConfig enables it; Enabled is omitted from the JSON when
// false.
type TestPeeringConfig struct {
	Enabled bool `json:"enabled,omitempty"`
}

// ServerConfigCallback is a function interface which can be
// passed to NewTestServerConfigT to modify the server config.
//
// The callback receives the config after the defaults and the data
// directory have been filled in and before it is marshaled to JSON, so any
// field it changes ends up in the config file given to the agent.
type ServerConfigCallback func(c *TestServerConfig)

// defaultServerConfig returns a new TestServerConfig populated with the
// defaults used for test servers: a single bootstrapping server bound to
// 127.0.0.1, debug logging, Connect and peering enabled, and listen ports
// taken from freeport.
//
// The node ID comes from the TEST_NODE_ID environment variable when set and
// is otherwise a freshly generated UUID; the function panics if UUID
// generation fails. Process output is directed to a log buffer created with
// NewLogBuffer(t).
//
// consulVersion may be nil. When it is provided, it is recorded in
// conf.Version and used for version-specific settings.
func defaultServerConfig(t TestingTB, consulVersion *version.Version) *TestServerConfig {
	var nodeID string
	var err error

	if id, ok := os.LookupEnv("TEST_NODE_ID"); ok {
		nodeID = id
	} else {
		nodeID, err = uuid.GenerateUUID()
		if err != nil {
			panic(err)
		}
	}

	ports := freeport.GetN(t, 7)

	logBuffer := NewLogBuffer(t)

	conf := &TestServerConfig{
		NodeName:          "node-" + nodeID,
		NodeID:            nodeID,
		DisableCheckpoint: true,
		Performance: &TestPerformanceConfig{
			RaftMultiplier: 1,
		},
		Bootstrap: true,
		Server:    true,
		LogLevel:  "debug",
		Bind:      "127.0.0.1",
		Addresses: &TestAddressConfig{},
		Ports: &TestPortConfig{
			DNS:     ports[0],
			HTTP:    ports[1],
			HTTPS:   ports[2],
			SerfLan: ports[3],
			SerfWan: ports[4],
			Server:  ports[5],
			GRPC:    ports[6],
		},
		ReadyTimeout:   10 * time.Second,
		StopTimeout:    10 * time.Second,
		SkipLeaveOnInt: true,
		Connect: map[string]interface{}{
			"enabled": true,
			"ca_config": map[string]interface{}{
				// const TestClusterID causes import cycle so hard code it here.
				"cluster_id": "11111111-2222-3333-4444-555555555555",
			},
		},
		Stdout:  logBuffer,
		Stderr:  logBuffer,
		Peering: &TestPeeringConfig{Enabled: true},
	}

	// Add version-specific tweaks. Everything that dereferences
	// consulVersion lives inside this guard so a nil version cannot panic.
	if consulVersion != nil {
		conf.Version = consulVersion.String()

		// The GRPC TLS port did not exist prior to Consul 1.14
		// Including it will cause issues in older installations.
		if consulVersion.GreaterThanOrEqual(version.Must(version.NewVersion("1.14"))) {
			conf.Ports.GRPCTLS = freeport.GetOne(t)
		}
	}

	return conf
}

// TestService is used to serialize a service definition.
//
// The JSON tags give no explicit key, so each field is encoded under its Go
// field name (ID, Name, Tags, Address, Port) and omitted when empty.
type TestService struct {
	ID      string   `json:",omitempty"`
	Name    string   `json:",omitempty"`
	Tags    []string `json:",omitempty"`
	Address string   `json:",omitempty"`
	Port    int      `json:",omitempty"`
}

// TestCheck is used to serialize a check definition.
//
// The JSON tags give no explicit key, so each field is encoded under its Go
// field name (ID, Name, ServiceID, TTL) and omitted when empty.
type TestCheck struct {
	ID        string `json:",omitempty"`
	Name      string `json:",omitempty"`
	ServiceID string `json:",omitempty"`
	TTL       string `json:",omitempty"`
}

// TestKVResponse is what we use to decode KV data.
//
// Only the Value field of a KV entry is captured; it is kept as the raw
// string found in the JSON.
// NOTE(review): presumably base64-encoded as returned by the KV HTTP API —
// confirm against the code that decodes it.
type TestKVResponse struct {
	Value string
}

// TestServer is the main server wrapper struct.
//
// NewTestServerConfigT fills it in: cmd is the running `consul agent`
// process, Config is the configuration the agent was started with, and
// tmpdir is the directory holding its config file and data dir (removed by
// Stop unless cleanup is disabled).
//
// The *Addr fields are "127.0.0.1:<port>" strings built from Config.Ports,
// except that HTTPAddr holds the "unix://..." address when
// Config.Addresses.HTTP is a unix socket. HTTPClient is the client used for
// the HTTP requests made by this type's helpers.
type TestServer struct {
	cmd    *exec.Cmd
	Config *TestServerConfig

	HTTPAddr    string
	HTTPSAddr   string
	LANAddr     string
	WANAddr     string
	ServerAddr  string
	GRPCAddr    string
	GRPCTLSAddr string

	HTTPClient *http.Client

	tmpdir string
}

// NewTestServerConfigT creates a new TestServer, and makes a call to an optional
// callback function to modify the configuration. If there is an error
// configuring or starting the server, the server will NOT be running when the
// function returns (thus you do not need to stop it).
// This function will call the `consul` binary found on $PATH.
//
// The working directory is taken from the TEST_TMP_DIR environment variable
// when set (created if missing) and is otherwise a fresh temporary directory
// named after the test. The config is written to <dir>/config.json, the data
// dir is <dir>/data, and the agent is started with
// `consul agent -config-file <dir>/config.json` plus cfg.Args.
//
// It returns the running server once its HTTP API answers, or an error if
// the binary is missing, its version cannot be determined, the config cannot
// be written, the process cannot be started, or the API never comes up.
func NewTestServerConfigT(t TestingTB, cb ServerConfigCallback) (*TestServer, error) {
	path, err := exec.LookPath("consul")
	if err != nil || path == "" {
		return nil, fmt.Errorf("consul not found on $PATH - download and install " +
			"consul or skip this test")
	}

	// Determine the binary's version before touching the filesystem so that
	// a failure here cannot leave a temporary directory behind.
	consulVersion, err := findConsulVersion()
	if err != nil {
		return nil, err
	}

	var tmpdir string

	if dir, ok := os.LookupEnv("TEST_TMP_DIR"); ok {
		// NOTE(CTIA): using TEST_TMP_DIR may cause conflict when NewTestServerConfigT
		// is called > 1 since two agent will uses the same directory
		tmpdir = dir
		if _, err := os.Stat(tmpdir); os.IsNotExist(err) {
			if err = os.Mkdir(tmpdir, 0750); err != nil {
				return nil, errors.Wrap(err, "failed to create tempdir from env TEST_TMP_DIR")
			}
		} else {
			t.Logf("WARNING: using tempdir that already exists %s", tmpdir)
		}
	} else {
		prefix := "consul"
		if t != nil {
			// Use test name for tmpdir if available
			prefix = strings.Replace(t.Name(), "/", "_", -1)
		}
		tmpdir, err = os.MkdirTemp("", prefix)
		if err != nil {
			return nil, errors.Wrap(err, "failed to create tempdir")
		}
	}

	datadir := filepath.Join(tmpdir, "data")
	if _, err := os.Stat(datadir); !os.IsNotExist(err) {
		t.Logf("WARNING: using a data that already exists %s", datadir)
	}
	cfg := defaultServerConfig(t, consulVersion)
	cfg.DataDir = datadir
	if cb != nil {
		cb(cfg)
	}

	b, err := json.Marshal(cfg)
	if err != nil {
		os.RemoveAll(tmpdir)
		return nil, errors.Wrap(err, "failed marshaling json")
	}

	t.Logf("CONFIG JSON: %s", string(b))
	configFile := filepath.Join(tmpdir, "config.json")
	if err := os.WriteFile(configFile, b, 0644); err != nil {
		os.RemoveAll(tmpdir)
		return nil, errors.Wrap(err, "failed writing config content")
	}

	// Start the server
	args := []string{"agent", "-config-file", configFile}
	args = append(args, cfg.Args...)
	t.Logf("test cmd args: consul args: %s", args)
	cmd := exec.Command("consul", args...)
	cmd.Stdout = cfg.Stdout
	cmd.Stderr = cfg.Stderr
	if err := cmd.Start(); err != nil {
		os.RemoveAll(tmpdir)
		return nil, errors.Wrap(err, "failed starting command")
	}

	// Talk to the agent over TCP by default; switch to a unix-socket dialer
	// when the HTTP address was configured as unix://<path>.
	httpAddr := fmt.Sprintf("127.0.0.1:%d", cfg.Ports.HTTP)
	client := cleanhttp.DefaultClient()
	if strings.HasPrefix(cfg.Addresses.HTTP, "unix://") {
		httpAddr = cfg.Addresses.HTTP
		tr := cleanhttp.DefaultTransport()
		tr.DialContext = func(_ context.Context, _, _ string) (net.Conn, error) {
			return net.Dial("unix", httpAddr[len("unix://"):])
		}
		client = &http.Client{Transport: tr}
	}

	server := &TestServer{
		Config: cfg,
		cmd:    cmd,

		HTTPAddr:    httpAddr,
		HTTPSAddr:   fmt.Sprintf("127.0.0.1:%d", cfg.Ports.HTTPS),
		LANAddr:     fmt.Sprintf("127.0.0.1:%d", cfg.Ports.SerfLan),
		WANAddr:     fmt.Sprintf("127.0.0.1:%d", cfg.Ports.SerfWan),
		ServerAddr:  fmt.Sprintf("127.0.0.1:%d", cfg.Ports.Server),
		GRPCAddr:    fmt.Sprintf("127.0.0.1:%d", cfg.Ports.GRPC),
		GRPCTLSAddr: fmt.Sprintf("127.0.0.1:%d", cfg.Ports.GRPCTLS),

		HTTPClient: client,

		tmpdir: tmpdir,
	}

	// Wait for the server to be ready. On failure Stop tears down the
	// process and the temporary directory before the error is returned.
	if err := server.waitForAPI(); err != nil {
		if err := server.Stop(); err != nil {
			t.Logf("server stop failed with: %v", err)
		}
		return nil, err
	}

	return server, nil
}

// Stop stops the test Consul server, and removes the Consul data
// directory once we are done.
//
// When snapshot saving is enabled, a snapshot is written to
// <tmpdir>/backup.snap before the server is signalled. A snapshot failure
// no longer prevents the shutdown: the process is still stopped and the
// snapshot error is returned afterwards.
//
// The process is killed on Windows and interrupted elsewhere. If it has not
// exited within Config.StopTimeout it is sent SIGABRT (falling back to Kill
// if that signal cannot be delivered) and a timeout error is returned. The
// temporary directory is removed on return unless cleanup is disabled.
func (s *TestServer) Stop() error {
	defer func() {
		if noCleanup {
			fmt.Println("skipping cleanup because TEST_NOCLEANUP was enabled")
		} else {
			os.RemoveAll(s.tmpdir)
		}
	}()

	// There was no process
	if s.cmd == nil {
		return nil
	}

	var snapshotErr error
	if s.cmd.Process != nil && saveSnapshot {
		fmt.Println("Saving snapshot")
		// create a snapshot prior to upgrade test
		args := []string{"snapshot", "save", "-http-addr",
			fmt.Sprintf("http://%s", s.HTTPAddr), filepath.Join(s.tmpdir, "backup.snap")}
		fmt.Printf("Saving snapshot: consul args: %s\n", args)
		cmd := exec.Command("consul", args...)
		cmd.Stdout = s.Config.Stdout
		cmd.Stderr = s.Config.Stderr
		if err := cmd.Run(); err != nil {
			// Remember the failure but keep going: returning here would
			// leave the server running while its directory is deleted.
			snapshotErr = errors.Wrap(err, "failed to save a snapshot")
		}
	}

	stopErr := func() error {
		if s.cmd.Process != nil {
			if runtime.GOOS == "windows" {
				if err := s.cmd.Process.Kill(); err != nil {
					return errors.Wrap(err, "failed to kill consul server")
				}
			} else { // interrupt is not supported in windows
				if err := s.cmd.Process.Signal(os.Interrupt); err != nil {
					return errors.Wrap(err, "failed to kill consul server")
				}
			}
		}

		waitDone := make(chan error)
		go func() {
			waitDone <- s.cmd.Wait()
			close(waitDone)
		}()

		// wait for the process to exit to be sure that the data dir can be
		// deleted on all platforms.
		select {
		case err := <-waitDone:
			return err
		case <-time.After(s.Config.StopTimeout):
			if proc := s.cmd.Process; proc != nil {
				// If SIGABRT cannot be delivered, kill the process so the
				// receive below cannot block forever.
				if err := proc.Signal(syscall.SIGABRT); err != nil {
					_ = proc.Kill()
				}
			}
			<-waitDone
			return fmt.Errorf("timeout waiting for server to stop gracefully")
		}
	}()

	// The snapshot failure happened first, so it takes precedence.
	if snapshotErr != nil {
		return snapshotErr
	}
	return stopErr
}

// waitForAPI waits for the /status/leader HTTP endpoint to start
// responding. This is an indication that the agent has started,
// but will likely return before a leader is elected.
// Note: We do not check for a successful response status because
// we want this function to return without error even when
// there's no leader elected.
//
// It polls at the interval given by retry.TwoSeconds() and returns nil as
// soon as one request completes without a transport error. If no request
// succeeds before that timer's timeout elapses it returns an
// "api unavailable" error.
func (s *TestServer) waitForAPI() error {
	// This retry replicates the logic of retry.Run to allow for nested retries.
	// By returning an error we can wrap TestServer creation with retry.Run
	// in makeClientWithConfig.
	timer := retry.TwoSeconds()
	deadline := time.Now().Add(timer.Timeout)
	for !time.Now().After(deadline) {
		time.Sleep(timer.Wait)

		url := s.url("/v1/status/leader")
		resp, err := s.privilegedGet(url)
		if err != nil {
			continue
		}
		resp.Body.Close()

		// The API answered: stop polling instead of burning the rest of the
		// timeout and letting a later failed poll override this success.
		return nil
	}
	return fmt.Errorf("api unavailable")
}

// WaitForLeader blocks until the server reports a leader. Inside retry.Run
// it queries /v1/status/leader, requires an OK response, decodes the body as
// a JSON string, and fails the attempt while that string is empty.
func (s *TestServer) WaitForLeader(t testing.TB) {
	retry.Run(t, func(r *retry.R) {
		endpoint := s.url("/v1/status/leader")

		resp, getErr := s.privilegedGet(endpoint)
		if getErr != nil {
			r.Fatalf("failed http get '%s': %v", endpoint, getErr)
		}
		defer resp.Body.Close()

		if statusErr := s.requireOK(resp); statusErr != nil {
			r.Fatalf("failed OK response: %v", statusErr)
		}

		// The endpoint returns the leader address as a bare JSON string.
		var leaderAddr string
		if decodeErr := json.NewDecoder(resp.Body).Decode(&leaderAddr); decodeErr != nil {
			r.Fatal(decodeErr)
		}

		if leaderAddr == "" {
			r.Fatal("no leader address")
		}
	})
}

// WaitForVoting blocks until this server (matched by Config.NodeID) is
// listed as a voter in the raft configuration returned by
// /v1/operator/raft/configuration. Consider lowering the
// ServerStabilizationTime autopilot setting, otherwise this could take
// 10 seconds.
func (s *TestServer) WaitForVoting(t testing.TB) {
	// Only the fields needed to find ourselves and read the voter flag.
	type raftServer struct {
		ID    string
		Voter bool
	}
	type raftCfgResponse struct {
		Servers []raftServer
	}

	retry.Run(t, func(r *retry.R) {
		endpoint := s.url("/v1/operator/raft/configuration")

		resp, getErr := s.privilegedGet(endpoint)
		if getErr != nil {
			r.Fatalf("failed http get '%s': %v", endpoint, getErr)
		}
		defer resp.Body.Close()

		if statusErr := s.requireOK(resp); statusErr != nil {
			r.Fatalf("failed OK response: %v", statusErr)
		}

		var raftCfg raftCfgResponse
		if decodeErr := json.NewDecoder(resp.Body).Decode(&raftCfg); decodeErr != nil {
			r.Fatal(decodeErr)
		}

		for _, member := range raftCfg.Servers {
			if member.ID != s.Config.NodeID {
				continue
			}
			// Found our own entry; it alone decides the outcome.
			if member.Voter {
				return
			}
			break
		}
		r.Fatalf("Server is not voting: %#v", raftCfg.Servers)
	})
}

// WaitForActiveCARoot blocks until /v1/agent/connect/ca/roots answers OK
// with a non-empty ActiveRootID and at least one root, meaning Connect has
// finished bootstrapping its CA and is ready to use.
func (s *TestServer) WaitForActiveCARoot(t testing.TB) {
	// Partial view of the response; the roots themselves are not inspected.
	type rootsResponse struct {
		ActiveRootID string
		TrustDomain  string
		Roots        []interface{}
	}

	retry.Run(t, func(r *retry.R) {
		endpoint := s.url("/v1/agent/connect/ca/roots")

		resp, getErr := s.privilegedGet(endpoint)
		if getErr != nil {
			r.Fatalf("failed http get '%s': %v", endpoint, getErr)
		}
		defer resp.Body.Close()

		// The endpoint answers with an error status until the CA has been
		// bootstrapped, so a non-OK response simply means "retry".
		if statusErr := s.requireOK(resp); statusErr != nil {
			r.Fatalf("failed OK response: %v", statusErr)
		}

		var roots rootsResponse
		if decodeErr := json.NewDecoder(resp.Body).Decode(&roots); decodeErr != nil {
			r.Fatal(decodeErr)
		}

		hasActiveRoot := roots.ActiveRootID != "" && len(roots.Roots) >= 1
		if !hasActiveRoot {
			r.Fatalf("/v1/agent/connect/ca/roots returned 200 but without roots: %+v", roots)
		}
	})
}

// WaitForServiceIntentions waits until the server can accept config entry
// kinds of service-intentions meaning any migration bootstrapping from pre-1.9
// intentions has completed.
//
// It retries an HTTP DELETE of a config entry with a fixed, made-up name
// until the server answers with an OK status.
func (s *TestServer) WaitForServiceIntentions(t testing.TB) {
	const fakeConfigName = "Sa4ohw5raith4si0Ohwuqu3lowiethoh"
	retry.Run(t, func(r *retry.R) {
		// Try to delete a non-existent service-intentions config entry. The
		// preflightCheck call in agent/consul/config_endpoint.go will fail if
		// we aren't ready yet, vs just doing no work instead.
		url := s.url("/v1/config/service-intentions/" + fakeConfigName)
		resp, err := s.privilegedDelete(url)
		if err != nil {
			// This request is a DELETE, so report it as such.
			r.Fatalf("failed http delete '%s': %v", url, err)
		}
		defer resp.Body.Close()
		if err := s.requireOK(resp); err != nil {
			r.Fatalf("failed OK response: %v", err)
		}
	})
}

// WaitForSerfCheck ensures we have a node with serfHealth check registered
// Behavior mirrors testrpc.WaitForTestAgent but avoids the dependency cycle in api pkg
//
// Each attempt lists the catalog nodes, takes the first one, fetches its
// health checks from /v1/health/node/<node>, and succeeds once one of them
// has the CheckID "serfHealth". A check whose CheckID is missing or not a
// string is skipped rather than causing a panic.
func (s *TestServer) WaitForSerfCheck(t testing.TB) {
	retry.Run(t, func(r *retry.R) {
		// Query the API and check the status code.
		url := s.url("/v1/catalog/nodes?index=0")
		resp, err := s.privilegedGet(url)
		if err != nil {
			r.Fatalf("failed http get: %v", err)
		}
		defer resp.Body.Close()
		if err := s.requireOK(resp); err != nil {
			r.Fatalf("failed OK response: %v", err)
		}

		// Watch for the anti-entropy sync to finish.
		var nodes []map[string]interface{}
		if err := json.NewDecoder(resp.Body).Decode(&nodes); err != nil {
			r.Fatal(err)
		}
		if len(nodes) < 1 {
			r.Fatal("No nodes")
		}

		// Ensure the serfHealth check is registered
		url = s.url(fmt.Sprintf("/v1/health/node/%s", nodes[0]["Node"]))
		resp, err = s.privilegedGet(url)
		if err != nil {
			r.Fatalf("failed http get: %v", err)
		}
		defer resp.Body.Close()
		if err := s.requireOK(resp); err != nil {
			r.Fatalf("failed OK response: %v", err)
		}

		// Decode into a fresh slice so nothing from the node listing can
		// linger in the check data.
		var checks []map[string]interface{}
		if err := json.NewDecoder(resp.Body).Decode(&checks); err != nil {
			r.Fatal(err)
		}

		var found bool
		for _, check := range checks {
			if id, ok := check["CheckID"].(string); ok && id == "serfHealth" {
				found = true
				break
			}
		}
		if !found {
			r.Fatal("missing serfHealth registration")
		}
	})
}

// privilegedGet issues a GET request for url through the server's HTTP
// client. When an initial management ACL token is configured it is sent in
// the x-consul-token header. The caller is responsible for closing the
// response body.
func (s *TestServer) privilegedGet(url string) (*http.Response, error) {
	request, err := http.NewRequest(http.MethodGet, url, nil)
	if err != nil {
		return nil, err
	}

	if token := s.Config.ACL.Tokens.InitialManagement; token != "" {
		request.Header.Set("x-consul-token", token)
	}

	return s.HTTPClient.Do(request)
}

// privilegedDelete issues a DELETE request for url through the server's
// HTTP client. When an initial management ACL token is configured it is
// sent in the x-consul-token header. The caller is responsible for closing
// the response body.
func (s *TestServer) privilegedDelete(url string) (*http.Response, error) {
	request, err := http.NewRequest(http.MethodDelete, url, nil)
	if err != nil {
		return nil, err
	}

	if token := s.Config.ACL.Tokens.InitialManagement; token != "" {
		request.Header.Set("x-consul-token", token)
	}

	return s.HTTPClient.Do(request)
}

// findConsulVersion runs `consul version -format=json` and returns the
// parsed value of the "Version" field of its output.
//
// It returns an error if the command cannot be started or exits
// unsuccessfully (the command's stderr is included in the message), if the
// output is not valid JSON, or if the version string cannot be parsed.
func findConsulVersion() (*version.Version, error) {
	cmd := exec.Command("consul", "version", "-format=json")
	var stdout, stderr bytes.Buffer
	cmd.Stdout = &stdout
	cmd.Stderr = &stderr
	// Run reports both start failures and a non-zero exit, so a failing
	// command is not mistaken for a JSON parsing problem further down.
	if err := cmd.Run(); err != nil {
		return nil, errors.Wrapf(err, "failed to get consul version: %s",
			strings.TrimSpace(stderr.String()))
	}
	type consulVersion struct {
		Version string
	}
	v := consulVersion{}
	if err := json.Unmarshal(stdout.Bytes(), &v); err != nil {
		return nil, errors.Wrap(err, "error parsing consul version json")
	}
	parsed, err := version.NewVersion(v.Version)
	if err != nil {
		return nil, errors.Wrap(err, "error parsing consul version")
	}
	return parsed, nil
}