consul/test-integ/connect/snapshot_test.go

// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: BUSL-1.1

package connect

import (
	"fmt"
	"io"
	"net/http"
	"net/url"
	"testing"
	"time"

	"github.com/stretchr/testify/require"

	"github.com/hashicorp/consul/api"
	"github.com/hashicorp/consul/sdk/testutil/retry"
	"github.com/hashicorp/consul/test/integration/consul-container/libs/utils"
	"github.com/hashicorp/consul/testing/deployer/sprawl/sprawltest"
	"github.com/hashicorp/consul/testing/deployer/topology"
)

// Test_Snapshot_Restore_Agentless verifies that the consul server agent can
// continue to push envoy config after restoring from a snapshot.
//
//   - This test is to detect the server agent freezing after restoring from a
//     snapshot (https://github.com/hashicorp/consul/pull/18636)
//
//   - This bug only appeared in agentless mode
//
// Steps:
//  1. The test spins up a one-server cluster with static-server and
//     static-client, and waits until static-client can reach static-server
//     through its upstream.
//  2. A snapshot of the cluster is saved.
//  3. The original static-server node is disabled and the cluster is
//     relaunched; requests from static-client must stop returning 200.
//  4. A new static-server (on a previously disabled node) is enabled and the
//     cluster is relaunched again.
//  5. At the end, we assert the static-client's upstream is updated with the
//     new static-server, i.e. requests return 200 again.
func Test_Snapshot_Restore_Agentless(t *testing.T) {
	t.Parallel()

	staticServerSID := topology.NewServiceID("static-server", "default", "default")
	staticClientSID := topology.NewServiceID("static-client", "default", "default")

	clu := &topology.Config{
		Images: utils.TargetImages(),
		Networks: []*topology.Network{
			{Name: "dc1"},
		},
		Clusters: []*topology.Cluster{
			{
				Name: "dc1",
				Nodes: []*topology.Node{
					{
						Kind: topology.NodeKindServer,
						// NOTE: uncomment the following lines to trigger the agent frozen bug
						// Images: topology.Images{
						// 	ConsulEnterprise: "hashicorp/consul-enterprise:1.16.1-ent",
						// },
						Name: "dc1-server1",
						Addresses: []*topology.Address{
							{Network: "dc1"},
						},
					},
					// Client1 hosts the initial static-server.
					{
						Kind: topology.NodeKindDataplane,
						Name: "dc1-client1",
						Services: []*topology.Service{
							{
								ID:             staticServerSID,
								Image:          "docker.mirror.hashicorp.services/fortio/fortio",
								Port:           8080,
								EnvoyAdminPort: 19000,
								CheckTCP:       "127.0.0.1:8080",
								Command: []string{
									"server",
									"-http-port", "8080",
									"-redirect-port", "-disabled",
								},
							},
						},
					},
					// Client2 hosts static-client, whose upstream on local
					// port 5000 points at static-server.
					{
						Kind: topology.NodeKindDataplane,
						Name: "dc1-client2",
						Services: []*topology.Service{
							{
								ID:             staticClientSID,
								Image:          "docker.mirror.hashicorp.services/fortio/fortio",
								Port:           8080,
								EnvoyAdminPort: 19000,
								CheckTCP:       "127.0.0.1:8080",
								Command: []string{
									"server",
									"-http-port", "8080",
									"-redirect-port", "-disabled",
								},
								Upstreams: []*topology.Upstream{
									{
										ID:        staticServerSID,
										LocalPort: 5000,
									},
								},
							},
						},
					},
					// Client3 for second static-server; starts disabled and is
					// enabled later in the test.
					{
						Kind:     topology.NodeKindDataplane,
						Name:     "dc1-client3",
						Disabled: true,
						Services: []*topology.Service{
							{
								ID:             staticServerSID,
								Image:          "docker.mirror.hashicorp.services/fortio/fortio",
								Port:           8080,
								EnvoyAdminPort: 19000,
								CheckTCP:       "127.0.0.1:8080",
								Command: []string{
									"server",
									"-http-port", "8080",
									"-redirect-port", "-disabled",
								},
							},
						},
					},
				},
				Enterprise: utils.IsEnterprise(),
				InitialConfigEntries: []api.ConfigEntry{
					&api.ProxyConfigEntry{
						Kind: api.ProxyDefaults,
						Name: "global",
						Config: map[string]any{
							"protocol": "http",
						},
					},
					&api.ServiceConfigEntry{
						Kind: api.ServiceDefaults,
						Name: "static-server",
					},
					&api.ServiceIntentionsConfigEntry{
						Kind: api.ServiceIntentions,
						Name: "static-server",
						Sources: []*api.SourceIntention{
							{
								Name:   "static-client",
								Action: api.IntentionActionAllow,
							},
						},
					},
				},
			},
		},
	}
	sp := sprawltest.Launch(t, clu)

	client, err := sp.HTTPClientForCluster("dc1")
	require.NoError(t, err)

	staticClient := sp.Topology().Clusters["dc1"].ServiceByID(
		topology.NewNodeID("dc1-client2", "default"),
		staticClientSID,
	)
	staticClientAddress := fmt.Sprintf("%s:%d", staticClient.Node.LocalAddress(), staticClient.Port)

	// The following url causes the static-client's fortio server to
	// fetch the ?url= param (the upstream static-server in our case).
	// It is deliberately not named "url" so the net/url package stays
	// accessible for the remainder of the function.
	fetchURL := fmt.Sprintf("http://%s/fortio/fetch2?url=%s", staticClientAddress,
		url.QueryEscape("http://localhost:5000"),
	)

	// retrySendRequest repeatedly asks static-client to fetch from its
	// upstream until the response matches the expectation: 200 OK when wantOK
	// is true, anything other than 200 OK when it is false. Retrying is needed
	// because it may take a while for the mesh to reflect a topology change.
	// Use a custom retry.Timer since the default one usually times out too early.
	retrySendRequest := func(wantOK bool) {
		t.Log("static-client sending requests to static-server...")
		retry.RunWith(&retry.Timer{Timeout: 60 * time.Second, Wait: time.Millisecond * 500}, t, func(r *retry.R) {
			resp, err := client.Post(fetchURL, "text/plain", nil)
			require.NoError(r, err)
			defer resp.Body.Close()

			// Read the body before asserting on the status so that it can be
			// attached to the failure message of an unsuccessful attempt.
			body, err := io.ReadAll(resp.Body)
			require.NoError(r, err)

			if wantOK {
				require.Equal(r, http.StatusOK, resp.StatusCode, "response body: %s", body)
			} else {
				require.NotEqual(r, http.StatusOK, resp.StatusCode, "response body: %s", body)
			}

			// Log through t rather than stdout so the output is attributed
			// to this (parallel) test.
			t.Logf("response status=%d body=%s", resp.StatusCode, body)
		})
	}
	retrySendRequest(true)
	t.Log("...ok, got 200 responses")

	t.Log("Take a snapshot of the cluster and restore ...")
	err = sp.SnapshotSave("dc1")
	require.NoError(t, err)

	// Shutdown existing static-server.
	// NOTE(review): the index assumes sp.Config() keeps the node order
	// declared above (index 1 == dc1-client1) — confirm.
	cfg := sp.Config()
	cluster := cfg.Cluster("dc1")
	cluster.Nodes[1].Disabled = true //  client 1 -- static-server
	require.NoError(t, sp.Relaunch(cfg))
	retrySendRequest(false)
	t.Log("...ok, requests fail while no static-server is running")

	// Add a new static-server.
	// NOTE(review): same ordering assumption (index 3 == dc1-client3).
	cfg = sp.Config()
	cluster = cfg.Cluster("dc1")
	cluster.Nodes[3].Disabled = false //  client 3 -- static-server
	require.NoError(t, sp.Relaunch(cfg))

	// Ensure the static-client connected to static-server
	retrySendRequest(true)
	t.Log("...ok, got 200 responses from the new static-server")
}