From 4b26ed8eba99e8a0aa23d663e815ed54f5e9f068 Mon Sep 17 00:00:00 2001
From: cskh
Date: Mon, 30 Oct 2023 18:20:23 -0400
Subject: [PATCH] integ test: snapshot mesh frozen bug test (#19435)

* integ test: snapshot mesh frozen bug test
---
 .github/workflows/test-integrations.yml     |  79 ++++++++
 test-integ/connect/snapshot_test.go         | 212 ++++++++++++++++++++
 testing/deployer/sprawl/boot.go             |   7 +-
 testing/deployer/sprawl/sprawl.go           |  26 +++
 testing/deployer/topology/default_consul.go |   4 +-
 testing/deployer/topology/topology.go       |   4 +-
 6 files changed, 325 insertions(+), 7 deletions(-)
 create mode 100644 test-integ/connect/snapshot_test.go

diff --git a/.github/workflows/test-integrations.yml b/.github/workflows/test-integrations.yml
index 64da185db6..ef89176536 100644
--- a/.github/workflows/test-integrations.yml
+++ b/.github/workflows/test-integrations.yml
@@ -487,6 +487,84 @@ jobs:
           DD_ENV: ci
         run: datadog-ci junit upload --service "$GITHUB_REPOSITORY" $TEST_RESULTS_DIR/results.xml

+  integration-test-with-deployer:
+    runs-on: ${{ fromJSON(needs.setup.outputs.compute-large ) }}
+    needs:
+      - setup
+    permissions:
+      id-token: write # NOTE: this permission is explicitly required for Vault auth.
+      contents: read
+    strategy:
+      fail-fast: false
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@c85c95e3d7251135ab7dc9ce3241c5835cc595a9 # v3.5.3
+      # NOTE: This step is specifically needed for ENT. It allows us to access the required private HashiCorp repos.
+      - name: Setup Git
+        if: ${{ endsWith(github.repository, '-enterprise') }}
+        run: git config --global url."https://${{ secrets.ELEVATED_GITHUB_TOKEN }}:@github.com".insteadOf "https://github.com"
+      - uses: actions/setup-go@fac708d6674e30b6ba41289acaab6d4b75aa0753 # v4.0.1
+        with:
+          go-version-file: 'go.mod'
+      - run: go env
+      - name: Build image
+        run: make test-compat-integ-setup
+      - name: Integration Tests
+        run: |
+          mkdir -p "${{ env.TEST_RESULTS_DIR }}"
+          export NOLOGBUFFER=1
+          cd ./test-integ/connect
+          go run gotest.tools/gotestsum@v${{env.GOTESTSUM_VERSION}} \
+            --raw-command \
+            --format=standard-verbose \
+            --debug \
+            --packages="./..." \
+            -- \
+            go test \
+            -tags "${{ env.GOTAGS }}" \
+            -timeout=20m \
+            -parallel=2 \
+            -json . \
+            --target-image ${{ env.CONSUL_LATEST_IMAGE_NAME }} \
+            --target-version local
+        env:
+          # this is needed because of incompatibility between RYUK container and GHA
+          GOTESTSUM_JUNITFILE: ${{ env.TEST_RESULTS_DIR }}/results.xml
+          GOTESTSUM_FORMAT: standard-verbose
+          COMPOSE_INTERACTIVE_NO_CLI: 1
+          # tput complains if this isn't set to something.
+          TERM: ansi
+      # NOTE: ENT specific step as we store secrets in Vault.
+      - name: Authenticate to Vault
+        if: ${{ endsWith(github.repository, '-enterprise') }}
+        id: vault-auth
+        run: vault-auth
+
+      # NOTE: ENT specific step as we store secrets in Vault.
+      - name: Fetch Secrets
+        if: ${{ endsWith(github.repository, '-enterprise') }}
+        id: secrets
+        uses: hashicorp/vault-action@v2.5.0
+        with:
+          url: ${{ steps.vault-auth.outputs.addr }}
+          caCertificate: ${{ steps.vault-auth.outputs.ca_certificate }}
+          token: ${{ steps.vault-auth.outputs.token }}
+          secrets: |
+            kv/data/github/${{ github.repository }}/datadog apikey | DATADOG_API_KEY;
+
+      - name: prepare datadog-ci
+        if: ${{ !endsWith(github.repository, '-enterprise') }}
+        run: |
+          curl -L --fail "https://github.com/DataDog/datadog-ci/releases/latest/download/datadog-ci_linux-x64" --output "/usr/local/bin/datadog-ci"
+          chmod +x /usr/local/bin/datadog-ci
+
+      - name: upload coverage
+        # do not run on forks
+        if: github.event.pull_request.head.repo.full_name == github.repository
+        env:
+          DATADOG_API_KEY: "${{ endsWith(github.repository, '-enterprise') && env.DATADOG_API_KEY || secrets.DATADOG_API_KEY }}"
+          DD_ENV: ci
+        run: datadog-ci junit upload --service "$GITHUB_REPOSITORY" $TEST_RESULTS_DIR/results.xml

   test-integrations-success:
     needs:
@@ -498,6 +576,7 @@ jobs:
     - generate-envoy-job-matrices
     - envoy-integration-test
     - compatibility-integration-test
+    - integration-test-with-deployer
     runs-on: ${{ fromJSON(needs.setup.outputs.compute-small) }}
     if: always() && needs.conditional-skip.outputs.skip-ci != 'true'
     steps:
diff --git a/test-integ/connect/snapshot_test.go b/test-integ/connect/snapshot_test.go
new file mode 100644
index 0000000000..469562a160
--- /dev/null
+++ b/test-integ/connect/snapshot_test.go
@@ -0,0 +1,212 @@
+// Copyright (c) HashiCorp, Inc.
+// SPDX-License-Identifier: BUSL-1.1
+
+package connect
+
+import (
+	"fmt"
+	"io"
+	"net/http"
+	"net/url"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/require"
+
+	"github.com/hashicorp/consul/api"
+	"github.com/hashicorp/consul/sdk/testutil/retry"
+	"github.com/hashicorp/consul/test/integration/consul-container/libs/utils"
+	"github.com/hashicorp/consul/testing/deployer/sprawl/sprawltest"
+	"github.com/hashicorp/consul/testing/deployer/topology"
+)
+
+// Test_Snapshot_Restore_Agentless verifies that the Consul server can continue
+// to push Envoy config after the cluster is restored from a snapshot.
+//
+// - This test detects the server agent freezing after restoring from a snapshot
+//   (https://github.com/hashicorp/consul/pull/18636).
+//
+// - This bug only appeared in agentless mode.
+//
+// Steps:
+// 1. The test spins up a one-server cluster with static-server and static-client.
+// 2. A snapshot is taken and the cluster is restored from the snapshot.
+// 3. A new static-server replaces the old one.
+// 4.
At the end, we assert the static-client's upstream is updated with the +// new static-server +func Test_Snapshot_Restore_Agentless(t *testing.T) { + t.Parallel() + + staticServerSID := topology.NewServiceID("static-server", "default", "default") + staticClientSID := topology.NewServiceID("static-client", "default", "default") + + clu := &topology.Config{ + Images: utils.TargetImages(), + Networks: []*topology.Network{ + {Name: "dc1"}, + }, + Clusters: []*topology.Cluster{ + { + Name: "dc1", + Nodes: []*topology.Node{ + { + Kind: topology.NodeKindServer, + // NOTE: uncomment the following lines to trigger the agent frozen bug + // Images: topology.Images{ + // ConsulEnterprise: "hashicorp/consul-enterprise:1.16.1-ent", + // }, + Name: "dc1-server1", + Addresses: []*topology.Address{ + {Network: "dc1"}, + }, + }, + { + Kind: topology.NodeKindDataplane, + Name: "dc1-client1", + Services: []*topology.Service{ + { + ID: staticServerSID, + Image: "docker.mirror.hashicorp.services/fortio/fortio", + Port: 8080, + EnvoyAdminPort: 19000, + CheckTCP: "127.0.0.1:8080", + Command: []string{ + "server", + "-http-port", "8080", + "-redirect-port", "-disabled", + }, + }, + }, + }, + { + Kind: topology.NodeKindDataplane, + Name: "dc1-client2", + Services: []*topology.Service{ + { + ID: staticClientSID, + Image: "docker.mirror.hashicorp.services/fortio/fortio", + Port: 8080, + EnvoyAdminPort: 19000, + CheckTCP: "127.0.0.1:8080", + Command: []string{ + "server", + "-http-port", "8080", + "-redirect-port", "-disabled", + }, + Upstreams: []*topology.Upstream{ + { + ID: staticServerSID, + LocalPort: 5000, + }, + }, + }, + }, + }, + // Client3 for second static-server + { + Kind: topology.NodeKindDataplane, + Name: "dc1-client3", + Disabled: true, + Services: []*topology.Service{ + { + ID: staticServerSID, + Image: "docker.mirror.hashicorp.services/fortio/fortio", + Port: 8080, + EnvoyAdminPort: 19000, + CheckTCP: "127.0.0.1:8080", + Command: []string{ + "server", + "-http-port", "8080", + "-redirect-port", "-disabled", + }, + }, + }, + }, + }, + Enterprise: utils.IsEnterprise(), + InitialConfigEntries: []api.ConfigEntry{ + &api.ProxyConfigEntry{ + Kind: api.ProxyDefaults, + Name: "global", + Config: map[string]any{ + "protocol": "http", + }, + }, + &api.ServiceConfigEntry{ + Kind: api.ServiceDefaults, + Name: "static-server", + }, + &api.ServiceIntentionsConfigEntry{ + Kind: api.ServiceIntentions, + Name: "static-server", + Sources: []*api.SourceIntention{ + { + Name: "static-client", + Action: api.IntentionActionAllow, + }, + }, + }, + }, + }, + }, + } + sp := sprawltest.Launch(t, clu) + + client, err := sp.HTTPClientForCluster("dc1") + require.NoError(t, err) + + staticClient := sp.Topology().Clusters["dc1"].ServiceByID( + topology.NewNodeID("dc1-client2", "default"), + staticClientSID, + ) + staticClientAddress := fmt.Sprintf("%s:%d", staticClient.Node.LocalAddress(), staticClient.Port) + + // The following url causes the static-client's fortio server to + // fetch the ?url= param (the upstream static-server in our case). + url := fmt.Sprintf("http://%s/fortio/fetch2?url=%s", staticClientAddress, + url.QueryEscape("http://localhost:5000"), + ) + + // We retry the first request until we get 200 OK since it may take a while + // for the server to be available. + // Use a custom retry.Timer since the default one usually times out too early. 
+	retrySendRequest := func(isSuccess bool) {
+		t.Log("static-client sending requests to static-server...")
+		retry.RunWith(&retry.Timer{Timeout: 60 * time.Second, Wait: time.Millisecond * 500}, t, func(r *retry.R) {
+			resp, err := client.Post(url, "text/plain", nil)
+			require.NoError(r, err)
+			defer resp.Body.Close()
+
+			if isSuccess {
+				require.Equal(r, http.StatusOK, resp.StatusCode)
+			} else {
+				require.NotEqual(r, http.StatusOK, resp.StatusCode)
+			}
+			body, err := io.ReadAll(resp.Body)
+			require.NoError(r, err)
+			fmt.Println("Body: ", string(body), resp.StatusCode)
+		})
+	}
+	retrySendRequest(true)
+	t.Log("...ok, got 200 responses")
+
+	t.Log("Take a snapshot of the cluster and restore ...")
+	err = sp.SnapshotSave("dc1")
+	require.NoError(t, err)
+
+	// Shut down the existing static-server.
+	cfg := sp.Config()
+	cluster := cfg.Cluster("dc1")
+	cluster.Nodes[1].Disabled = true // client 1 -- static-server
+	require.NoError(t, sp.Relaunch(cfg))
+	retrySendRequest(false)
+
+	// Add a new static-server.
+	cfg = sp.Config()
+	cluster = cfg.Cluster("dc1")
+	cluster.Nodes[3].Disabled = false // client 3 -- static-server
+	require.NoError(t, sp.Relaunch(cfg))
+
+	// Ensure the static-client connects to the new static-server.
+	retrySendRequest(true)
+}
diff --git a/testing/deployer/sprawl/boot.go b/testing/deployer/sprawl/boot.go
index 89c887c757..37656ca0ad 100644
--- a/testing/deployer/sprawl/boot.go
+++ b/testing/deployer/sprawl/boot.go
@@ -180,6 +180,7 @@ func (s *Sprawl) assignIPAddresses() error {
 					return fmt.Errorf("unknown network %q", addr.Network)
 				}
 				addr.IPAddress = net.IPByIndex(node.Index)
+				s.logger.Info("assign addr", "node", node.Name, "addr", addr.IPAddress)
 			}
 		}
 	}
@@ -504,7 +505,7 @@ func (s *Sprawl) waitForClientAntiEntropyOnce(cluster *topology.Cluster) error {
 			logger.Debug("all nodes have posted node updates, so first anti-entropy has happened", "elapsed", dur)
 			return nil
 		}
-		logger.Debug("not all client nodes have posted node updates yet", "nodes", stragglers)
+		logger.Debug("not all nodes have posted node updates yet", "nodes", stragglers)
 
 		time.Sleep(1 * time.Second)
 	}
@@ -514,10 +515,10 @@ func newGossipKey() (string, error) {
 	key := make([]byte, 16)
 	n, err := rand.Reader.Read(key)
 	if err != nil {
-		return "", fmt.Errorf("Error reading random data: %s", err)
+		return "", fmt.Errorf("error reading random data: %s", err)
 	}
 	if n != 16 {
-		return "", fmt.Errorf("Couldn't read enough entropy. Generate more entropy!")
+		return "", fmt.Errorf("couldn't read enough entropy. Generate more entropy")
 	}
 	return base64.StdEncoding.EncodeToString(key), nil
 }
diff --git a/testing/deployer/sprawl/sprawl.go b/testing/deployer/sprawl/sprawl.go
index 3433e1b3dc..16743b8b7b 100644
--- a/testing/deployer/sprawl/sprawl.go
+++ b/testing/deployer/sprawl/sprawl.go
@@ -235,6 +235,32 @@ func (s *Sprawl) Relaunch(
 	return nil
 }
 
+// SnapshotSave saves a snapshot of a cluster and then restores the cluster from it.
+func (s *Sprawl) SnapshotSave(clusterName string) error {
+	cluster, ok := s.topology.Clusters[clusterName]
+	if !ok {
+		return fmt.Errorf("no such cluster: %s", clusterName)
+	}
+	var (
+		client = s.clients[cluster.Name]
+	)
+	snapshot := client.Snapshot()
+	snap, _, err := snapshot.Save(nil)
+	if err != nil {
+		return fmt.Errorf("error saving snapshot: %w", err)
+	}
+	s.logger.Info("snapshot saved")
+	time.Sleep(3 * time.Second)
+	defer snap.Close()
+
+	// Restore the snapshot.
+ if err := snapshot.Restore(nil, snap); err != nil { + return fmt.Errorf("error restoring snapshot: %w", err) + } + s.logger.Info("snapshot restored") + return nil +} + // Leader returns the cluster leader agent, or an error if no leader is // available. func (s *Sprawl) Leader(clusterName string) (*topology.Node, error) { diff --git a/testing/deployer/topology/default_consul.go b/testing/deployer/topology/default_consul.go index f9542f1664..3a259d12d4 100644 --- a/testing/deployer/topology/default_consul.go +++ b/testing/deployer/topology/default_consul.go @@ -3,5 +3,5 @@ package topology -const DefaultConsulImage = "hashicorp/consul:1.15.2" -const DefaultConsulEnterpriseImage = "hashicorp/consul-enterprise:1.15.2-ent" +const DefaultConsulImage = "hashicorp/consul:1.16.2" +const DefaultConsulEnterpriseImage = "hashicorp/consul-enterprise:1.16.2-ent" diff --git a/testing/deployer/topology/topology.go b/testing/deployer/topology/topology.go index a71dbaa451..2662329aee 100644 --- a/testing/deployer/topology/topology.go +++ b/testing/deployer/topology/topology.go @@ -515,7 +515,7 @@ func (n *Node) LocalAddress() string { for _, a := range n.Addresses { if a.IsLocal() { if a.IPAddress == "" { - panic("node has no assigned local address") + panic("node has no assigned local address: " + n.Name) } return a.IPAddress } @@ -538,7 +538,7 @@ func (n *Node) LocalProxyPort() int { if a.ProxyPort > 0 { return a.ProxyPort } - panic("node has no assigned local address") + panic("node has no assigned local address: " + n.Name) } } panic("node has no local network")
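
Note on the snapshot helper: Sprawl.SnapshotSave above is a thin wrapper around the Consul snapshot HTTP API exposed by github.com/hashicorp/consul/api (Snapshot().Save and Snapshot().Restore, the same calls used in the diff). The following is a minimal standalone sketch of that save-then-restore round trip; it assumes a Consul server reachable at the default local address and is illustrative only, not part of this patch:

package main

import (
	"log"

	"github.com/hashicorp/consul/api"
)

func main() {
	// Assumes a Consul agent/server reachable at the default address (127.0.0.1:8500).
	client, err := api.NewClient(api.DefaultConfig())
	if err != nil {
		log.Fatalf("error creating client: %v", err)
	}

	snapshot := client.Snapshot()

	// Save streams the snapshot contents back as an io.ReadCloser.
	snap, _, err := snapshot.Save(nil)
	if err != nil {
		log.Fatalf("error saving snapshot: %v", err)
	}
	defer snap.Close()

	// Restore feeds the snapshot bytes back into the cluster.
	if err := snapshot.Restore(nil, snap); err != nil {
		log.Fatalf("error restoring snapshot: %v", err)
	}
	log.Println("snapshot saved and restored")
}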