mirror of https://github.com/status-im/consul.git

integ test: snapshot mesh frozen bug test (#19435)

* integ test: snapshot mesh frozen bug test

parent e18901ba88, commit 4b26ed8eba
@@ -487,6 +487,84 @@ jobs:
           DD_ENV: ci
         run: datadog-ci junit upload --service "$GITHUB_REPOSITORY" $TEST_RESULTS_DIR/results.xml
 
+  integration-test-with-deployer:
+    runs-on: ${{ fromJSON(needs.setup.outputs.compute-large ) }}
+    needs:
+      - setup
+    permissions:
+      id-token: write # NOTE: this permission is explicitly required for Vault auth.
+      contents: read
+    strategy:
+      fail-fast: false
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@c85c95e3d7251135ab7dc9ce3241c5835cc595a9 # v3.5.3
+      # NOTE: This step is specifically needed for ENT. It allows us to access the required private HashiCorp repos.
+      - name: Setup Git
+        if: ${{ endsWith(github.repository, '-enterprise') }}
+        run: git config --global url."https://${{ secrets.ELEVATED_GITHUB_TOKEN }}:@github.com".insteadOf "https://github.com"
+      - uses: actions/setup-go@fac708d6674e30b6ba41289acaab6d4b75aa0753 # v4.0.1
+        with:
+          go-version-file: 'go.mod'
+      - run: go env
+      - name: Build image
+        run: make test-compat-integ-setup
+      - name: Integration Tests
+        run: |
+          mkdir -p "${{ env.TEST_RESULTS_DIR }}"
+          export NOLOGBUFFER=1
+          cd ./test-integ/connect
+          go run gotest.tools/gotestsum@v${{env.GOTESTSUM_VERSION}} \
+            --raw-command \
+            --format=standard-verbose \
+            --debug \
+            --packages="./..." \
+            -- \
+            go test \
+            -tags "${{ env.GOTAGS }}" \
+            -timeout=20m \
+            -parallel=2 \
+            -json . \
+            --target-image ${{ env.CONSUL_LATEST_IMAGE_NAME }} \
+            --target-version local
+        env:
+          # this is needed because of incompatibility between RYUK container and GHA
+          GOTESTSUM_JUNITFILE: ${{ env.TEST_RESULTS_DIR }}/results.xml
+          GOTESTSUM_FORMAT: standard-verbose
+          COMPOSE_INTERACTIVE_NO_CLI: 1
+          # tput complains if this isn't set to something.
+          TERM: ansi
+      # NOTE: ENT specific step as we store secrets in Vault.
+      - name: Authenticate to Vault
+        if: ${{ endsWith(github.repository, '-enterprise') }}
+        id: vault-auth
+        run: vault-auth
+
+      # NOTE: ENT specific step as we store secrets in Vault.
+      - name: Fetch Secrets
+        if: ${{ endsWith(github.repository, '-enterprise') }}
+        id: secrets
+        uses: hashicorp/vault-action@v2.5.0
+        with:
+          url: ${{ steps.vault-auth.outputs.addr }}
+          caCertificate: ${{ steps.vault-auth.outputs.ca_certificate }}
+          token: ${{ steps.vault-auth.outputs.token }}
+          secrets: |
+            kv/data/github/${{ github.repository }}/datadog apikey | DATADOG_API_KEY;
+
+      - name: prepare datadog-ci
+        if: ${{ !endsWith(github.repository, '-enterprise') }}
+        run: |
+          curl -L --fail "https://github.com/DataDog/datadog-ci/releases/latest/download/datadog-ci_linux-x64" --output "/usr/local/bin/datadog-ci"
+          chmod +x /usr/local/bin/datadog-ci
+
+      - name: upload coverage
+        # do not run on forks
+        if: github.event.pull_request.head.repo.full_name == github.repository
+        env:
+          DATADOG_API_KEY: "${{ endsWith(github.repository, '-enterprise') && env.DATADOG_API_KEY || secrets.DATADOG_API_KEY }}"
+          DD_ENV: ci
+        run: datadog-ci junit upload --service "$GITHUB_REPOSITORY" $TEST_RESULTS_DIR/results.xml
+
   test-integrations-success:
     needs:

@@ -498,6 +576,7 @@ jobs:
       - generate-envoy-job-matrices
       - envoy-integration-test
       - compatibility-integration-test
+      - integration-test-with-deployer
     runs-on: ${{ fromJSON(needs.setup.outputs.compute-small) }}
     if: always() && needs.conditional-skip.outputs.skip-ci != 'true'
     steps:
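For local debugging, the new suite can be exercised without the gotestsum wrapper: build the image with make test-compat-integ-setup, then invoke go test from ./test-integ/connect with the --target-image and --target-version flags shown above. (A sketch inferred from the CI step above, not a documented workflow.)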

@@ -0,0 +1,212 @@
// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: BUSL-1.1

package connect

import (
    "fmt"
    "io"
    "net/http"
    "net/url"
    "testing"
    "time"

    "github.com/stretchr/testify/require"

    "github.com/hashicorp/consul/api"
    "github.com/hashicorp/consul/sdk/testutil/retry"
    "github.com/hashicorp/consul/test/integration/consul-container/libs/utils"
    "github.com/hashicorp/consul/testing/deployer/sprawl/sprawltest"
    "github.com/hashicorp/consul/testing/deployer/topology"
)

// Test_Snapshot_Restore_Agentless verifies that a Consul server can continue
// to push Envoy config after restoring from a snapshot.
//
//   - This test detects the server agent freezing after a restore from
//     snapshot (https://github.com/hashicorp/consul/pull/18636).
//
//   - This bug only appeared in agentless mode.
//
// Steps:
//  1. The test spins up a one-server cluster with static-server and static-client.
//  2. A snapshot is taken and the cluster is restored from the snapshot.
//  3. A new static-server replaces the old one.
//  4. At the end, we assert the static-client's upstream is updated with the
//     new static-server.
func Test_Snapshot_Restore_Agentless(t *testing.T) {
    t.Parallel()

    staticServerSID := topology.NewServiceID("static-server", "default", "default")
    staticClientSID := topology.NewServiceID("static-client", "default", "default")

    clu := &topology.Config{
        Images: utils.TargetImages(),
        Networks: []*topology.Network{
            {Name: "dc1"},
        },
        Clusters: []*topology.Cluster{
            {
                Name: "dc1",
                Nodes: []*topology.Node{
                    {
                        Kind: topology.NodeKindServer,
                        // NOTE: uncomment the following lines to trigger the agent frozen bug
                        // Images: topology.Images{
                        //     ConsulEnterprise: "hashicorp/consul-enterprise:1.16.1-ent",
                        // },
                        Name: "dc1-server1",
                        Addresses: []*topology.Address{
                            {Network: "dc1"},
                        },
                    },
                    {
                        Kind: topology.NodeKindDataplane,
                        Name: "dc1-client1",
                        Services: []*topology.Service{
                            {
                                ID:             staticServerSID,
                                Image:          "docker.mirror.hashicorp.services/fortio/fortio",
                                Port:           8080,
                                EnvoyAdminPort: 19000,
                                CheckTCP:       "127.0.0.1:8080",
                                Command: []string{
                                    "server",
                                    "-http-port", "8080",
                                    "-redirect-port", "-disabled",
                                },
                            },
                        },
                    },
                    {
                        Kind: topology.NodeKindDataplane,
                        Name: "dc1-client2",
                        Services: []*topology.Service{
                            {
                                ID:             staticClientSID,
                                Image:          "docker.mirror.hashicorp.services/fortio/fortio",
                                Port:           8080,
                                EnvoyAdminPort: 19000,
                                CheckTCP:       "127.0.0.1:8080",
                                Command: []string{
                                    "server",
                                    "-http-port", "8080",
                                    "-redirect-port", "-disabled",
                                },
                                Upstreams: []*topology.Upstream{
                                    {
                                        ID:        staticServerSID,
                                        LocalPort: 5000,
                                    },
                                },
                            },
                        },
                    },
                    // Client3 for second static-server
                    {
                        Kind:     topology.NodeKindDataplane,
                        Name:     "dc1-client3",
                        Disabled: true,
                        Services: []*topology.Service{
                            {
                                ID:             staticServerSID,
                                Image:          "docker.mirror.hashicorp.services/fortio/fortio",
                                Port:           8080,
                                EnvoyAdminPort: 19000,
                                CheckTCP:       "127.0.0.1:8080",
                                Command: []string{
                                    "server",
                                    "-http-port", "8080",
                                    "-redirect-port", "-disabled",
                                },
                            },
                        },
                    },
                },
                Enterprise: utils.IsEnterprise(),
                InitialConfigEntries: []api.ConfigEntry{
                    &api.ProxyConfigEntry{
                        Kind: api.ProxyDefaults,
                        Name: "global",
                        Config: map[string]any{
                            "protocol": "http",
                        },
                    },
                    &api.ServiceConfigEntry{
                        Kind: api.ServiceDefaults,
                        Name: "static-server",
                    },
                    &api.ServiceIntentionsConfigEntry{
                        Kind: api.ServiceIntentions,
                        Name: "static-server",
                        Sources: []*api.SourceIntention{
                            {
                                Name:   "static-client",
                                Action: api.IntentionActionAllow,
                            },
                        },
                    },
                },
            },
        },
    }
    sp := sprawltest.Launch(t, clu)

    client, err := sp.HTTPClientForCluster("dc1")
    require.NoError(t, err)

    staticClient := sp.Topology().Clusters["dc1"].ServiceByID(
        topology.NewNodeID("dc1-client2", "default"),
        staticClientSID,
    )
    staticClientAddress := fmt.Sprintf("%s:%d", staticClient.Node.LocalAddress(), staticClient.Port)

    // The following url causes the static-client's fortio server to
    // fetch the ?url= param (the upstream static-server in our case).
    url := fmt.Sprintf("http://%s/fortio/fetch2?url=%s", staticClientAddress,
        url.QueryEscape("http://localhost:5000"),
    )
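    // For illustration only: the request ends up looking like
    //   POST http://10.0.1.3:8080/fortio/fetch2?url=http%3A%2F%2Flocalhost%3A5000
    // where the IP is whatever local address the deployer assigned to
    // dc1-client2 (the address shown is a hypothetical example).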

    // We retry the first request until we get 200 OK since it may take a while
    // for the server to be available.
    // Use a custom retry.Timer since the default one usually times out too early.
    retrySendRequest := func(isSuccess bool) {
        t.Log("static-client sending requests to static-server...")
        retry.RunWith(&retry.Timer{Timeout: 60 * time.Second, Wait: time.Millisecond * 500}, t, func(r *retry.R) {
            resp, err := client.Post(url, "text/plain", nil)
            require.NoError(r, err)
            defer resp.Body.Close()

            if isSuccess {
                require.Equal(r, http.StatusOK, resp.StatusCode)
            } else {
                require.NotEqual(r, http.StatusOK, resp.StatusCode)
            }
            body, err := io.ReadAll(resp.Body)
            require.NoError(r, err)
            fmt.Println("Body: ", string(body), resp.StatusCode)
        })
    }
    retrySendRequest(true)
    t.Log("...ok, got 200 responses")

    t.Log("Take a snapshot of the cluster and restore ...")
    err = sp.SnapshotSave("dc1")
    require.NoError(t, err)

    // Shut down the existing static-server.
    cfg := sp.Config()
    cluster := cfg.Cluster("dc1")
    cluster.Nodes[1].Disabled = true // client 1 -- static-server
    require.NoError(t, sp.Relaunch(cfg))
    retrySendRequest(false)

    // Add a new static-server.
    cfg = sp.Config()
    cluster = cfg.Cluster("dc1")
    cluster.Nodes[3].Disabled = false // client 3 -- static-server
    require.NoError(t, sp.Relaunch(cfg))

    // Ensure the static-client connects to the new static-server.
    retrySendRequest(true)
}
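For context, the sp.SnapshotSave helper used above (added to the sprawl package later in this commit) wraps Consul's HTTP snapshot API. The same save-then-restore cycle can be sketched standalone with the api client; a minimal sketch, assuming only a Consul server reachable at the default local API address:

package main

import (
    "log"

    "github.com/hashicorp/consul/api"
)

func main() {
    // Assumption: a Consul server is reachable at 127.0.0.1:8500.
    client, err := api.NewClient(api.DefaultConfig())
    if err != nil {
        log.Fatal(err)
    }

    // Save a snapshot of the current cluster state.
    snap, _, err := client.Snapshot().Save(nil)
    if err != nil {
        log.Fatal(err)
    }
    defer snap.Close()

    // Restore the snapshot we just took; on affected versions this is the
    // point after which an agentless server stopped pushing Envoy config.
    if err := client.Snapshot().Restore(nil, snap); err != nil {
        log.Fatal(err)
    }
    log.Println("snapshot saved and restored")
}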

@@ -180,6 +180,7 @@ func (s *Sprawl) assignIPAddresses() error {
 				return fmt.Errorf("unknown network %q", addr.Network)
 			}
 			addr.IPAddress = net.IPByIndex(node.Index)
+			s.logger.Info("assign addr", "node", node.Name, "addr", addr.IPAddress)
 		}
 	}
 }

@@ -504,7 +505,7 @@ func (s *Sprawl) waitForClientAntiEntropyOnce(cluster *topology.Cluster) error {
 			logger.Debug("all nodes have posted node updates, so first anti-entropy has happened", "elapsed", dur)
 			return nil
 		}
-		logger.Debug("not all client nodes have posted node updates yet", "nodes", stragglers)
+		logger.Debug("not all nodes have posted node updates yet", "nodes", stragglers)
 
 		time.Sleep(1 * time.Second)
 	}

@@ -514,10 +515,10 @@ func newGossipKey() (string, error) {
 	key := make([]byte, 16)
 	n, err := rand.Reader.Read(key)
 	if err != nil {
-		return "", fmt.Errorf("Error reading random data: %s", err)
+		return "", fmt.Errorf("error reading random data: %s", err)
 	}
 	if n != 16 {
-		return "", fmt.Errorf("Couldn't read enough entropy. Generate more entropy!")
+		return "", fmt.Errorf("couldn't read enough entropy. Generate more entropy")
 	}
 	return base64.StdEncoding.EncodeToString(key), nil
 }

@@ -235,6 +235,32 @@ func (s *Sprawl) Relaunch(
 	return nil
 }
 
+// SnapshotSave saves a snapshot of a cluster and then restores the cluster
+// from that snapshot.
+func (s *Sprawl) SnapshotSave(clusterName string) error {
+	cluster, ok := s.topology.Clusters[clusterName]
+	if !ok {
+		return fmt.Errorf("no such cluster: %s", clusterName)
+	}
+	var (
+		client = s.clients[cluster.Name]
+	)
+	snapshot := client.Snapshot()
+	snap, _, err := snapshot.Save(nil)
+	if err != nil {
+		return fmt.Errorf("error saving snapshot: %w", err)
+	}
+	s.logger.Info("snapshot saved")
+	time.Sleep(3 * time.Second)
+	defer snap.Close()
+
+	// Restore the snapshot.
+	if err := snapshot.Restore(nil, snap); err != nil {
+		return fmt.Errorf("error restoring snapshot: %w", err)
+	}
+	s.logger.Info("snapshot restored")
+	return nil
+}
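Usage mirrors the new test above; inside a sprawltest-based test, where sp is the value returned by sprawltest.Launch:

    // Take a snapshot of dc1 and restore it in place, then keep exercising
    // the mesh to prove the server is not frozen.
    require.NoError(t, sp.SnapshotSave("dc1"))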
+
 // Leader returns the cluster leader agent, or an error if no leader is
 // available.
 func (s *Sprawl) Leader(clusterName string) (*topology.Node, error) {

@@ -3,5 +3,5 @@
 
 package topology
 
-const DefaultConsulImage = "hashicorp/consul:1.15.2"
-const DefaultConsulEnterpriseImage = "hashicorp/consul-enterprise:1.15.2-ent"
+const DefaultConsulImage = "hashicorp/consul:1.16.2"
+const DefaultConsulEnterpriseImage = "hashicorp/consul-enterprise:1.16.2-ent"

@@ -515,7 +515,7 @@ func (n *Node) LocalAddress() string {
 	for _, a := range n.Addresses {
 		if a.IsLocal() {
 			if a.IPAddress == "" {
-				panic("node has no assigned local address")
+				panic("node has no assigned local address: " + n.Name)
 			}
 			return a.IPAddress
 		}

@@ -538,7 +538,7 @@ func (n *Node) LocalProxyPort() int {
 		if a.ProxyPort > 0 {
 			return a.ProxyPort
 		}
-		panic("node has no assigned local address")
+		panic("node has no assigned local address: " + n.Name)
 	}
 }
 panic("node has no local network")