diff --git a/.circleci/config.yml b/.circleci/config.yml
index e20050f33d..a189602a28 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -587,9 +587,8 @@ workflows:
             - dev-build
           filters:
             branches:
-              only:
-                - /^bug\/flaky-test-.*$/ # only run go tests on bug/flaky-test-* for now since we are fixing tests
-                - release/1-6
+              ignore:
+                - /^pull\/.*$/ # only run go tests on non forks
       - go-test-api: *go-test
       - dev-upload-s3:
           requires:
diff --git a/CHANGELOG.md b/CHANGELOG.md
index e63341f2eb..db7ea59339 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -35,6 +35,7 @@ IMPROVEMENTS:
 BUG FIXES:

 * autopilot: update to also remove failed nodes from WAN gossip pool [[GH-6028](https://github.com/hashicorp/consul/pull/6028)]
+* agent: avoid reverting any check updates that occur while a service is being added or the config is reloaded [[GH-6144](https://github.com/hashicorp/consul/issues/6144)]

 ## 1.5.2 (June 27, 2019)

diff --git a/agent/agent.go b/agent/agent.go
index a8885b9e60..f5f20cc8f2 100644
--- a/agent/agent.go
+++ b/agent/agent.go
@@ -457,7 +457,7 @@ func (a *Agent) Start() error {
 	if err := a.loadProxies(c); err != nil {
 		return err
 	}
-	if err := a.loadChecks(c); err != nil {
+	if err := a.loadChecks(c, nil); err != nil {
 		return err
 	}
 	if err := a.loadMetadata(c); err != nil {
@@ -2109,10 +2109,10 @@ func (a *Agent) addServiceInternal(service *structs.NodeService, chkTypes []*str
 	a.PauseSync()
 	defer a.ResumeSync()

-	// Take a snapshot of the current state of checks (if any), and
-	// restore them before resuming anti-entropy.
+	// Take a snapshot of the current state of checks (if any), and when adding
+	// a check that already existed carry over the state before resuming
+	// anti-entropy.
 	snap := a.snapshotCheckState()
-	defer a.restoreCheckState(snap)

 	var checks []*structs.HealthCheck

@@ -2143,6 +2143,13 @@ func (a *Agent) addServiceInternal(service *structs.NodeService, chkTypes []*str
 			check.Status = chkType.Status
 		}

+		// Restore the fields from the snapshot.
+		prev, ok := snap[check.CheckID]
+		if ok {
+			check.Output = prev.Output
+			check.Status = prev.Status
+		}
+
 		checks = append(checks, check)
 	}

@@ -3346,10 +3353,17 @@ func (a *Agent) unloadServices() error {

 // loadChecks loads check definitions and/or persisted check definitions from
 // disk and re-registers them with the local agent.
-func (a *Agent) loadChecks(conf *config.RuntimeConfig) error {
+func (a *Agent) loadChecks(conf *config.RuntimeConfig, snap map[types.CheckID]*structs.HealthCheck) error {
 	// Register the checks from config
 	for _, check := range conf.Checks {
 		health := check.HealthCheck(conf.NodeName)
+
+		// Restore the fields from the snapshot.
+		if prev, ok := snap[health.CheckID]; ok {
+			health.Output = prev.Output
+			health.Status = prev.Status
+		}
+
 		chkType := check.CheckType()
 		if err := a.addCheckLocked(health, chkType, false, check.Token, ConfigSourceLocal); err != nil {
 			return fmt.Errorf("Failed to register check '%s': %v %v", check.Name, err, check)
@@ -3406,6 +3420,12 @@
 			// services into the active pool
 			p.Check.Status = api.HealthCritical

+			// Restore the fields from the snapshot.
+			if prev, ok := snap[p.Check.CheckID]; ok {
+				p.Check.Output = prev.Output
+				p.Check.Status = prev.Status
+			}
+
 			if err := a.addCheckLocked(p.Check, p.ChkType, false, p.Token, ConfigSourceLocal); err != nil {
 				// Purge the check if it is unable to be restored.
 				a.logger.Printf("[WARN] agent: Failed to restore check %q: %s",
@@ -3634,15 +3654,6 @@ func (a *Agent) snapshotCheckState() map[types.CheckID]*structs.HealthCheck {
 	return a.State.Checks()
 }

-// restoreCheckState is used to reset the health state based on a snapshot.
-// This is done after we finish the reload to avoid any unnecessary flaps
-// in health state and potential session invalidations.
-func (a *Agent) restoreCheckState(snap map[types.CheckID]*structs.HealthCheck) {
-	for id, check := range snap {
-		a.State.UpdateCheck(id, check.Status, check.Output)
-	}
-}
-
 // loadMetadata loads node metadata fields from the agent config and
 // updates them on the local agent.
 func (a *Agent) loadMetadata(conf *config.RuntimeConfig) error {
@@ -3765,9 +3776,9 @@ func (a *Agent) ReloadConfig(newCfg *config.RuntimeConfig) error {
 	a.stateLock.Lock()
 	defer a.stateLock.Unlock()

-	// Snapshot the current state, and restore it afterwards
+	// Snapshot the current state, and use that to initialize the checks when
+	// they are recreated.
 	snap := a.snapshotCheckState()
-	defer a.restoreCheckState(snap)

 	// First unload all checks, services, and metadata. This lets us begin the reload
 	// with a clean slate.
@@ -3798,7 +3809,7 @@ func (a *Agent) ReloadConfig(newCfg *config.RuntimeConfig) error {
 	if err := a.loadProxies(newCfg); err != nil {
 		return fmt.Errorf("Failed reloading proxies: %s", err)
 	}
-	if err := a.loadChecks(newCfg); err != nil {
+	if err := a.loadChecks(newCfg, snap); err != nil {
 		return fmt.Errorf("Failed reloading checks: %s", err)
 	}
 	if err := a.loadMetadata(newCfg); err != nil {
diff --git a/agent/agent_test.go b/agent/agent_test.go
index dda56335de..9265c52ae1 100644
--- a/agent/agent_test.go
+++ b/agent/agent_test.go
@@ -500,6 +500,62 @@ func TestAgent_AddService(t *testing.T) {
 	}
 }

+func TestAgent_AddServices_AliasUpdateCheckNotReverted(t *testing.T) {
+	t.Parallel()
+	a := NewTestAgent(t, t.Name(), `
+		node_name = "node1"
+	`)
+	defer a.Shutdown()
+
+	// It's tricky to get an UpdateCheck call to be timed properly so it lands
+	// right in the middle of an addServiceInternal call so we cheat a bit and
+	// rely upon alias checks to do that work for us. We add enough services
+	// that probabilistically one of them is going to end up properly in the
+	// critical section.
+	//
+	// The first number I picked here (10) surprisingly failed every time prior
+	// to PR #6144 solving the underlying problem.
+	const numServices = 10
+
+	services := make([]*structs.ServiceDefinition, numServices)
+	checkIDs := make([]types.CheckID, numServices)
+	for i := 0; i < numServices; i++ {
+		name := fmt.Sprintf("web-%d", i)
+
+		services[i] = &structs.ServiceDefinition{
+			ID:   name,
+			Name: name,
+			Port: 8080 + i,
+			Checks: []*structs.CheckType{
+				&structs.CheckType{
+					Name:         "alias-for-fake-service",
+					AliasService: "fake",
+				},
+			},
+		}
+
+		checkIDs[i] = types.CheckID("service:" + name)
+	}
+
+	// Add all of the services quickly as you might do from config file snippets.
+	for _, service := range services {
+		ns := service.NodeService()
+
+		chkTypes, err := service.CheckTypes()
+		require.NoError(t, err)
+
+		require.NoError(t, a.AddService(ns, chkTypes, false, service.Token, ConfigSourceLocal))
+	}
+
+	retry.Run(t, func(r *retry.R) {
+		gotChecks := a.State.Checks()
+		for id, check := range gotChecks {
+			require.Equal(r, "passing", check.Status, "check %q is wrong", id)
+			require.Equal(r, "No checks found.", check.Output, "check %q is wrong", id)
+		}
+	})
+}
+
 func TestAgent_AddServiceNoExec(t *testing.T) {
 	t.Parallel()
 	a := NewTestAgent(t, t.Name(), `
@@ -2965,14 +3021,11 @@ func TestAgent_checkStateSnapshot(t *testing.T) {
 		t.Fatalf("err: %s", err)
 	}

-	// Reload the checks
-	if err := a.loadChecks(a.Config); err != nil {
+	// Reload the checks and restore the snapshot.
+	if err := a.loadChecks(a.Config, snap); err != nil {
 		t.Fatalf("err: %s", err)
 	}

-	// Restore the state
-	a.restoreCheckState(snap)
-
 	// Search for the check
 	out, ok := a.State.Checks()[check1.CheckID]
 	if !ok {
@@ -3010,7 +3063,7 @@ func TestAgent_loadChecks_checkFails(t *testing.T) {
 	}

 	// Try loading the checks from the persisted files
-	if err := a.loadChecks(a.Config); err != nil {
+	if err := a.loadChecks(a.Config, nil); err != nil {
 		t.Fatalf("err: %s", err)
 	}

diff --git a/agent/consul/leader_test.go b/agent/consul/leader_test.go
index 7372bef730..e394722885 100644
--- a/agent/consul/leader_test.go
+++ b/agent/consul/leader_test.go
@@ -68,13 +68,15 @@ func TestLeader_RegisterMember(t *testing.T) {
 	}

 	// Server should be registered
-	_, node, err := state.GetNode(s1.config.NodeName)
-	if err != nil {
-		t.Fatalf("err: %v", err)
-	}
-	if node == nil {
-		t.Fatalf("server not registered")
-	}
+	retry.Run(t, func(r *retry.R) {
+		_, node, err := state.GetNode(s1.config.NodeName)
+		if err != nil {
+			r.Fatalf("err: %v", err)
+		}
+		if node == nil {
+			r.Fatalf("server not registered")
+		}
+	})

 	// Service should be registered
 	_, services, err := state.NodeServices(nil, s1.config.NodeName)
diff --git a/agent/consul/server_test.go b/agent/consul/server_test.go
index c3c719481d..32eadb232c 100644
--- a/agent/consul/server_test.go
+++ b/agent/consul/server_test.go
@@ -742,53 +742,41 @@ func TestServer_AvoidReBootstrap(t *testing.T) {

 func TestServer_Expect_NonVoters(t *testing.T) {
 	t.Parallel()
-	dir1, s1 := testServerDCExpectNonVoter(t, "dc1", 3)
+	dir1, s1 := testServerDCExpectNonVoter(t, "dc1", 2)
 	defer os.RemoveAll(dir1)
 	defer s1.Shutdown()

-	dir2, s2 := testServerDCExpectNonVoter(t, "dc1", 3)
+	dir2, s2 := testServerDCExpect(t, "dc1", 2)
 	defer os.RemoveAll(dir2)
 	defer s2.Shutdown()

-	dir3, s3 := testServerDCExpect(t, "dc1", 3)
+	dir3, s3 := testServerDCExpect(t, "dc1", 2)
 	defer os.RemoveAll(dir3)
 	defer s3.Shutdown()

-	dir4, s4 := testServerDCExpect(t, "dc1", 3)
-	defer os.RemoveAll(dir4)
-	defer s4.Shutdown()
-
-	dir5, s5 := testServerDCExpect(t, "dc1", 3)
-	defer os.RemoveAll(dir5)
-	defer s5.Shutdown()
-
-	// Join the first three servers.
+	// Join the first two servers.
 	joinLAN(t, s2, s1)
-	joinLAN(t, s3, s1)

 	// Should have no peers yet since the bootstrap didn't occur.
 	retry.Run(t, func(r *retry.R) {
 		r.Check(wantPeers(s1, 0))
 		r.Check(wantPeers(s2, 0))
-		r.Check(wantPeers(s3, 0))
 	})

-	// Join the fourth node.
-	joinLAN(t, s4, s1)
-	joinLAN(t, s5, s1)
+	// Join the third node.
+	joinLAN(t, s3, s1)

 	// Now we have three servers so we should bootstrap.
 	retry.Run(t, func(r *retry.R) {
-		r.Check(wantPeers(s1, 3))
-		r.Check(wantPeers(s2, 3))
-		r.Check(wantPeers(s3, 3))
-		r.Check(wantPeers(s4, 3))
+		r.Check(wantPeers(s1, 2))
+		r.Check(wantPeers(s2, 2))
+		r.Check(wantPeers(s3, 2))
 	})

 	// Make sure a leader is elected
 	testrpc.WaitForLeader(t, s1.RPC, "dc1")
 	retry.Run(t, func(r *retry.R) {
-		r.Check(wantRaft([]*Server{s1, s2, s3, s4, s5}))
+		r.Check(wantRaft([]*Server{s1, s2, s3}))
 	})
 }

diff --git a/api/txn_test.go b/api/txn_test.go
index e2106d7475..b940546d19 100644
--- a/api/txn_test.go
+++ b/api/txn_test.go
@@ -1,19 +1,18 @@
 package api

 import (
+	"github.com/hashicorp/consul/sdk/testutil/retry"
 	"strings"
 	"testing"
 	"time"

 	"github.com/hashicorp/go-uuid"
-	"github.com/pascaldekloe/goe/verify"
 	"github.com/stretchr/testify/require"
 )

 func TestAPI_ClientTxn(t *testing.T) {
 	t.Parallel()
-	require := require.New(t)

 	c, s := makeClient(t)
 	defer s.Stop()

@@ -24,7 +23,7 @@ func TestAPI_ClientTxn(t *testing.T) {

 	// Set up a test service and health check.
 	nodeID, err := uuid.GenerateUUID()
-	require.NoError(err)
+	require.NoError(t, err)

 	catalog := c.Catalog()
 	reg := &CatalogRegistration{
@@ -59,11 +58,11 @@ func TestAPI_ClientTxn(t *testing.T) {
 		},
 	}
 	_, err = catalog.Register(reg, nil)
-	require.NoError(err)
+	require.NoError(t, err)

 	node, _, err := catalog.Node("foo", nil)
-	require.NoError(err)
-	require.Equal(nodeID, node.Node.ID)
+	require.NoError(t, err)
+	require.Equal(t, nodeID, node.Node.ID)

 	// Make a session.
 	id, _, err := session.CreateNoChecks(nil, nil)
@@ -218,58 +217,60 @@ func TestAPI_ClientTxn(t *testing.T) {
 			},
 		},
 	}
-	verify.Values(t, "", ret.Results, expected)
+	require.Equal(t, ret.Results, expected)

-	// Run a read-only transaction.
-	ops = TxnOps{
-		&TxnOp{
-			KV: &KVTxnOp{
-				Verb: KVGet,
-				Key:  key,
-			},
-		},
-		&TxnOp{
-			Node: &NodeTxnOp{
-				Verb: NodeGet,
-				Node: Node{ID: s.Config.NodeID, Node: s.Config.NodeName},
-			},
-		},
-	}
-	ok, ret, _, err = txn.Txn(ops, nil)
-	if err != nil {
-		t.Fatalf("err: %v", err)
-	} else if !ok {
-		t.Fatalf("transaction failure")
-	}
-
-	expected = TxnResults{
-		&TxnResult{
-			KV: &KVPair{
-				Key:         key,
-				Session:     id,
-				Value:       []byte("test"),
-				LockIndex:   1,
-				CreateIndex: ret.Results[0].KV.CreateIndex,
-				ModifyIndex: ret.Results[0].KV.ModifyIndex,
-			},
-		},
-		&TxnResult{
-			Node: &Node{
-				ID:         s.Config.NodeID,
-				Node:       s.Config.NodeName,
-				Address:    "127.0.0.1",
-				Datacenter: "dc1",
-				TaggedAddresses: map[string]string{
-					"lan": s.Config.Bind,
-					"wan": s.Config.Bind,
-				},
-				Meta:        map[string]string{"consul-network-segment": ""},
-				CreateIndex: ret.Results[1].Node.CreateIndex,
-				ModifyIndex: ret.Results[1].Node.ModifyIndex,
-			},
-		},
-	}
-	verify.Values(t, "", ret.Results, expected)
+	retry.Run(t, func(r *retry.R) {
+		// Run a read-only transaction.
+		ops = TxnOps{
+			&TxnOp{
+				KV: &KVTxnOp{
+					Verb: KVGet,
+					Key:  key,
+				},
+			},
+			&TxnOp{
+				Node: &NodeTxnOp{
+					Verb: NodeGet,
+					Node: Node{ID: s.Config.NodeID, Node: s.Config.NodeName},
+				},
+			},
+		}
+		ok, ret, _, err = txn.Txn(ops, nil)
+		if err != nil {
+			r.Fatalf("err: %v", err)
+		} else if !ok {
+			r.Fatalf("transaction failure")
+		}
+
+		expected = TxnResults{
+			&TxnResult{
+				KV: &KVPair{
+					Key:         key,
+					Session:     id,
+					Value:       []byte("test"),
+					LockIndex:   1,
+					CreateIndex: ret.Results[0].KV.CreateIndex,
+					ModifyIndex: ret.Results[0].KV.ModifyIndex,
+				},
+			},
+			&TxnResult{
+				Node: &Node{
+					ID:         s.Config.NodeID,
+					Node:       s.Config.NodeName,
+					Address:    "127.0.0.1",
+					Datacenter: "dc1",
+					TaggedAddresses: map[string]string{
+						"lan": s.Config.Bind,
+						"wan": s.Config.Bind,
+					},
+					Meta:        map[string]string{"consul-network-segment": ""},
+					CreateIndex: ret.Results[1].Node.CreateIndex,
+					ModifyIndex: ret.Results[1].Node.ModifyIndex,
+				},
+			},
+		}
+		require.Equal(r, ret.Results, expected)
+	})

 	// Sanity check using the regular GET API.
 	kv := c.KV()
diff --git a/api/watch/funcs_test.go b/api/watch/funcs_test.go
index be70f591c2..5e7091e23b 100644
--- a/api/watch/funcs_test.go
+++ b/api/watch/funcs_test.go
@@ -715,7 +715,7 @@ func TestConnectRootsWatch(t *testing.T) {
 			return // ignore
 		}
 		v, ok := raw.(*api.CARootList)
-		if !ok || v == nil {
+		if !ok || v == nil || len(v.Roots) == 0 {
 			return // ignore
 		}
 		// Only 1 CA is the bootstrapped state (i.e. first response). Ignore this
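The agent changes above apply one pattern in several places: instead of restoring a snapshot of check state after the fact with a deferred update (which can clobber check updates that land while a service is being added or the config is reloaded), the saved status and output are copied onto each check as it is re-registered. Below is a minimal, hedged sketch of that pattern in Go; the `check` struct and `reRegister` function are simplified stand-ins for illustration only, not Consul's actual structs.HealthCheck or agent internals.

package main

import "fmt"

// check is a simplified stand-in for a health check's mutable state.
type check struct {
	ID     string
	Status string
	Output string
}

// reRegister rebuilds the check list from fresh definitions, seeding each
// check from the snapshot taken before the reload. Checks that existed
// before keep their observed state; brand-new checks keep their default.
func reRegister(defs []check, snap map[string]check) []check {
	out := make([]check, 0, len(defs))
	for _, c := range defs {
		if prev, ok := snap[c.ID]; ok {
			c.Status = prev.Status
			c.Output = prev.Output
		}
		out = append(out, c)
	}
	return out
}

func main() {
	snap := map[string]check{
		"service:web": {ID: "service:web", Status: "passing", Output: "HTTP 200 OK"},
	}
	defs := []check{
		{ID: "service:web", Status: "critical"}, // existed before: carries over "passing"
		{ID: "service:db", Status: "critical"},  // brand new: stays at the default
	}
	for _, c := range reRegister(defs, snap) {
		fmt.Printf("%s -> %s (%q)\n", c.ID, c.Status, c.Output)
	}
}

Because the snapshot is applied at registration time rather than after anti-entropy resumes, any update that arrives in between simply wins, which is the behavior the new TestAgent_AddServices_AliasUpdateCheckNotReverted test exercises.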