Enable running autopilot state updates on all servers (#12617)

* Fixes a lint warning about t.Errorf not supporting %w

* Enable running autopilot on all servers

Non-leader servers only update their autopilot state; they do not attempt any modifications.

* Fix the RPC conn limiting tests

Technically they were relying on racy behavior before. Now they should be reliable.
This commit is contained in:
Matt Keeler 2022-04-07 10:48:48 -04:00 committed by GitHub
parent b3c7f44d32
commit a553982506
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 56 additions and 44 deletions

9
.changelog/12617.txt Normal file
View File

@ -0,0 +1,9 @@
```release-note:improvement
autopilot: Autopilot state is now tracked on Raft followers in addition to the leader.
Stale queries may be used to query a non-leader's state.
```
```release-note:improvement
autopilot: The `autopilot.healthy` and `autopilot.failure_tolerance` metrics are now
regularly emitted by all servers.
```

View File

@ -9,10 +9,10 @@ import (
"github.com/hashicorp/raft"
autopilot "github.com/hashicorp/raft-autopilot"
"github.com/hashicorp/serf/serf"
"math"
"github.com/hashicorp/consul/agent/metadata"
"github.com/hashicorp/consul/agent/structs"
"github.com/hashicorp/consul/logging"
"github.com/hashicorp/consul/types"
)
@ -33,7 +33,7 @@ type AutopilotDelegate struct {
}
func (d *AutopilotDelegate) AutopilotConfig() *autopilot.Config {
return d.server.getOrCreateAutopilotConfig().ToAutopilotLibraryConfig()
return d.server.getAutopilotConfigOrDefault().ToAutopilotLibraryConfig()
}
func (d *AutopilotDelegate) KnownServers() map[raft.ServerID]*autopilot.Server {
@ -45,24 +45,12 @@ func (d *AutopilotDelegate) FetchServerStats(ctx context.Context, servers map[ra
}
func (d *AutopilotDelegate) NotifyState(state *autopilot.State) {
// emit metrics if we are the leader regarding overall healthiness and the failure tolerance
if d.server.raft.State() == raft.Leader {
metrics.SetGauge([]string{"autopilot", "failure_tolerance"}, float32(state.FailureTolerance))
if state.Healthy {
metrics.SetGauge([]string{"autopilot", "healthy"}, 1)
} else {
metrics.SetGauge([]string{"autopilot", "healthy"}, 0)
}
} else {
// if we are not a leader, emit NaN per
// https://www.consul.io/docs/agent/telemetry#autopilot
metrics.SetGauge([]string{"autopilot", "healthy"}, float32(math.NaN()))
// also emit NaN for failure tolerance to be backwards compatible
metrics.SetGauge([]string{"autopilot", "failure_tolerance"}, float32(math.NaN()))
}
}
func (d *AutopilotDelegate) RemoveFailedServer(srv *autopilot.Server) {
@ -84,10 +72,8 @@ func (s *Server) initAutopilot(config *Config) {
autopilot.WithReconcileInterval(config.AutopilotInterval),
autopilot.WithUpdateInterval(config.ServerHealthInterval),
autopilot.WithPromoter(s.autopilotPromoter()),
autopilot.WithReconciliationDisabled(),
)
metrics.SetGauge([]string{"autopilot", "healthy"}, float32(math.NaN()))
metrics.SetGauge([]string{"autopilot", "failure_tolerance"}, float32(math.NaN()))
}
func (s *Server) autopilotServers() map[raft.ServerID]*autopilot.Server {
@ -154,3 +140,22 @@ func (s *Server) autopilotServerFromMetadata(srv *metadata.Server) (*autopilot.S
return server, nil
}
// getAutopilotConfigOrDefault returns the autopilot configuration read from
// the server's FSM state store. If the store holds no configuration yet, the
// one from the server's local configuration is returned instead. If reading
// the store fails, the error is logged and nil is returned.
func (s *Server) getAutopilotConfigOrDefault() *structs.AutopilotConfig {
	logger := s.loggers.Named(logging.Autopilot)

	_, stored, err := s.fsm.State().AutopilotConfig()
	switch {
	case err != nil:
		logger.Error("failed to get config", "error", err)
		return nil
	case stored != nil:
		return stored
	default:
		// Autopilot may begin running before any leader has been elected,
		// and therefore before an autopilot configuration has been created.
		// Until one exists, use the locally configured settings.
		return s.config.AutopilotConfig
	}
}

View File

@ -297,7 +297,7 @@ func (s *Server) establishLeadership(ctx context.Context) error {
}
s.getOrCreateAutopilotConfig()
s.autopilot.Start(ctx)
s.autopilot.EnableReconciliation()
s.startConfigReplication(ctx)
@ -350,9 +350,7 @@ func (s *Server) revokeLeadership() {
s.resetConsistentReadReady()
// Stop returns a chan and we want to block until it is closed
// which indicates that autopilot is actually stopped.
<-s.autopilot.Stop()
s.autopilot.DisableReconciliation()
}
// initializeACLs is used to setup the ACLs if we are the leader

View File

@ -2,6 +2,7 @@ package consul
import (
"fmt"
autopilot "github.com/hashicorp/raft-autopilot"
"github.com/hashicorp/serf/serf"
@ -75,10 +76,6 @@ func (op *Operator) AutopilotSetConfiguration(args *structs.AutopilotSetConfigRe
// ServerHealth is used to get the current health of the servers.
func (op *Operator) ServerHealth(args *structs.DCSpecificRequest, reply *structs.AutopilotHealthReply) error {
// This must be sent to the leader, so we fix the args since we are
// re-using a structure where we don't support all the options.
args.RequireConsistent = true
args.AllowStale = false
if done, err := op.srv.ForwardRPC("Operator.ServerHealth", args, reply); done {
return err
}
@ -143,10 +140,6 @@ func (op *Operator) ServerHealth(args *structs.DCSpecificRequest, reply *structs
}
func (op *Operator) AutopilotState(args *structs.DCSpecificRequest, reply *autopilot.State) error {
// This must be sent to the leader, so we fix the args since we are
// re-using a structure where we don't support all the options.
args.RequireConsistent = true
args.AllowStale = false
if done, err := op.srv.ForwardRPC("Operator.AutopilotState", args, reply); done {
return err
}

View File

@ -817,7 +817,8 @@ func TestRPC_RPCMaxConnsPerClient(t *testing.T) {
tc := tc
t.Run(tc.name, func(t *testing.T) {
dir1, s1 := testServerWithConfig(t, func(c *Config) {
c.RPCMaxConnsPerClient = 2
// we have to set this to 3 because autopilot is going to keep a connection open
c.RPCMaxConnsPerClient = 3
if tc.tlsEnabled {
c.TLSConfig.InternalRPC.CAFile = "../../test/hostname/CertAuth.crt"
c.TLSConfig.InternalRPC.CertFile = "../../test/hostname/Alice.crt"
@ -831,6 +832,8 @@ func TestRPC_RPCMaxConnsPerClient(t *testing.T) {
defer os.RemoveAll(dir1)
defer s1.Shutdown()
waitForLeaderEstablishment(t, s1)
// Connect to the server with bare TCP
conn1 := connectClient(t, s1, tc.magicByte, tc.tlsEnabled, true, "conn1")
defer conn1.Close()
@ -847,7 +850,7 @@ func TestRPC_RPCMaxConnsPerClient(t *testing.T) {
addr := conn1.RemoteAddr()
conn1.Close()
retry.Run(t, func(r *retry.R) {
if n := s1.rpcConnLimiter.NumOpen(addr); n >= 2 {
if n := s1.rpcConnLimiter.NumOpen(addr); n >= 3 {
r.Fatal("waiting for open conns to drop")
}
})
@ -1736,7 +1739,7 @@ func rpcBlockingQueryTestHarness(
return
case err := <-errCh:
if err != nil {
t.Errorf("[%d] unexpected error: %w", i, err)
t.Errorf("[%d] unexpected error: %v", i, err)
return
}
}

View File

@ -674,6 +674,10 @@ func NewServer(config *Config, flat Deps, publicGRPCServer *grpc.Server) (*Serve
go s.listen(listener)
}
// start autopilot - this must happen after the RPC listeners get set up
// or else it may block
s.autopilot.Start(&lib.StopChannelContext{StopCh: s.shutdownCh})
// Start the metrics handlers.
go s.updateMetrics()

View File

@ -250,8 +250,8 @@ func TestHTTPHandlers_AgentMetrics_ConsulAutopilot_Prometheus(t *testing.T) {
respRec := httptest.NewRecorder()
recordPromMetrics(t, a, respRec)
assertMetricExistsWithValue(t, respRec, "agent_2_autopilot_healthy", "NaN")
assertMetricExistsWithValue(t, respRec, "agent_2_autopilot_failure_tolerance", "NaN")
assertMetricExistsWithValue(t, respRec, "agent_2_autopilot_healthy", "1")
assertMetricExistsWithValue(t, respRec, "agent_2_autopilot_failure_tolerance", "0")
})
}

2
go.mod
View File

@ -54,7 +54,7 @@ require (
github.com/hashicorp/hil v0.0.0-20200423225030-a18a1cd20038
github.com/hashicorp/memberlist v0.3.1
github.com/hashicorp/raft v1.3.6
github.com/hashicorp/raft-autopilot v0.1.5
github.com/hashicorp/raft-autopilot v0.1.6
github.com/hashicorp/raft-boltdb v0.0.0-20211202195631-7d34b9fb3f42 // indirect
github.com/hashicorp/raft-boltdb/v2 v2.2.2
github.com/hashicorp/serf v0.9.7

4
go.sum
View File

@ -363,8 +363,8 @@ github.com/hashicorp/raft v1.1.1/go.mod h1:vPAJM8Asw6u8LxC3eJCUZmRP/E4QmUGE1R7g7
github.com/hashicorp/raft v1.2.0/go.mod h1:vPAJM8Asw6u8LxC3eJCUZmRP/E4QmUGE1R7g7k8sG/8=
github.com/hashicorp/raft v1.3.6 h1:v5xW5KzByoerQlN/o31VJrFNiozgzGyDoMgDJgXpsto=
github.com/hashicorp/raft v1.3.6/go.mod h1:4Ak7FSPnuvmb0GV6vgIAJ4vYT4bek9bb6Q+7HVbyzqM=
github.com/hashicorp/raft-autopilot v0.1.5 h1:onEfMH5uHVdXQqtas36zXUHEZxLdsJVu/nXHLcLdL1I=
github.com/hashicorp/raft-autopilot v0.1.5/go.mod h1:Af4jZBwaNOI+tXfIqIdbcAnh/UyyqIMj/pOISIfhArw=
github.com/hashicorp/raft-autopilot v0.1.6 h1:C1q3RNF2FfXNZfHWbvVAu0QixaQK8K5pX4O5lh+9z4I=
github.com/hashicorp/raft-autopilot v0.1.6/go.mod h1:Af4jZBwaNOI+tXfIqIdbcAnh/UyyqIMj/pOISIfhArw=
github.com/hashicorp/raft-boltdb v0.0.0-20171010151810-6e5ba93211ea/go.mod h1:pNv7Wc3ycL6F5oOWn+tPGo2gWD4a5X+yp/ntwdKLjRk=
github.com/hashicorp/raft-boltdb v0.0.0-20210409134258-03c10cc3d4ea/go.mod h1:qRd6nFJYYS6Iqnc/8HcUmko2/2Gw8qTFEmxDLii6W5I=
github.com/hashicorp/raft-boltdb v0.0.0-20211202195631-7d34b9fb3f42 h1:Ye8SofeDHJzu9xvvaMmpMkqHELWW7rTcXwdUR0CWW48=

View File

@ -94,7 +94,7 @@ These are some metrics emitted that can help you understand the health of your c
| Metric Name | Description | Unit | Type |
| :------------------------- | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | :----------- | :---- |
| `consul.autopilot.healthy` | Tracks the overall health of the local server cluster. If all servers are considered healthy by Autopilot, this will be set to 1. If any are unhealthy, this will be 0. All non-leader servers will report `NaN`. | health state | gauge |
| `consul.autopilot.healthy` | Tracks the overall health of the local server cluster. If all servers are considered healthy by Autopilot, this will be set to 1. If any are unhealthy, this will be 0. | health state | gauge |
**Why it's important:** Autopilot can expose the overall health of your cluster with a simple boolean.
@ -592,8 +592,8 @@ These metrics give insight into the health of the cluster as a whole.
| `consul.serf.member.left` | Increments when an agent leaves the cluster. | leaves / interval | counter |
| `consul.serf.events` | Increments when an agent processes an [event](/commands/event). Consul uses events internally so there may be additional events showing in telemetry. There are also per-event counters emitted as `consul.serf.events.`. | events / interval | counter |
| `consul.serf.msgs.sent` | This metric is a sample of the number of bytes of messages broadcast to the cluster. In a given time interval, the sum of this metric is the total number of bytes sent and the count is the number of messages sent. | message bytes / interval | counter |
| `consul.autopilot.failure_tolerance` | Tracks the number of voting servers that the cluster can lose while continuing to function. | servers | gauge |
| `consul.autopilot.healthy` | Tracks the overall health of the local server cluster. If all servers are considered healthy by Autopilot, this will be set to 1. If any are unhealthy, this will be 0. All non-leader servers will report `NaN`. | boolean | gauge |
| `consul.autopilot.failure_tolerance` | Tracks the number of voting servers that the cluster can lose while continuing to function. | servers   | gauge |
| `consul.autopilot.healthy` | Tracks the overall health of the local server cluster. If all servers are considered healthy by Autopilot, this will be set to 1. If any are unhealthy, this will be 0. | boolean   | gauge |
| `consul.session_ttl.active` | Tracks the active number of sessions being tracked. | sessions | gauge |
| `consul.catalog.service.query.` | Increments for each catalog query for the given service. | queries | counter |
| `consul.catalog.service.query-tag..` | Increments for each catalog query for the given service with the given tag. | queries | counter |