diff --git a/.changelog/12617.txt b/.changelog/12617.txt new file mode 100644 index 0000000000..25ae7f9ecf --- /dev/null +++ b/.changelog/12617.txt @@ -0,0 +1,9 @@ +```release-note:improvement +autopilot: Autopilot state is now tracked on Raft followers in addition to the leader. +Stale queries may be used to query for a non-leader's state. +``` + +```release-note:improvement +autopilot: The `autopilot.healthy` and `autopilot.failure_tolerance` metrics are now +regularly emitted by all servers. +``` diff --git a/agent/consul/autopilot.go b/agent/consul/autopilot.go index 8d17e49485..27471b533d 100644 --- a/agent/consul/autopilot.go +++ b/agent/consul/autopilot.go @@ -9,10 +9,10 @@ import ( "github.com/hashicorp/raft" autopilot "github.com/hashicorp/raft-autopilot" "github.com/hashicorp/serf/serf" - "math" "github.com/hashicorp/consul/agent/metadata" "github.com/hashicorp/consul/agent/structs" + "github.com/hashicorp/consul/logging" "github.com/hashicorp/consul/types" ) @@ -33,7 +33,7 @@ type AutopilotDelegate struct { } func (d *AutopilotDelegate) AutopilotConfig() *autopilot.Config { - return d.server.getOrCreateAutopilotConfig().ToAutopilotLibraryConfig() + return d.server.getAutopilotConfigOrDefault().ToAutopilotLibraryConfig() } func (d *AutopilotDelegate) KnownServers() map[raft.ServerID]*autopilot.Server { @@ -45,23 +45,11 @@ func (d *AutopilotDelegate) FetchServerStats(ctx context.Context, servers map[ra } func (d *AutopilotDelegate) NotifyState(state *autopilot.State) { - // emit metrics if we are the leader regarding overall healthiness and the failure tolerance - if d.server.raft.State() == raft.Leader { - metrics.SetGauge([]string{"autopilot", "failure_tolerance"}, float32(state.FailureTolerance)) - if state.Healthy { - metrics.SetGauge([]string{"autopilot", "healthy"}, 1) - } else { - metrics.SetGauge([]string{"autopilot", "healthy"}, 0) - } + metrics.SetGauge([]string{"autopilot", "failure_tolerance"}, float32(state.FailureTolerance)) + if 
state.Healthy { + metrics.SetGauge([]string{"autopilot", "healthy"}, 1) } else { - - // if we are not a leader, emit NaN per - // https://www.consul.io/docs/agent/telemetry#autopilot - metrics.SetGauge([]string{"autopilot", "healthy"}, float32(math.NaN())) - - // also emit NaN for failure tolerance to be backwards compatible - metrics.SetGauge([]string{"autopilot", "failure_tolerance"}, float32(math.NaN())) - + metrics.SetGauge([]string{"autopilot", "healthy"}, 0) } } @@ -84,10 +72,8 @@ func (s *Server) initAutopilot(config *Config) { autopilot.WithReconcileInterval(config.AutopilotInterval), autopilot.WithUpdateInterval(config.ServerHealthInterval), autopilot.WithPromoter(s.autopilotPromoter()), + autopilot.WithReconciliationDisabled(), ) - - metrics.SetGauge([]string{"autopilot", "healthy"}, float32(math.NaN())) - metrics.SetGauge([]string{"autopilot", "failure_tolerance"}, float32(math.NaN())) } func (s *Server) autopilotServers() map[raft.ServerID]*autopilot.Server { @@ -154,3 +140,22 @@ func (s *Server) autopilotServerFromMetadata(srv *metadata.Server) (*autopilot.S return server, nil } + +func (s *Server) getAutopilotConfigOrDefault() *structs.AutopilotConfig { + logger := s.loggers.Named(logging.Autopilot) + state := s.fsm.State() + _, config, err := state.AutopilotConfig() + if err != nil { + logger.Error("failed to get config", "error", err) + return nil + } + + if config != nil { + return config + } + + // autopilot may start running prior to there ever being a leader + // and having an autopilot configuration created. In that case + // use the one from the local configuration for now. 
+ return s.config.AutopilotConfig +} diff --git a/agent/consul/leader.go b/agent/consul/leader.go index f40faed42a..456fbec1ea 100644 --- a/agent/consul/leader.go +++ b/agent/consul/leader.go @@ -297,7 +297,7 @@ func (s *Server) establishLeadership(ctx context.Context) error { } s.getOrCreateAutopilotConfig() - s.autopilot.Start(ctx) + s.autopilot.EnableReconciliation() s.startConfigReplication(ctx) @@ -350,9 +350,7 @@ func (s *Server) revokeLeadership() { s.resetConsistentReadReady() - // Stop returns a chan and we want to block until it is closed - // which indicates that autopilot is actually stopped. - <-s.autopilot.Stop() + s.autopilot.DisableReconciliation() } // initializeACLs is used to setup the ACLs if we are the leader diff --git a/agent/consul/operator_autopilot_endpoint.go b/agent/consul/operator_autopilot_endpoint.go index 0b3aee53f2..babbb79561 100644 --- a/agent/consul/operator_autopilot_endpoint.go +++ b/agent/consul/operator_autopilot_endpoint.go @@ -2,6 +2,7 @@ package consul import ( "fmt" + autopilot "github.com/hashicorp/raft-autopilot" "github.com/hashicorp/serf/serf" @@ -75,10 +76,6 @@ func (op *Operator) AutopilotSetConfiguration(args *structs.AutopilotSetConfigRe // ServerHealth is used to get the current health of the servers. func (op *Operator) ServerHealth(args *structs.DCSpecificRequest, reply *structs.AutopilotHealthReply) error { - // This must be sent to the leader, so we fix the args since we are - // re-using a structure where we don't support all the options. 
- args.RequireConsistent = true - args.AllowStale = false if done, err := op.srv.ForwardRPC("Operator.ServerHealth", args, reply); done { return err } @@ -143,10 +140,6 @@ func (op *Operator) ServerHealth(args *structs.DCSpecificRequest, reply *structs } func (op *Operator) AutopilotState(args *structs.DCSpecificRequest, reply *autopilot.State) error { - // This must be sent to the leader, so we fix the args since we are - // re-using a structure where we don't support all the options. - args.RequireConsistent = true - args.AllowStale = false if done, err := op.srv.ForwardRPC("Operator.AutopilotState", args, reply); done { return err } diff --git a/agent/consul/rpc_test.go b/agent/consul/rpc_test.go index 0e236eed59..5e1323a1e2 100644 --- a/agent/consul/rpc_test.go +++ b/agent/consul/rpc_test.go @@ -817,7 +817,8 @@ func TestRPC_RPCMaxConnsPerClient(t *testing.T) { tc := tc t.Run(tc.name, func(t *testing.T) { dir1, s1 := testServerWithConfig(t, func(c *Config) { - c.RPCMaxConnsPerClient = 2 + // we have to set this to 3 because autopilot is going to keep a connection open + c.RPCMaxConnsPerClient = 3 if tc.tlsEnabled { c.TLSConfig.InternalRPC.CAFile = "../../test/hostname/CertAuth.crt" c.TLSConfig.InternalRPC.CertFile = "../../test/hostname/Alice.crt" @@ -831,6 +832,8 @@ func TestRPC_RPCMaxConnsPerClient(t *testing.T) { defer os.RemoveAll(dir1) defer s1.Shutdown() + waitForLeaderEstablishment(t, s1) + // Connect to the server with bare TCP conn1 := connectClient(t, s1, tc.magicByte, tc.tlsEnabled, true, "conn1") defer conn1.Close() @@ -847,7 +850,7 @@ func TestRPC_RPCMaxConnsPerClient(t *testing.T) { addr := conn1.RemoteAddr() conn1.Close() retry.Run(t, func(r *retry.R) { - if n := s1.rpcConnLimiter.NumOpen(addr); n >= 2 { + if n := s1.rpcConnLimiter.NumOpen(addr); n >= 3 { r.Fatal("waiting for open conns to drop") } }) @@ -1736,7 +1739,7 @@ func rpcBlockingQueryTestHarness( return case err := <-errCh: if err != nil { - t.Errorf("[%d] unexpected error: %w", i, err) 
+ t.Errorf("[%d] unexpected error: %v", i, err) return } } diff --git a/agent/consul/server.go b/agent/consul/server.go index 3ec3d61dde..a3effba97a 100644 --- a/agent/consul/server.go +++ b/agent/consul/server.go @@ -674,6 +674,10 @@ func NewServer(config *Config, flat Deps, publicGRPCServer *grpc.Server) (*Serve go s.listen(listener) } + // start autopilot - this must happen after the RPC listeners get setup + // or else it may block + s.autopilot.Start(&lib.StopChannelContext{StopCh: s.shutdownCh}) + // Start the metrics handlers. go s.updateMetrics() diff --git a/agent/metrics_test.go b/agent/metrics_test.go index 2aedc01807..448694e3e9 100644 --- a/agent/metrics_test.go +++ b/agent/metrics_test.go @@ -250,8 +250,8 @@ func TestHTTPHandlers_AgentMetrics_ConsulAutopilot_Prometheus(t *testing.T) { respRec := httptest.NewRecorder() recordPromMetrics(t, a, respRec) - assertMetricExistsWithValue(t, respRec, "agent_2_autopilot_healthy", "NaN") - assertMetricExistsWithValue(t, respRec, "agent_2_autopilot_failure_tolerance", "NaN") + assertMetricExistsWithValue(t, respRec, "agent_2_autopilot_healthy", "1") + assertMetricExistsWithValue(t, respRec, "agent_2_autopilot_failure_tolerance", "0") }) } diff --git a/go.mod b/go.mod index e99a098ba2..47b494c461 100644 --- a/go.mod +++ b/go.mod @@ -54,7 +54,7 @@ require ( github.com/hashicorp/hil v0.0.0-20200423225030-a18a1cd20038 github.com/hashicorp/memberlist v0.3.1 github.com/hashicorp/raft v1.3.6 - github.com/hashicorp/raft-autopilot v0.1.5 + github.com/hashicorp/raft-autopilot v0.1.6 github.com/hashicorp/raft-boltdb v0.0.0-20211202195631-7d34b9fb3f42 // indirect github.com/hashicorp/raft-boltdb/v2 v2.2.2 github.com/hashicorp/serf v0.9.7 diff --git a/go.sum b/go.sum index bf61a6bf04..fb093ee1ec 100644 --- a/go.sum +++ b/go.sum @@ -363,8 +363,8 @@ github.com/hashicorp/raft v1.1.1/go.mod h1:vPAJM8Asw6u8LxC3eJCUZmRP/E4QmUGE1R7g7 github.com/hashicorp/raft v1.2.0/go.mod h1:vPAJM8Asw6u8LxC3eJCUZmRP/E4QmUGE1R7g7k8sG/8= 
github.com/hashicorp/raft v1.3.6 h1:v5xW5KzByoerQlN/o31VJrFNiozgzGyDoMgDJgXpsto= github.com/hashicorp/raft v1.3.6/go.mod h1:4Ak7FSPnuvmb0GV6vgIAJ4vYT4bek9bb6Q+7HVbyzqM= -github.com/hashicorp/raft-autopilot v0.1.5 h1:onEfMH5uHVdXQqtas36zXUHEZxLdsJVu/nXHLcLdL1I= -github.com/hashicorp/raft-autopilot v0.1.5/go.mod h1:Af4jZBwaNOI+tXfIqIdbcAnh/UyyqIMj/pOISIfhArw= +github.com/hashicorp/raft-autopilot v0.1.6 h1:C1q3RNF2FfXNZfHWbvVAu0QixaQK8K5pX4O5lh+9z4I= +github.com/hashicorp/raft-autopilot v0.1.6/go.mod h1:Af4jZBwaNOI+tXfIqIdbcAnh/UyyqIMj/pOISIfhArw= github.com/hashicorp/raft-boltdb v0.0.0-20171010151810-6e5ba93211ea/go.mod h1:pNv7Wc3ycL6F5oOWn+tPGo2gWD4a5X+yp/ntwdKLjRk= github.com/hashicorp/raft-boltdb v0.0.0-20210409134258-03c10cc3d4ea/go.mod h1:qRd6nFJYYS6Iqnc/8HcUmko2/2Gw8qTFEmxDLii6W5I= github.com/hashicorp/raft-boltdb v0.0.0-20211202195631-7d34b9fb3f42 h1:Ye8SofeDHJzu9xvvaMmpMkqHELWW7rTcXwdUR0CWW48= diff --git a/website/content/docs/agent/telemetry.mdx b/website/content/docs/agent/telemetry.mdx index 4f4ef89837..7296ed2081 100644 --- a/website/content/docs/agent/telemetry.mdx +++ b/website/content/docs/agent/telemetry.mdx @@ -94,7 +94,7 @@ These are some metrics emitted that can help you understand the health of your c | Metric Name | Description | Unit | Type | | :------------------------- | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | :----------- | :---- | -| `consul.autopilot.healthy` | Tracks the overall health of the local server cluster. If all servers are considered healthy by Autopilot, this will be set to 1. If any are unhealthy, this will be 0. All non-leader servers will report `NaN`. | health state | gauge | +| `consul.autopilot.healthy` | Tracks the overall health of the local server cluster. If all servers are considered healthy by Autopilot, this will be set to 1. If any are unhealthy, this will be 0. 
| health state | gauge | **Why it's important:** Autopilot can expose the overall health of your cluster with a simple boolean. @@ -592,8 +592,8 @@ These metrics give insight into the health of the cluster as a whole. | `consul.serf.member.left` | Increments when an agent leaves the cluster. | leaves / interval | counter | | `consul.serf.events` | Increments when an agent processes an [event](/commands/event). Consul uses events internally so there may be additional events showing in telemetry. There are also a per-event counters emitted as `consul.serf.events.`. | events / interval | counter | | `consul.serf.msgs.sent` | This metric is sample of the number of bytes of messages broadcast to the cluster. In a given time interval, the sum of this metric is the total number of bytes sent and the count is the number of messages sent. | message bytes / interval | counter | -| `consul.autopilot.failure_tolerance` | Tracks the number of voting servers that the cluster can lose while continuing to function. | servers | gauge | -| `consul.autopilot.healthy` | Tracks the overall health of the local server cluster. If all servers are considered healthy by Autopilot, this will be set to 1. If any are unhealthy, this will be 0. All non-leader servers will report `NaN`. | boolean | gauge | +| `consul.autopilot.failure_tolerance` | Tracks the number of voting servers that the cluster can lose while continuing to function. | servers   | gauge | +| `consul.autopilot.healthy` | Tracks the overall health of the local server cluster. If all servers are considered healthy by Autopilot, this will be set to 1. If any are unhealthy, this will be 0. | boolean   | gauge | | `consul.session_ttl.active` | Tracks the active number of sessions being tracked. | sessions | gauge | | `consul.catalog.service.query.` | Increments for each catalog query for the given service. 
| queries | counter | | `consul.catalog.service.query-tag..` | Increments for each catalog query for the given service with the given tag. | queries | counter |