From a0bba9171dbe30c8b2cc46e649d26852cccba422 Mon Sep 17 00:00:00 2001 From: FFMMM Date: Fri, 8 Oct 2021 10:31:50 -0700 Subject: [PATCH] fix consul_autopilot_healthy metric emission (#11231) https://github.com/hashicorp/consul/issues/10730 --- .changelog/11231.txt | 3 +++ agent/consul/autopilot.go | 9 +++++++ agent/metrics_test.go | 53 +++++++++++++++++++++++++++++++++++++++ agent/testagent.go | 13 ++++++++-- 4 files changed, 76 insertions(+), 2 deletions(-) create mode 100644 .changelog/11231.txt create mode 100644 agent/metrics_test.go diff --git a/.changelog/11231.txt b/.changelog/11231.txt new file mode 100644 index 0000000000..00996f5031 --- /dev/null +++ b/.changelog/11231.txt @@ -0,0 +1,3 @@ +```release-note:bug +telemetry: fixes a bug with Prometheus consul_autopilot_healthy metric where 0 is reported instead of NaN on servers. +``` \ No newline at end of file diff --git a/agent/consul/autopilot.go b/agent/consul/autopilot.go index 5033032c84..93f8566fb8 100644 --- a/agent/consul/autopilot.go +++ b/agent/consul/autopilot.go @@ -9,6 +9,7 @@ import ( "github.com/hashicorp/raft" autopilot "github.com/hashicorp/raft-autopilot" "github.com/hashicorp/serf/serf" + "math" "github.com/hashicorp/consul/agent/metadata" "github.com/hashicorp/consul/agent/structs" @@ -52,6 +53,12 @@ func (d *AutopilotDelegate) NotifyState(state *autopilot.State) { } else { metrics.SetGauge([]string{"autopilot", "healthy"}, 0) } + } else { + + // if we are not a leader, emit NaN per + // https://www.consul.io/docs/agent/telemetry#autopilot + metrics.SetGauge([]string{"autopilot", "healthy"}, float32(math.NaN())) + } } @@ -74,6 +81,8 @@ func (s *Server) initAutopilot(config *Config) { autopilot.WithUpdateInterval(config.ServerHealthInterval), autopilot.WithPromoter(s.autopilotPromoter()), ) + + metrics.SetGauge([]string{"autopilot", "healthy"}, float32(math.NaN())) } func (s *Server) autopilotServers() map[raft.ServerID]*autopilot.Server { diff --git a/agent/metrics_test.go b/agent/metrics_test.go new file mode 100644 index 0000000000..326163b709 --- /dev/null +++ b/agent/metrics_test.go @@ -0,0 +1,53 @@ +package agent + +import ( + "net/http" + "net/http/httptest" + "strings" + "testing" +) + +// TestHTTPHandlers_AgentMetrics_ConsulAutopilotHealthy_Prometheus adds testing around +// the published autopilot metrics on https://www.consul.io/docs/agent/telemetry#autopilot +func TestHTTPHandlers_AgentMetrics_ConsulAutopilotHealthy_Prometheus(t *testing.T) { + checkForShortTesting(t) + // This test cannot use t.Parallel() since we modify global state, ie the global metrics instance + + // don't bootstrap agent so as not to + // become a leader + hcl := ` + telemetry = { + prometheus_retention_time = "5s", + disable_hostname = true + } + bootstrap = false + ` + + a := StartTestAgent(t, TestAgent{HCL: hcl}) + defer a.Shutdown() + + req, err := http.NewRequest("GET", "/v1/agent/metrics?format=prometheus", nil) + if err != nil { + t.Fatalf("Failed to generate new http request. err: %v", err) + } + + respRec := httptest.NewRecorder() + _, err = a.srv.AgentMetrics(respRec, req) + if err != nil { + t.Fatalf("Failed to serve agent metrics. err: %v", err) + } + + t.Run("Check consul_autopilot_healthy metric value on startup", func(t *testing.T) { + target := "consul_autopilot_healthy NaN" + keyValue := strings.Split(target, " ") + if !strings.Contains(respRec.Body.String(), target) { + t.Fatalf("Could not find the metric \"%s\" with value \"%s\" in the /v1/agent/metrics response", keyValue[0], keyValue[1]) + } + }) +} + +func checkForShortTesting(t *testing.T) { + if testing.Short() { + t.Skip("too slow for testing.Short") + } +} diff --git a/agent/testagent.go b/agent/testagent.go index ee1f7f510f..b421f3cec6 100644 --- a/agent/testagent.go +++ b/agent/testagent.go @@ -185,7 +185,12 @@ func (a *TestAgent) Start(t *testing.T) error { } result, err := config.Load(opts) if result.RuntimeConfig != nil { - result.RuntimeConfig.Telemetry.Disable = true + // If prom metrics need to be enabled, do not disable telemetry + if result.RuntimeConfig.Telemetry.PrometheusOpts.Expiration > 0 { + result.RuntimeConfig.Telemetry.Disable = false + } else { + result.RuntimeConfig.Telemetry.Disable = true + } } return result, err } @@ -195,7 +200,11 @@ func (a *TestAgent) Start(t *testing.T) error { } bd.Logger = logger - bd.MetricsHandler = metrics.NewInmemSink(1*time.Second, time.Minute) + // if we are not testing telemetry things, let's use a "mock" sink for metrics + if bd.RuntimeConfig.Telemetry.Disable { + bd.MetricsHandler = metrics.NewInmemSink(1*time.Second, time.Minute) + } + a.Config = bd.RuntimeConfig agent, err := New(bd)