From fea6f08bf9995bc65c3bc0e5206974cac62d35fc Mon Sep 17 00:00:00 2001 From: FFMMM Date: Mon, 25 Oct 2021 10:55:59 -0700 Subject: [PATCH] fix autopilot_failure_tolerance, add autopilot metrics test case (#11399) Signed-off-by: FFMMM --- .changelog/11399.txt | 3 + agent/consul/autopilot.go | 4 ++ agent/metrics_test.go | 122 ++++++++++++++++++++++++++------------ 3 files changed, 90 insertions(+), 39 deletions(-) create mode 100644 .changelog/11399.txt diff --git a/.changelog/11399.txt b/.changelog/11399.txt new file mode 100644 index 0000000000..dc61fe3949 --- /dev/null +++ b/.changelog/11399.txt @@ -0,0 +1,3 @@ +```release-note:bug +telemetry: fixes a bug with Prometheus consul_autopilot_failure_tolerance metric where 0 is reported instead of NaN on follower servers. +``` \ No newline at end of file diff --git a/agent/consul/autopilot.go b/agent/consul/autopilot.go index 93f8566fb8..22663f00be 100644 --- a/agent/consul/autopilot.go +++ b/agent/consul/autopilot.go @@ -59,6 +59,9 @@ func (d *AutopilotDelegate) NotifyState(state *autopilot.State) { // https://www.consul.io/docs/agent/telemetry#autopilot metrics.SetGauge([]string{"autopilot", "healthy"}, float32(math.NaN())) + // also emit NaN for failure tolerance to be backwards compatible + metrics.SetGauge([]string{"autopilot", "failure_tolerance"}, float32(math.NaN())) + } } @@ -83,6 +86,7 @@ func (s *Server) initAutopilot(config *Config) { ) metrics.SetGauge([]string{"autopilot", "healthy"}, float32(math.NaN())) + metrics.SetGauge([]string{"autopilot", "failure_tolerance"}, float32(math.NaN())) } func (s *Server) autopilotServers() map[raft.ServerID]*autopilot.Server { diff --git a/agent/metrics_test.go b/agent/metrics_test.go index 326163b709..f946e469af 100644 --- a/agent/metrics_test.go +++ b/agent/metrics_test.go @@ -1,53 +1,97 @@ package agent import ( + "github.com/stretchr/testify/require" "net/http" "net/http/httptest" "strings" "testing" ) -// TestHTTPHandlers_AgentMetrics_ConsulAutopilotHealthy_Prometheus adds testing around -// the published autopilot metrics on https://www.consul.io/docs/agent/telemetry#autopilot -func TestHTTPHandlers_AgentMetrics_ConsulAutopilotHealthy_Prometheus(t *testing.T) { - checkForShortTesting(t) - // This test cannot use t.Parallel() since we modify global state, ie the global metrics instance - - // don't bootstrap agent so as not to - // become a leader - hcl := ` - telemetry = { - prometheus_retention_time = "5s", - disable_hostname = true - } - bootstrap = false - ` - - a := StartTestAgent(t, TestAgent{HCL: hcl}) - defer a.Shutdown() - - req, err := http.NewRequest("GET", "/v1/agent/metrics?format=prometheus", nil) - if err != nil { - t.Fatalf("Failed to generate new http request. err: %v", err) - } - - respRec := httptest.NewRecorder() - _, err = a.srv.AgentMetrics(respRec, req) - if err != nil { - t.Fatalf("Failed to serve agent metrics. err: %v", err) - } - - t.Run("Check consul_autopilot_healthy metric value on startup", func(t *testing.T) { - target := "consul_autopilot_healthy NaN" - keyValue := strings.Split(target, " ") - if !strings.Contains(respRec.Body.String(), target) { - t.Fatalf("Could not find the metric \"%s\" with value \"%s\" in the /v1/agent/metrics response", keyValue[0], keyValue[1]) - } - }) -} - func checkForShortTesting(t *testing.T) { if testing.Short() { t.Skip("too slow for testing.Short") } } + +func recordPromMetrics(t *testing.T, a *TestAgent, respRec *httptest.ResponseRecorder) { + req, err := http.NewRequest("GET", "/v1/agent/metrics?format=prometheus", nil) + require.NoError(t, err, "Failed to generate new http request.") + + _, err = a.srv.AgentMetrics(respRec, req) + require.NoError(t, err, "Failed to serve agent metrics") + +} + +func assertMetricExistsWithValue(t *testing.T, respRec *httptest.ResponseRecorder, metric string, value string) { + if respRec.Body.String() == "" { + t.Fatalf("Response body is empty.") + } + + // eg "consul_autopilot_healthy NaN" + target := metric + " " + value + + if !strings.Contains(respRec.Body.String(), target) { + t.Fatalf("Could not find the metric \"%s\" with value \"%s\" in the /v1/agent/metrics response", metric, value) + } +} + +func assertMetricNotExists(t *testing.T, respRec *httptest.ResponseRecorder, metric string) { + if respRec.Body.String() == "" { + t.Fatalf("Response body is empty.") + } + + if strings.Contains(respRec.Body.String(), metric) { + t.Fatalf("Didn't expect to find the metric \"%s\" in the /v1/agent/metrics response", metric) + } +} + +// TestHTTPHandlers_AgentMetrics_ConsulAutopilot_Prometheus adds testing around +// the published autopilot metrics on https://www.consul.io/docs/agent/telemetry#autopilot +func TestHTTPHandlers_AgentMetrics_ConsulAutopilot_Prometheus(t *testing.T) { + checkForShortTesting(t) + // This test cannot use t.Parallel() since we modify global state, ie the global metrics instance + + t.Run("Check consul_autopilot_* are not emitted metrics on clients", func(t *testing.T) { + hcl := ` + telemetry = { + prometheus_retention_time = "5s" + disable_hostname = true + metrics_prefix = "agent_1" + } + bootstrap = false + server = false + ` + + a := StartTestAgent(t, TestAgent{HCL: hcl}) + defer a.Shutdown() + + respRec := httptest.NewRecorder() + recordPromMetrics(t, a, respRec) + + assertMetricNotExists(t, respRec, "agent_1_autopilot_healthy") + assertMetricNotExists(t, respRec, "agent_1_autopilot_failure_tolerance") + }) + + t.Run("Check consul_autopilot_healthy metric value on startup", func(t *testing.T) { + // don't bootstrap agent so as not to + // become a leader + hcl := ` + telemetry = { + prometheus_retention_time = "5s", + disable_hostname = true + metrics_prefix = "agent_2" + } + bootstrap = false + ` + + a := StartTestAgent(t, TestAgent{HCL: hcl}) + defer a.Shutdown() + + respRec := httptest.NewRecorder() + recordPromMetrics(t, a, respRec) + + assertMetricExistsWithValue(t, respRec, "agent_2_autopilot_healthy", "NaN") + assertMetricExistsWithValue(t, respRec, "agent_2_autopilot_failure_tolerance", "NaN") + }) +}