From 7fe60e59898bc39a9dfa693d0e3ea76e26c8ae47 Mon Sep 17 00:00:00 2001
From: Daniel Nephin <dnephin@hashicorp.com>
Date: Thu, 5 Aug 2021 18:38:06 -0400
Subject: [PATCH] telemetry: prevent stale values from cert monitors

Prometheus scrapes metrics from each process, so when leadership transfers to a different node,
the previous leader keeps reporting its old cached value.

By setting the gauge to NaN when the monitor exits, I believe we effectively zero out the value,
so that Prometheus only considers the value from the new leader.
---
 agent/consul/leader_metrics.go | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/agent/consul/leader_metrics.go b/agent/consul/leader_metrics.go
index 02a6c6d196..bb65d4dafa 100644
--- a/agent/consul/leader_metrics.go
+++ b/agent/consul/leader_metrics.go
@@ -5,6 +5,7 @@ import (
 	"crypto/x509"
 	"errors"
 	"fmt"
+	"math"
 	"strings"
 	"time"
 
@@ -156,6 +157,9 @@ func (m CertExpirationMonitor) Monitor(ctx context.Context) error {
 	for {
 		select {
 		case <-ctx.Done():
+			// "Zero-out" the metric on exit so that when prometheus scrapes this
+			// metric from a non-leader, it does not get a stale value.
+			metrics.SetGauge(m.Key, float32(math.NaN()))
 			return nil
 		case <-ticker.C:
 			fn()