telemetry: fix cert expiry metrics by removing labels

These labels should be set by whatever process scrapes Consul (for
Prometheus), or by the agent that receives the metrics (for Datadog/StatsD).

We need to remove them here because the labels are part of the "metric
key", so we'd have to pre-declare the metrics with the labels. We could
do that, but that is extra work for labels that should be added from
elsewhere.

Also renames the closure in Monitor from fn to emitMetric to be more
descriptive.
This commit is contained in:
Daniel Nephin 2021-10-20 11:54:11 -04:00
parent 7948720bbb
commit 9264ce89d2

View File

@ -43,9 +43,6 @@ var AgentCertExpirationGauges = []prometheus.GaugeDefinition{
func rootCAExpiryMonitor(s *Server) CertExpirationMonitor { func rootCAExpiryMonitor(s *Server) CertExpirationMonitor {
return CertExpirationMonitor{ return CertExpirationMonitor{
Key: metricsKeyMeshRootCAExpiry, Key: metricsKeyMeshRootCAExpiry,
Labels: []metrics.Label{
{Name: "datacenter", Value: s.config.Datacenter},
},
Logger: s.logger.Named(logging.Connect), Logger: s.logger.Named(logging.Connect),
Query: func() (time.Duration, error) { Query: func() (time.Duration, error) {
return getRootCAExpiry(s) return getRootCAExpiry(s)
@ -71,9 +68,6 @@ func signingCAExpiryMonitor(s *Server) CertExpirationMonitor {
if isPrimary { if isPrimary {
return CertExpirationMonitor{ return CertExpirationMonitor{
Key: metricsKeyMeshActiveSigningCAExpiry, Key: metricsKeyMeshActiveSigningCAExpiry,
Labels: []metrics.Label{
{Name: "datacenter", Value: s.config.Datacenter},
},
Logger: s.logger.Named(logging.Connect), Logger: s.logger.Named(logging.Connect),
Query: func() (time.Duration, error) { Query: func() (time.Duration, error) {
provider, _ := s.caManager.getCAProvider() provider, _ := s.caManager.getCAProvider()
@ -88,9 +82,6 @@ func signingCAExpiryMonitor(s *Server) CertExpirationMonitor {
return CertExpirationMonitor{ return CertExpirationMonitor{
Key: metricsKeyMeshActiveSigningCAExpiry, Key: metricsKeyMeshActiveSigningCAExpiry,
Labels: []metrics.Label{
{Name: "datacenter", Value: s.config.Datacenter},
},
Logger: s.logger.Named(logging.Connect), Logger: s.logger.Named(logging.Connect),
Query: func() (time.Duration, error) { Query: func() (time.Duration, error) {
return getActiveIntermediateExpiry(s) return getActiveIntermediateExpiry(s)
@ -122,6 +113,10 @@ func getActiveIntermediateExpiry(s *Server) (time.Duration, error) {
type CertExpirationMonitor struct { type CertExpirationMonitor struct {
Key []string Key []string
// Labels to be emitted along with the metric. It is very important that these
// labels be included in the pre-declaration as well. Otherwise, if
// telemetry.prometheus_retention_time is less than certExpirationMonitorInterval
// then the metrics will expire before they are emitted again.
Labels []metrics.Label Labels []metrics.Label
Logger hclog.Logger Logger hclog.Logger
// Query is called at each interval. It should return the duration until the // Query is called at each interval. It should return the duration until the
@ -137,7 +132,7 @@ func (m CertExpirationMonitor) Monitor(ctx context.Context) error {
logger := m.Logger.With("metric", strings.Join(m.Key, ".")) logger := m.Logger.With("metric", strings.Join(m.Key, "."))
fn := func() { emitMetric := func() {
d, err := m.Query() d, err := m.Query()
if err != nil { if err != nil {
logger.Warn("failed to emit certificate expiry metric", "error", err) logger.Warn("failed to emit certificate expiry metric", "error", err)
@ -155,17 +150,17 @@ func (m CertExpirationMonitor) Monitor(ctx context.Context) error {
// emit the metric immediately so that if a cert was just updated the // emit the metric immediately so that if a cert was just updated the
// new metric will be updated to the new expiration time. // new metric will be updated to the new expiration time.
fn() emitMetric()
for { for {
select { select {
case <-ctx.Done(): case <-ctx.Done():
// "Zero-out" the metric on exit so that when prometheus scrapes this // "Zero-out" the metric on exit so that when prometheus scrapes this
// metric from a non-leader, it does not get a stale value. // metric from a non-leader, it does not get a stale value.
metrics.SetGauge(m.Key, float32(math.NaN())) metrics.SetGaugeWithLabels(m.Key, float32(math.NaN()), m.Labels)
return nil return nil
case <-ticker.C: case <-ticker.C:
fn() emitMetric()
} }
} }
} }
@ -177,10 +172,6 @@ var metricsKeyAgentTLSCertExpiry = []string{"agent", "tls", "cert", "expiry"}
func AgentTLSCertExpirationMonitor(c *tlsutil.Configurator, logger hclog.Logger, dc string) CertExpirationMonitor { func AgentTLSCertExpirationMonitor(c *tlsutil.Configurator, logger hclog.Logger, dc string) CertExpirationMonitor {
return CertExpirationMonitor{ return CertExpirationMonitor{
Key: metricsKeyAgentTLSCertExpiry, Key: metricsKeyAgentTLSCertExpiry,
Labels: []metrics.Label{
{Name: "node", Value: c.Base().NodeName},
{Name: "datacenter", Value: dc},
},
Logger: logger, Logger: logger,
Query: func() (time.Duration, error) { Query: func() (time.Duration, error) {
raw := c.Cert() raw := c.Cert()