telemetry: fix cert expiry metrics by removing labels

These labels should be set by whatever process scrapes Consul (for
prometheus), or by the agent that receives them (for datadog/statsd).

We need to remove them here because the labels are part of the "metric
key", so we'd have to pre-declare the metrics with the labels. We could
do that, but that is extra work for labels that should be added from
elsewhere.

Also renames the closure to be more descriptive.
This commit is contained in:
Daniel Nephin 2021-10-20 11:54:11 -04:00
parent 7948720bbb
commit 9264ce89d2
1 changed files with 13 additions and 22 deletions

View File

@ -42,10 +42,7 @@ var AgentCertExpirationGauges = []prometheus.GaugeDefinition{
func rootCAExpiryMonitor(s *Server) CertExpirationMonitor {
return CertExpirationMonitor{
Key: metricsKeyMeshRootCAExpiry,
Labels: []metrics.Label{
{Name: "datacenter", Value: s.config.Datacenter},
},
Key: metricsKeyMeshRootCAExpiry,
Logger: s.logger.Named(logging.Connect),
Query: func() (time.Duration, error) {
return getRootCAExpiry(s)
@ -70,10 +67,7 @@ func signingCAExpiryMonitor(s *Server) CertExpirationMonitor {
isPrimary := s.config.Datacenter == s.config.PrimaryDatacenter
if isPrimary {
return CertExpirationMonitor{
Key: metricsKeyMeshActiveSigningCAExpiry,
Labels: []metrics.Label{
{Name: "datacenter", Value: s.config.Datacenter},
},
Key: metricsKeyMeshActiveSigningCAExpiry,
Logger: s.logger.Named(logging.Connect),
Query: func() (time.Duration, error) {
provider, _ := s.caManager.getCAProvider()
@ -87,10 +81,7 @@ func signingCAExpiryMonitor(s *Server) CertExpirationMonitor {
}
return CertExpirationMonitor{
Key: metricsKeyMeshActiveSigningCAExpiry,
Labels: []metrics.Label{
{Name: "datacenter", Value: s.config.Datacenter},
},
Key: metricsKeyMeshActiveSigningCAExpiry,
Logger: s.logger.Named(logging.Connect),
Query: func() (time.Duration, error) {
return getActiveIntermediateExpiry(s)
@ -121,7 +112,11 @@ func getActiveIntermediateExpiry(s *Server) (time.Duration, error) {
}
type CertExpirationMonitor struct {
Key []string
Key []string
// Labels to be emitted along with the metric. It is very important that these
// labels be included in the pre-declaration as well. Otherwise, if
// telemetry.prometheus_retention_time is less than certExpirationMonitorInterval
// then the metrics will expire before they are emitted again.
Labels []metrics.Label
Logger hclog.Logger
// Query is called at each interval. It should return the duration until the
@ -137,7 +132,7 @@ func (m CertExpirationMonitor) Monitor(ctx context.Context) error {
logger := m.Logger.With("metric", strings.Join(m.Key, "."))
fn := func() {
emitMetric := func() {
d, err := m.Query()
if err != nil {
logger.Warn("failed to emit certificate expiry metric", "error", err)
@ -155,17 +150,17 @@ func (m CertExpirationMonitor) Monitor(ctx context.Context) error {
// emit the metric immediately so that if a cert was just updated the
// new metric will be updated to the new expiration time.
fn()
emitMetric()
for {
select {
case <-ctx.Done():
// "Zero-out" the metric on exit so that when prometheus scrapes this
// metric from a non-leader, it does not get a stale value.
metrics.SetGauge(m.Key, float32(math.NaN()))
metrics.SetGaugeWithLabels(m.Key, float32(math.NaN()), m.Labels)
return nil
case <-ticker.C:
fn()
emitMetric()
}
}
}
@ -176,11 +171,7 @@ var metricsKeyAgentTLSCertExpiry = []string{"agent", "tls", "cert", "expiry"}
// monitor the expiration of the certificate used for agent TLS.
func AgentTLSCertExpirationMonitor(c *tlsutil.Configurator, logger hclog.Logger, dc string) CertExpirationMonitor {
return CertExpirationMonitor{
Key: metricsKeyAgentTLSCertExpiry,
Labels: []metrics.Label{
{Name: "node", Value: c.Base().NodeName},
{Name: "datacenter", Value: dc},
},
Key: metricsKeyAgentTLSCertExpiry,
Logger: logger,
Query: func() (time.Duration, error) {
raw := c.Cert()