From 8c575445da79d92eab1e6789f9914098326d90f5 Mon Sep 17 00:00:00 2001 From: Daniel Nephin Date: Wed, 4 Aug 2021 13:05:10 -0400 Subject: [PATCH 1/2] telemetry: add a metric for agent TLS cert expiry --- .changelog/10768.txt | 4 ++ agent/agent.go | 5 ++ agent/consul/leader_connect.go | 4 +- agent/consul/leader_metrics.go | 76 +++++++++++++++++------- website/content/docs/agent/telemetry.mdx | 1 + 5 files changed, 66 insertions(+), 24 deletions(-) create mode 100644 .changelog/10768.txt diff --git a/.changelog/10768.txt b/.changelog/10768.txt new file mode 100644 index 0000000000..0222058561 --- /dev/null +++ b/.changelog/10768.txt @@ -0,0 +1,4 @@ +```release-note:improvement +telemetry: add a new `agent.tls.cert.expiry` metric for tracking when the Agent TLS certificate expires. +``` + diff --git a/agent/agent.go b/agent/agent.go index c13553629c..8dc6c1f0af 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -639,6 +639,11 @@ func (a *Agent) Start(ctx context.Context) error { a.logger.Warn("DEPRECATED Backwards compatibility with pre-1.9 metrics enabled. These metrics will be removed in a future version of Consul. 
Set `telemetry { disable_compat_1.9 = true }` to disable them.") } + if a.tlsConfigurator.Cert() != nil { + m := consul.AgentTLSCertExpirationMonitor(a.tlsConfigurator, a.logger, a.config.Datacenter) + go m.Monitor(&lib.StopChannelContext{StopCh: a.shutdownCh}) + } + // consul version metric with labels metrics.SetGaugeWithLabels([]string{"version"}, 1, []metrics.Label{ {Name: "version", Value: a.config.Version}, diff --git a/agent/consul/leader_connect.go b/agent/consul/leader_connect.go index e25edf3949..a90194ec50 100644 --- a/agent/consul/leader_connect.go +++ b/agent/consul/leader_connect.go @@ -34,8 +34,8 @@ func (s *Server) startConnectLeader(ctx context.Context) error { s.caManager.Start(ctx) s.leaderRoutineManager.Start(ctx, caRootPruningRoutineName, s.runCARootPruning) - s.leaderRoutineManager.Start(ctx, caRootMetricRoutineName, rootCAExpiryMonitor(s).monitor) - s.leaderRoutineManager.Start(ctx, caSigningMetricRoutineName, signingCAExpiryMonitor(s).monitor) + s.leaderRoutineManager.Start(ctx, caRootMetricRoutineName, rootCAExpiryMonitor(s).Monitor) + s.leaderRoutineManager.Start(ctx, caSigningMetricRoutineName, signingCAExpiryMonitor(s).Monitor) return s.startIntentionConfigEntryMigration(ctx) } diff --git a/agent/consul/leader_metrics.go b/agent/consul/leader_metrics.go index 1d40b62937..fb1eaa9cd2 100644 --- a/agent/consul/leader_metrics.go +++ b/agent/consul/leader_metrics.go @@ -2,18 +2,19 @@ package consul import ( "context" + "crypto/x509" "errors" "fmt" "time" - "github.com/hashicorp/consul/agent/connect/ca" - - "github.com/hashicorp/consul/agent/connect" - "github.com/armon/go-metrics" "github.com/armon/go-metrics/prometheus" - "github.com/hashicorp/consul/logging" "github.com/hashicorp/go-hclog" + + "github.com/hashicorp/consul/agent/connect" + "github.com/hashicorp/consul/agent/connect/ca" + "github.com/hashicorp/consul/logging" + "github.com/hashicorp/consul/tlsutil" ) var metricsKeyMeshRootCAExpiry = []string{"mesh", "active-root-ca", 
"expiry"} @@ -28,10 +29,14 @@ var CertExpirationGauges = []prometheus.GaugeDefinition{ Name: metricsKeyMeshActiveSigningCAExpiry, Help: "Seconds until the service mesh signing certificate expires. Updated every hour", }, + { + Name: metricsKeyAgentTLSCertExpiry, + Help: "Seconds until the agent tls certificate expires. Updated every hour", + }, } -func rootCAExpiryMonitor(s *Server) certExpirationMonitor { - return certExpirationMonitor{ +func rootCAExpiryMonitor(s *Server) CertExpirationMonitor { + return CertExpirationMonitor{ Key: metricsKeyMeshRootCAExpiry, Labels: []metrics.Label{ {Name: "datacenter", Value: s.config.Datacenter}, @@ -56,10 +61,10 @@ func getRootCAExpiry(s *Server) (time.Duration, error) { return time.Until(root.NotAfter), nil } -func signingCAExpiryMonitor(s *Server) certExpirationMonitor { +func signingCAExpiryMonitor(s *Server) CertExpirationMonitor { isPrimary := s.config.Datacenter == s.config.PrimaryDatacenter if isPrimary { - return certExpirationMonitor{ + return CertExpirationMonitor{ Key: metricsKeyMeshActiveSigningCAExpiry, Labels: []metrics.Label{ {Name: "datacenter", Value: s.config.Datacenter}, @@ -76,17 +81,17 @@ func signingCAExpiryMonitor(s *Server) certExpirationMonitor { }, } - } else { - return certExpirationMonitor{ - Key: metricsKeyMeshActiveSigningCAExpiry, - Labels: []metrics.Label{ - {Name: "datacenter", Value: s.config.Datacenter}, - }, - Logger: s.logger.Named(logging.Connect), - Query: func() (time.Duration, error) { - return getActiveIntermediateExpiry(s) - }, - } + } + + return CertExpirationMonitor{ + Key: metricsKeyMeshActiveSigningCAExpiry, + Labels: []metrics.Label{ + {Name: "datacenter", Value: s.config.Datacenter}, + }, + Logger: s.logger.Named(logging.Connect), + Query: func() (time.Duration, error) { + return getActiveIntermediateExpiry(s) + }, } } @@ -109,7 +114,7 @@ func getActiveIntermediateExpiry(s *Server) (time.Duration, error) { return time.Until(cert.NotAfter), nil } -type certExpirationMonitor 
struct { +type CertExpirationMonitor struct { Key []string Labels []metrics.Label Logger hclog.Logger @@ -120,7 +125,7 @@ type certExpirationMonitor struct { const certExpirationMonitorInterval = time.Hour -func (m certExpirationMonitor) monitor(ctx context.Context) error { +func (m CertExpirationMonitor) Monitor(ctx context.Context) error { ticker := time.NewTicker(certExpirationMonitorInterval) defer ticker.Stop() @@ -138,3 +143,30 @@ func (m certExpirationMonitor) monitor(ctx context.Context) error { } } } + +var metricsKeyAgentTLSCertExpiry = []string{"agent", "tls", "cert", "expiry"} + +// AgentTLSCertExpirationMonitor returns a CertExpirationMonitor which will +// monitor the expiration of the certificate used for agent TLS. +func AgentTLSCertExpirationMonitor(c *tlsutil.Configurator, logger hclog.Logger, dc string) CertExpirationMonitor { + return CertExpirationMonitor{ + Key: metricsKeyAgentTLSCertExpiry, + Labels: []metrics.Label{ + {Name: "node", Value: c.Base().NodeName}, + {Name: "datacenter", Value: dc}, + }, + Logger: logger, + Query: func() (time.Duration, error) { + raw := c.Cert() + if raw == nil { + return 0, fmt.Errorf("tls not enabled") + } + + cert, err := x509.ParseCertificate(raw.Certificate[0]) + if err != nil { + return 0, fmt.Errorf("failed to parse agent tls cert: %w", err) + } + return time.Until(cert.NotAfter), nil + }, + } +} diff --git a/website/content/docs/agent/telemetry.mdx b/website/content/docs/agent/telemetry.mdx index bd1dedc51c..8d24385696 100644 --- a/website/content/docs/agent/telemetry.mdx +++ b/website/content/docs/agent/telemetry.mdx @@ -480,6 +480,7 @@ These metrics give insight into the health of the cluster as a whole. | `consul.catalog.connect.not-found.` | Increments for each connect-based catalog query where the given service could not be found. | queries | counter | | `consul.mesh.active-root-ca.expiry` | The number of seconds until the root CA expires, updated every hour. 
| seconds | gauge | | `consul.mesh.active-signing-ca.expiry` | The number of seconds until the signing CA expires, updated every hour. | seconds | gauge | +| `consul.agent.tls.cert.expiry` | The number of seconds until the Agent TLS certificate expires, updated every hour. | seconds | gauge | ## Connect Built-in Proxy Metrics From 9420506faeb29f7b66f4f4f153134ae3a8af16d4 Mon Sep 17 00:00:00 2001 From: Daniel Nephin Date: Wed, 4 Aug 2021 13:26:36 -0400 Subject: [PATCH 2/2] telemetry: fix a couple bugs in cert expiry metrics 1. do not emit the metric if Query fails 2. properly check for PrimaryUsesIntermediate, the logic was inverted Also improve the logging by including the metric name in the log message --- agent/consul/leader_metrics.go | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/agent/consul/leader_metrics.go b/agent/consul/leader_metrics.go index fb1eaa9cd2..42ac50c37e 100644 --- a/agent/consul/leader_metrics.go +++ b/agent/consul/leader_metrics.go @@ -5,6 +5,7 @@ import ( "crypto/x509" "errors" "fmt" + "strings" "time" "github.com/armon/go-metrics" @@ -73,12 +74,10 @@ func signingCAExpiryMonitor(s *Server) CertExpirationMonitor { Query: func() (time.Duration, error) { provider, _ := s.caManager.getCAProvider() - if _, ok := provider.(ca.PrimaryUsesIntermediate); !ok { + if _, ok := provider.(ca.PrimaryUsesIntermediate); ok { return getActiveIntermediateExpiry(s) } - return getRootCAExpiry(s) - }, } } @@ -129,6 +128,8 @@ func (m CertExpirationMonitor) Monitor(ctx context.Context) error { ticker := time.NewTicker(certExpirationMonitorInterval) defer ticker.Stop() + logger := m.Logger.With("metric", strings.Join(m.Key, ".")) + for { select { case <-ctx.Done(): @@ -136,7 +137,8 @@ func (m CertExpirationMonitor) Monitor(ctx context.Context) error { case <-ticker.C: d, err := m.Query() if err != nil { - m.Logger.Warn("failed to emit certificate expiry metric", "error", err) + logger.Warn("failed to emit certificate expiry metric", 
"error", err) + continue } expiry := d / time.Second metrics.SetGaugeWithLabels(m.Key, float32(expiry), m.Labels)