diff --git a/.changelog/9924.txt b/.changelog/9924.txt new file mode 100644 index 0000000000..5a5777acc3 --- /dev/null +++ b/.changelog/9924.txt @@ -0,0 +1,4 @@ +```release-note:improvement +telemetry: add a new `mesh.root-ca.expiry` metric for tracking when the root certificate expires. +``` + diff --git a/agent/consul/leader_connect.go b/agent/consul/leader_connect.go index 1b724d2302..5f662e8b9c 100644 --- a/agent/consul/leader_connect.go +++ b/agent/consul/leader_connect.go @@ -36,6 +36,7 @@ func (s *Server) startConnectLeader(ctx context.Context) error { s.caManager.Start(ctx) s.leaderRoutineManager.Start(ctx, caRootPruningRoutineName, s.runCARootPruning) + s.leaderRoutineManager.Start(ctx, caRootMetricRoutineName, rootCAExpiryMonitor(s).monitor) return s.startIntentionConfigEntryMigration(ctx) } diff --git a/agent/consul/leader_metrics.go b/agent/consul/leader_metrics.go new file mode 100644 index 0000000000..63a4dd1c71 --- /dev/null +++ b/agent/consul/leader_metrics.go @@ -0,0 +1,71 @@ +package consul + +import ( + "context" + "fmt" + "time" + + "github.com/armon/go-metrics" + "github.com/armon/go-metrics/prometheus" + "github.com/hashicorp/go-hclog" + + "github.com/hashicorp/consul/logging" +) + +var CertExpirationGauges = []prometheus.GaugeDefinition{ + { + Name: metricsKeyMeshRootCAExpiry, + Help: "Seconds until the service mesh root certificate expires.", + }, +} + +var metricsKeyMeshRootCAExpiry = []string{"mesh", "root-ca", "expiry"} + +func rootCAExpiryMonitor(s *Server) certExpirationMonitor { + return certExpirationMonitor{ + Key: metricsKeyMeshRootCAExpiry, + Labels: []metrics.Label{ + {Name: "datacenter", Value: s.config.Datacenter}, + }, + Logger: s.logger.Named(logging.Connect), + Query: func() (time.Duration, error) { + state := s.fsm.State() + _, root, err := state.CARootActive(nil) + if err != nil { + return 0, fmt.Errorf("failed to retrieve root CA: %w", err) + } + + return time.Until(root.NotAfter), nil + }, + } +} + +type 
certExpirationMonitor struct { + Key []string + Labels []metrics.Label + Logger hclog.Logger + // Query is called at each interval. It should return the duration until the + // certificate expires, or an error if the query failed. + Query func() (time.Duration, error) +} + +const certExpirationMonitorInterval = time.Hour + +func (m certExpirationMonitor) monitor(ctx context.Context) error { + ticker := time.NewTicker(certExpirationMonitorInterval) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return nil + case <-ticker.C: + d, err := m.Query() + if err != nil { + m.Logger.Warn("failed to emit certificate expiry metric", "error", err) + } + expiry := d / time.Second + metrics.SetGaugeWithLabels(m.Key, float32(expiry), m.Labels) + } + } +} diff --git a/agent/consul/server.go b/agent/consul/server.go index 08677a6b17..e5e4ecb371 100644 --- a/agent/consul/server.go +++ b/agent/consul/server.go @@ -102,6 +102,7 @@ const ( aclTokenReapingRoutineName = "acl token reaping" aclUpgradeRoutineName = "legacy ACL token upgrade" caRootPruningRoutineName = "CA root pruning" + caRootMetricRoutineName = "CA root expiration metric" configReplicationRoutineName = "config entry replication" federationStateReplicationRoutineName = "federation state replication" federationStateAntiEntropyRoutineName = "federation state anti-entropy" diff --git a/agent/setup.go b/agent/setup.go index bfa4abfade..7b363cd86b 100644 --- a/agent/setup.go +++ b/agent/setup.go @@ -194,6 +194,7 @@ func getPrometheusDefs(cfg lib.TelemetryConfig) ([]prometheus.GaugeDefinition, [ xds.StatsGauges, usagemetrics.Gauges, consul.ReplicationGauges, + consul.CertExpirationGauges, Gauges, raftGauges, } diff --git a/website/content/docs/agent/telemetry.mdx b/website/content/docs/agent/telemetry.mdx index 50dfcbd9c2..9bca6fff8d 100644 --- a/website/content/docs/agent/telemetry.mdx +++ b/website/content/docs/agent/telemetry.mdx @@ -478,6 +478,7 @@ These metrics give insight into the health of the cluster 
as a whole. | `consul.catalog.connect.query-tag.<service>.<tag>` | Increments for each connect-based catalog query for the given service with the given tag. | queries | counter | | `consul.catalog.connect.query-tags.<service>.<tags>` | Increments for each connect-based catalog query for the given service with the given tags. | queries | counter | | `consul.catalog.connect.not-found.<service>` | Increments for each connect-based catalog query where the given service could not be found. | queries | counter | +| `consul.mesh.root-ca.expiry` | The number of seconds until the root CA expires, updated every hour. | seconds | gauge | ## Connect Built-in Proxy Metrics