Add ca certificate metrics (#10504)

* add intermediate ca metric routine

* add Gauge config for intermediate cert

* Stop metrics routine when stopping leader

* add changelog entry

* update changelog

Co-authored-by: Daniel Nephin <dnephin@hashicorp.com>

* use variables instead of a map

* go imports sort

* Add metrics for primary and secondary ca

* start metrics routine in the right DC

* add telemetry documentation

* update docs

* extract expiry fetching in a func

* merge metrics for primary and secondary into signing ca metric

Co-authored-by: Daniel Nephin <dnephin@hashicorp.com>
This commit is contained in:
Dhia Ayachi 2021-07-07 09:41:01 -04:00 committed by GitHub
parent 97831bf3dc
commit 6390e91be5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 89 additions and 15 deletions

3
.changelog/10504.txt Normal file
View File

@ -0,0 +1,3 @@
```release-note:enhancement
telemetry: added metrics to track certificate expiry.
```

View File

@ -37,6 +37,7 @@ func (s *Server) startConnectLeader(ctx context.Context) error {
s.caManager.Start(ctx)
s.leaderRoutineManager.Start(ctx, caRootPruningRoutineName, s.runCARootPruning)
s.leaderRoutineManager.Start(ctx, caRootMetricRoutineName, rootCAExpiryMonitor(s).monitor)
s.leaderRoutineManager.Start(ctx, caSigningMetricRoutineName, signingCAExpiryMonitor(s).monitor)
return s.startIntentionConfigEntryMigration(ctx)
}
@ -46,6 +47,8 @@ func (s *Server) stopConnectLeader() {
s.caManager.Stop()
s.leaderRoutineManager.Stop(intentionMigrationRoutineName)
s.leaderRoutineManager.Stop(caRootPruningRoutineName)
s.leaderRoutineManager.Stop(caRootMetricRoutineName)
s.leaderRoutineManager.Stop(caSigningMetricRoutineName)
// If the provider implements NeedsStop, we call Stop to perform any shutdown actions.
provider, _ := s.caManager.getCAProvider()

View File

@ -2,25 +2,34 @@ package consul
import (
"context"
"errors"
"fmt"
"time"
"github.com/hashicorp/consul/agent/connect/ca"
"github.com/hashicorp/consul/agent/connect"
"github.com/armon/go-metrics"
"github.com/armon/go-metrics/prometheus"
"github.com/hashicorp/go-hclog"
"github.com/hashicorp/consul/logging"
"github.com/hashicorp/go-hclog"
)
// metricsKeyMeshRootCAExpiry is the metric key for the gauge that reports the
// seconds remaining until the active service mesh root certificate expires.
var metricsKeyMeshRootCAExpiry = []string{"mesh", "active-root-ca", "expiry"}
// metricsKeyMeshActiveSigningCAExpiry is the metric key for the gauge that
// reports the seconds remaining until the active service mesh signing
// certificate expires.
var metricsKeyMeshActiveSigningCAExpiry = []string{"mesh", "active-signing-ca", "expiry"}
// CertExpirationGauges holds the Prometheus gauge definitions (name and help
// text) for the certificate expiry metrics keyed by the two variables above.
var CertExpirationGauges = []prometheus.GaugeDefinition{
{
Name: metricsKeyMeshRootCAExpiry,
// NOTE(review): two Help values follow; the first looks like the
// pre-change line of this diff and the second its replacement.
// Confirm that only one remains in the final file.
Help: "Seconds until the service mesh root certificate expires.",
Help: "Seconds until the service mesh root certificate expires. Updated every hour",
},
{
Name: metricsKeyMeshActiveSigningCAExpiry,
Help: "Seconds until the service mesh signing certificate expires. Updated every hour",
},
}
// NOTE(review): this repeats the declaration of metricsKeyMeshRootCAExpiry
// made before CertExpirationGauges; it looks like the old position of the
// variable in this diff. Confirm that only one declaration remains.
var metricsKeyMeshRootCAExpiry = []string{"mesh", "active-root-ca", "expiry"}
func rootCAExpiryMonitor(s *Server) certExpirationMonitor {
return certExpirationMonitor{
Key: metricsKeyMeshRootCAExpiry,
@ -29,20 +38,77 @@ func rootCAExpiryMonitor(s *Server) certExpirationMonitor {
},
Logger: s.logger.Named(logging.Connect),
Query: func() (time.Duration, error) {
state := s.fsm.State()
_, root, err := state.CARootActive(nil)
switch {
case err != nil:
return 0, fmt.Errorf("failed to retrieve root CA: %w", err)
case root == nil:
return 0, fmt.Errorf("no active root CA")
}
return time.Until(root.NotAfter), nil
return getRootCAExpiry(s)
},
}
}
// getRootCAExpiry looks up the active CA root in the server's FSM state and
// returns the time remaining until its NotAfter timestamp.
//
// An error is returned when the state lookup fails or when the state holds
// no active root.
func getRootCAExpiry(s *Server) (time.Duration, error) {
	_, activeRoot, err := s.fsm.State().CARootActive(nil)
	if err != nil {
		return 0, fmt.Errorf("failed to retrieve root CA: %w", err)
	}
	if activeRoot == nil {
		return 0, fmt.Errorf("no active root CA")
	}

	remaining := time.Until(activeRoot.NotAfter)
	return remaining, nil
}
// signingCAExpiryMonitor builds the certExpirationMonitor that publishes,
// under metricsKeyMeshActiveSigningCAExpiry and labelled with this server's
// datacenter, the time remaining until the active signing certificate
// expires.
//
// Which certificate is measured depends on where the server runs:
//   - secondary datacenter: always the active intermediate;
//   - primary datacenter: the active intermediate when the CA provider
//     implements ca.PrimaryUsesIntermediate, otherwise the active root.
func signingCAExpiryMonitor(s *Server) certExpirationMonitor {
	isPrimary := s.config.Datacenter == s.config.PrimaryDatacenter

	return certExpirationMonitor{
		Key: metricsKeyMeshActiveSigningCAExpiry,
		Labels: []metrics.Label{
			{Name: "datacenter", Value: s.config.Datacenter},
		},
		Logger: s.logger.Named(logging.Connect),
		Query: func() (time.Duration, error) {
			if !isPrimary {
				return getActiveIntermediateExpiry(s)
			}

			// The provider is fetched on every query rather than captured
			// once, so the check always runs against the current provider.
			// Only the provider value is needed here; the second result is
			// intentionally discarded, as in the original code.
			provider, _ := s.caManager.getCAProvider()

			// A provider that implements PrimaryUsesIntermediate is the one
			// whose intermediate expiry must be reported. The previous code
			// negated this check and so reported the wrong certificate in
			// both cases. A nil provider fails the assertion and falls back
			// to the root expiry.
			if _, ok := provider.(ca.PrimaryUsesIntermediate); ok {
				return getActiveIntermediateExpiry(s)
			}
			return getRootCAExpiry(s)
		},
	}
}
// getActiveIntermediateExpiry returns the time remaining until the active
// intermediate certificate expires. The active intermediate is the last
// entry in the IntermediateCerts stack of the active CA root held in the
// server's FSM state.
//
// An error is returned when the state lookup fails, when there is no active
// root, when the active root carries no intermediates, or when the
// intermediate cannot be parsed.
func getActiveIntermediateExpiry(s *Server) (time.Duration, error) {
	state := s.fsm.State()
	_, root, err := state.CARootActive(nil)
	switch {
	case err != nil:
		return 0, fmt.Errorf("failed to retrieve root CA: %w", err)
	case root == nil:
		// Without this guard the IntermediateCerts access below would
		// dereference a nil root. getRootCAExpiry guards the same case.
		return 0, fmt.Errorf("no active root CA")
	}

	// the CA used in a secondary DC is the active intermediate,
	// which is the last in the IntermediateCerts stack
	if len(root.IntermediateCerts) == 0 {
		return 0, errors.New("no intermediate available")
	}

	active := root.IntermediateCerts[len(root.IntermediateCerts)-1]
	cert, err := connect.ParseCert(active)
	if err != nil {
		return 0, fmt.Errorf("failed to parse active intermediate certificate: %w", err)
	}
	return time.Until(cert.NotAfter), nil
}
type certExpirationMonitor struct {
Key []string
Labels []metrics.Label

View File

@ -103,6 +103,7 @@ const (
aclUpgradeRoutineName = "legacy ACL token upgrade"
caRootPruningRoutineName = "CA root pruning"
caRootMetricRoutineName = "CA root expiration metric"
caSigningMetricRoutineName = "CA signing expiration metric"
configReplicationRoutineName = "config entry replication"
federationStateReplicationRoutineName = "federation state replication"
federationStateAntiEntropyRoutineName = "federation state anti-entropy"

View File

@ -479,6 +479,7 @@ These metrics give insight into the health of the cluster as a whole.
| `consul.catalog.connect.query-tags.<service>.<tags>` | Increments for each connect-based catalog query for the given service with the given tags. | queries | counter |
| `consul.catalog.connect.not-found.<service>` | Increments for each connect-based catalog query where the given service could not be found. | queries | counter |
| `consul.mesh.active-root-ca.expiry` | The number of seconds until the root CA expires, updated every hour. | seconds | gauge |
| `consul.mesh.active-signing-ca.expiry` | The number of seconds until the signing CA expires, updated every hour. | seconds | gauge |
## Connect Built-in Proxy Metrics