consul/agent/consul/leader_metrics.go
hashicorp-copywrite[bot] 5fb9df1640
[COMPLIANCE] License changes (#18443)
* Adding explicit MPL license for sub-package

This directory and its subdirectories (packages) contain files licensed with the MPLv2 `LICENSE` file in this directory and are intentionally licensed separately from the BSL `LICENSE` file at the root of this repository.

* Adding explicit MPL license for sub-package

This directory and its subdirectories (packages) contain files licensed with the MPLv2 `LICENSE` file in this directory and are intentionally licensed separately from the BSL `LICENSE` file at the root of this repository.

* Updating the license from MPL to Business Source License

Going forward, this project will be licensed under the Business Source License v1.1. Please see our blog post for more details at <Blog URL>, FAQ at www.hashicorp.com/licensing-faq, and details of the license at www.hashicorp.com/bsl.

* add missing license headers

* Update copyright file headers to BUSL-1.1

* Update copyright file headers to BUSL-1.1

* Update copyright file headers to BUSL-1.1

* Update copyright file headers to BUSL-1.1

* Update copyright file headers to BUSL-1.1

* Update copyright file headers to BUSL-1.1

* Update copyright file headers to BUSL-1.1

* Update copyright file headers to BUSL-1.1

* Update copyright file headers to BUSL-1.1

* Update copyright file headers to BUSL-1.1

* Update copyright file headers to BUSL-1.1

* Update copyright file headers to BUSL-1.1

* Update copyright file headers to BUSL-1.1

* Update copyright file headers to BUSL-1.1

* Update copyright file headers to BUSL-1.1

---------

Co-authored-by: hashicorp-copywrite[bot] <110428419+hashicorp-copywrite[bot]@users.noreply.github.com>
2023-08-11 09:12:13 -04:00

198 lines
6.0 KiB
Go

// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: BUSL-1.1
package consul
import (
"context"
"errors"
"fmt"
"math"
"strings"
"time"
"github.com/armon/go-metrics"
"github.com/armon/go-metrics/prometheus"
"github.com/hashicorp/go-hclog"
"github.com/hashicorp/consul/agent/connect"
"github.com/hashicorp/consul/logging"
)
var (
metricsKeyMeshRootCAExpiry = []string{"mesh", "active-root-ca", "expiry"}
metricsKeyMeshActiveSigningCAExpiry = []string{"mesh", "active-signing-ca", "expiry"}
)
var LeaderCertExpirationGauges = []prometheus.GaugeDefinition{
{
Name: metricsKeyMeshRootCAExpiry,
Help: "Seconds until the service mesh root certificate expires. Updated every hour",
},
{
Name: metricsKeyMeshActiveSigningCAExpiry,
Help: "Seconds until the service mesh signing certificate expires. Updated every hour",
},
}
func rootCAExpiryMonitor(s *Server) CertExpirationMonitor {
return CertExpirationMonitor{
Key: metricsKeyMeshRootCAExpiry,
Logger: s.logger.Named(logging.Connect),
Query: func() (time.Duration, time.Duration, error) {
return getRootCAExpiry(s)
},
}
}
func getRootCAExpiry(s *Server) (time.Duration, time.Duration, error) {
state := s.fsm.State()
_, root, err := state.CARootActive(nil)
switch {
case err != nil:
return 0, 0, fmt.Errorf("failed to retrieve root CA: %w", err)
case root == nil:
return 0, 0, fmt.Errorf("no active root CA")
}
lifetime := time.Since(root.NotBefore) + time.Until(root.NotAfter)
return lifetime, time.Until(root.NotAfter), nil
}
func signingCAExpiryMonitor(s *Server) CertExpirationMonitor {
return CertExpirationMonitor{
Key: metricsKeyMeshActiveSigningCAExpiry,
Logger: s.logger.Named(logging.Connect),
Query: func() (time.Duration, time.Duration, error) {
if s.caManager.isIntermediateUsedToSignLeaf() {
return getActiveIntermediateExpiry(s)
}
return getRootCAExpiry(s)
},
}
}
func getActiveIntermediateExpiry(s *Server) (time.Duration, time.Duration, error) {
state := s.fsm.State()
_, root, err := state.CARootActive(nil)
switch {
case err != nil:
return 0, 0, fmt.Errorf("failed to retrieve root CA: %w", err)
case root == nil:
return 0, 0, fmt.Errorf("no active root CA")
}
// the CA used in a secondary DC is the active intermediate,
// which is the last in the IntermediateCerts stack
if len(root.IntermediateCerts) == 0 {
return 0, 0, errors.New("no intermediate available")
}
cert, err := connect.ParseCert(root.IntermediateCerts[len(root.IntermediateCerts)-1])
if err != nil {
return 0, 0, err
}
lifetime := time.Since(cert.NotBefore) + time.Until(cert.NotAfter)
return lifetime, time.Until(cert.NotAfter), nil
}
type CertExpirationMonitor struct {
Key []string
// Labels to be emitted along with the metric. It is very important that these
// labels be included in the pre-declaration as well. Otherwise, if
// telemetry.prometheus_retention_time is less than certExpirationMonitorInterval
// then the metrics will expire before they are emitted again.
Labels []metrics.Label
Logger hclog.Logger
// Query is called at each interval. It should return 2 durations, the full
// lifespan of the certificate (NotBefore -> NotAfter) and the duration
// until the certificate expires (Now -> NotAfter), or an error if the
// query failed.
Query func() (time.Duration, time.Duration, error)
}
const certExpirationMonitorInterval = time.Hour
func (m CertExpirationMonitor) Monitor(ctx context.Context) error {
ticker := time.NewTicker(certExpirationMonitorInterval)
defer ticker.Stop()
logger := m.Logger.With("metric", strings.Join(m.Key, "."))
emitMetric := func() {
lifetime, untilAfter, err := m.Query()
if err != nil {
logger.Warn("failed to emit certificate expiry metric", "error", err)
return
}
if expiresSoon(lifetime, untilAfter) {
key := strings.Join(m.Key, ":")
switch key {
case "mesh:active-root-ca:expiry":
logger.Warn("root certificate will expire soon",
"time_to_expiry", untilAfter,
"expiration", time.Now().Add(untilAfter),
"suggested_action", "manually rotate the root certificate",
)
case "mesh:active-signing-ca:expiry":
logger.Warn("signing (intermediate) certificate will expire soon",
"time_to_expiry", untilAfter,
"expiration", time.Now().Add(untilAfter),
"suggested_action", "check consul logs for rotation issues",
)
case "agent:tls:cert:expiry":
logger.Warn("agent TLS certificate will expire soon",
"time_to_expiry", untilAfter,
"expiration", time.Now().Add(untilAfter),
"suggested_action", "manually rotate this agent's certificate",
)
}
}
expiry := untilAfter / time.Second
metrics.SetGaugeWithLabels(m.Key, float32(expiry), m.Labels)
}
// emit the metric immediately so that if a cert was just updated the
// new metric will be updated to the new expiration time.
emitMetric()
for {
select {
case <-ctx.Done():
// "Zero-out" the metric on exit so that when prometheus scrapes this
// metric from a non-leader, it does not get a stale value.
metrics.SetGaugeWithLabels(m.Key, float32(math.NaN()), m.Labels)
return nil
case <-ticker.C:
emitMetric()
}
}
}
// initLeaderMetrics sets all metrics that are emitted only on leaders to a NaN
// value so that they don't incorrectly report 0 when a server starts as a
// follower.
func initLeaderMetrics() {
for _, g := range LeaderCertExpirationGauges {
metrics.SetGaugeWithLabels(g.Name, float32(math.NaN()), g.ConstLabels)
}
}
// expiresSoon checks to see if we are close enough to the cert expiring that
// we should send out a WARN log message.
// It returns true if the cert will expire within 28 days or 40% of the
// certificate's total duration (whichever is shorter).
func expiresSoon(lifetime, untilAfter time.Duration) bool {
defaultPeriod := 28 * (24 * time.Hour) // 28 days
fortyPercent := (lifetime / 10) * 4 // 40% of total duration
warningPeriod := defaultPeriod
if fortyPercent < defaultPeriod {
warningPeriod = fortyPercent
}
return untilAfter < warningPeriod
}