log warning about certificate expiring sooner and with more details

The old threshold of 24 hours did not leave enough time to deal with an expiring certificate. This change raises it to 28 days OR 40% of the full certificate lifetime, whichever is shorter. It also adds details to the log message to indicate which certificate is about to expire, along with a suggested action.
This commit is contained in:
John Eikenberry 2023-04-07 20:38:07 +00:00 committed by GitHub
parent d9c02c5761
commit 97173725b7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 107 additions and 26 deletions

View File

@ -19,8 +19,10 @@ import (
"github.com/hashicorp/consul/logging" "github.com/hashicorp/consul/logging"
) )
var metricsKeyMeshRootCAExpiry = []string{"mesh", "active-root-ca", "expiry"} var (
var metricsKeyMeshActiveSigningCAExpiry = []string{"mesh", "active-signing-ca", "expiry"} metricsKeyMeshRootCAExpiry = []string{"mesh", "active-root-ca", "expiry"}
metricsKeyMeshActiveSigningCAExpiry = []string{"mesh", "active-signing-ca", "expiry"}
)
var LeaderCertExpirationGauges = []prometheus.GaugeDefinition{ var LeaderCertExpirationGauges = []prometheus.GaugeDefinition{
{ {
@ -37,30 +39,31 @@ func rootCAExpiryMonitor(s *Server) CertExpirationMonitor {
return CertExpirationMonitor{ return CertExpirationMonitor{
Key: metricsKeyMeshRootCAExpiry, Key: metricsKeyMeshRootCAExpiry,
Logger: s.logger.Named(logging.Connect), Logger: s.logger.Named(logging.Connect),
Query: func() (time.Duration, error) { Query: func() (time.Duration, time.Duration, error) {
return getRootCAExpiry(s) return getRootCAExpiry(s)
}, },
} }
} }
func getRootCAExpiry(s *Server) (time.Duration, error) { func getRootCAExpiry(s *Server) (time.Duration, time.Duration, error) {
state := s.fsm.State() state := s.fsm.State()
_, root, err := state.CARootActive(nil) _, root, err := state.CARootActive(nil)
switch { switch {
case err != nil: case err != nil:
return 0, fmt.Errorf("failed to retrieve root CA: %w", err) return 0, 0, fmt.Errorf("failed to retrieve root CA: %w", err)
case root == nil: case root == nil:
return 0, fmt.Errorf("no active root CA") return 0, 0, fmt.Errorf("no active root CA")
} }
return time.Until(root.NotAfter), nil lifetime := time.Since(root.NotBefore) + time.Until(root.NotAfter)
return lifetime, time.Until(root.NotAfter), nil
} }
func signingCAExpiryMonitor(s *Server) CertExpirationMonitor { func signingCAExpiryMonitor(s *Server) CertExpirationMonitor {
return CertExpirationMonitor{ return CertExpirationMonitor{
Key: metricsKeyMeshActiveSigningCAExpiry, Key: metricsKeyMeshActiveSigningCAExpiry,
Logger: s.logger.Named(logging.Connect), Logger: s.logger.Named(logging.Connect),
Query: func() (time.Duration, error) { Query: func() (time.Duration, time.Duration, error) {
if s.caManager.isIntermediateUsedToSignLeaf() { if s.caManager.isIntermediateUsedToSignLeaf() {
return getActiveIntermediateExpiry(s) return getActiveIntermediateExpiry(s)
} }
@ -69,26 +72,28 @@ func signingCAExpiryMonitor(s *Server) CertExpirationMonitor {
} }
} }
func getActiveIntermediateExpiry(s *Server) (time.Duration, error) { func getActiveIntermediateExpiry(s *Server) (time.Duration, time.Duration, error) {
state := s.fsm.State() state := s.fsm.State()
_, root, err := state.CARootActive(nil) _, root, err := state.CARootActive(nil)
switch { switch {
case err != nil: case err != nil:
return 0, fmt.Errorf("failed to retrieve root CA: %w", err) return 0, 0, fmt.Errorf("failed to retrieve root CA: %w", err)
case root == nil: case root == nil:
return 0, fmt.Errorf("no active root CA") return 0, 0, fmt.Errorf("no active root CA")
} }
// the CA used in a secondary DC is the active intermediate, // the CA used in a secondary DC is the active intermediate,
// which is the last in the IntermediateCerts stack // which is the last in the IntermediateCerts stack
if len(root.IntermediateCerts) == 0 { if len(root.IntermediateCerts) == 0 {
return 0, errors.New("no intermediate available") return 0, 0, errors.New("no intermediate available")
} }
cert, err := connect.ParseCert(root.IntermediateCerts[len(root.IntermediateCerts)-1]) cert, err := connect.ParseCert(root.IntermediateCerts[len(root.IntermediateCerts)-1])
if err != nil { if err != nil {
return 0, err return 0, 0, err
} }
return time.Until(cert.NotAfter), nil
lifetime := time.Since(cert.NotBefore) + time.Until(cert.NotAfter)
return lifetime, time.Until(cert.NotAfter), nil
} }
type CertExpirationMonitor struct { type CertExpirationMonitor struct {
@ -99,9 +104,11 @@ type CertExpirationMonitor struct {
// then the metrics will expire before they are emitted again. // then the metrics will expire before they are emitted again.
Labels []metrics.Label Labels []metrics.Label
Logger hclog.Logger Logger hclog.Logger
// Query is called at each interval. It should return the duration until the // Query is called at each interval. It should return 2 durations, the full
// certificate expires, or an error if the query failed. // lifespan of the certificate (NotBefore -> NotAfter) and the duration
Query func() (time.Duration, error) // until the certificate expires (Now -> NotAfter), or an error if the
// query failed.
Query func() (time.Duration, time.Duration, error)
} }
const certExpirationMonitorInterval = time.Hour const certExpirationMonitorInterval = time.Hour
@ -113,18 +120,37 @@ func (m CertExpirationMonitor) Monitor(ctx context.Context) error {
logger := m.Logger.With("metric", strings.Join(m.Key, ".")) logger := m.Logger.With("metric", strings.Join(m.Key, "."))
emitMetric := func() { emitMetric := func() {
d, err := m.Query() lifetime, untilAfter, err := m.Query()
if err != nil { if err != nil {
logger.Warn("failed to emit certificate expiry metric", "error", err) logger.Warn("failed to emit certificate expiry metric", "error", err)
return return
} }
if d < 24*time.Hour { if expiresSoon(lifetime, untilAfter) {
logger.Warn("certificate will expire soon", key := strings.Join(m.Key, ":")
"time_to_expiry", d, "expiration", time.Now().Add(d)) switch key {
case "mesh:active-root-ca:expiry":
logger.Warn("root certificate will expire soon",
"time_to_expiry", untilAfter,
"expiration", time.Now().Add(untilAfter),
"suggested_action", "manually rotate the root certificate",
)
case "mesh:active-signing-ca:expiry":
logger.Warn("signing (intermediate) certificate will expire soon",
"time_to_expiry", untilAfter,
"expiration", time.Now().Add(untilAfter),
"suggested_action", "check consul logs for rotation issues",
)
case "agent:tls:cert:expiry":
logger.Warn("agent TLS certificate will expire soon",
"time_to_expiry", untilAfter,
"expiration", time.Now().Add(untilAfter),
"suggested_action", "manually rotate this agent's certificate",
)
}
} }
expiry := d / time.Second expiry := untilAfter / time.Second
metrics.SetGaugeWithLabels(m.Key, float32(expiry), m.Labels) metrics.SetGaugeWithLabels(m.Key, float32(expiry), m.Labels)
} }
@ -153,3 +179,19 @@ func initLeaderMetrics() {
metrics.SetGaugeWithLabels(g.Name, float32(math.NaN()), g.ConstLabels) metrics.SetGaugeWithLabels(g.Name, float32(math.NaN()), g.ConstLabels)
} }
} }
// expiresSoon checks to see if we are close enough to the cert expiring that
// we should send out a WARN log message.
// It returns true if the cert will expire within 28 days or 40% of the
// certificate's total duration (whichever is shorter).
func expiresSoon(lifetime, untilAfter time.Duration) bool {
defaultPeriod := 28 * (24 * time.Hour) // 28 days
fortyPercent := (lifetime / 10) * 4 // 40% of total duration
warningPeriod := defaultPeriod
if fortyPercent < defaultPeriod {
warningPeriod = fortyPercent
}
return untilAfter < warningPeriod
}

View File

@ -0,0 +1,37 @@
package consul
import (
"testing"
"time"
)
// Coarse duration units that keep the test tables in this file readable.
const (
	// day is exactly 24 hours.
	day = 24 * time.Hour
	// year is 365 days; leap years are ignored.
	year = 365 * day
)
// TestExpiresSoon exercises expiresSoon with a table of certificate lifetimes
// and remaining durations.
//
// expiresSoon should return true when 'untilAfter' is strictly less than the
// warning window, where the window is 28 days or 40% of 'lifetime', whichever
// is shorter.
func TestExpiresSoon(t *testing.T) {
	testCases := []struct {
		name                 string
		lifetime, untilAfter time.Duration
		expiresSoon          bool
	}{
		{name: "base-pass", lifetime: year, untilAfter: year, expiresSoon: false},
		{name: "base-expire", lifetime: year, untilAfter: (day * 27), expiresSoon: true},
		{name: "expires", lifetime: (day * 70), untilAfter: (day * 20), expiresSoon: true},
		{name: "passes", lifetime: (day * 70), untilAfter: (day * 50), expiresSoon: false},
		{name: "just-expires", lifetime: (day * 70), untilAfter: (day * 27), expiresSoon: true},
		{name: "just-passes", lifetime: (day * 70), untilAfter: (day * 43), expiresSoon: false},
		{name: "40%-expire", lifetime: (day * 30), untilAfter: (day * 10), expiresSoon: true},
		{name: "40%-pass", lifetime: (day * 30), untilAfter: (day * 12), expiresSoon: false},
		// The comparison is strict: exactly 28 days remaining does not warn.
		{name: "28d-boundary", lifetime: year, untilAfter: (day * 28), expiresSoon: false},
		// A certificate that is already past NotAfter has a negative
		// remaining duration and must still trigger the warning.
		{name: "already-expired", lifetime: year, untilAfter: -day, expiresSoon: true},
	}

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			got := expiresSoon(tc.lifetime, tc.untilAfter)
			if got != tc.expiresSoon {
				// Report inputs and both values so a failure is
				// diagnosable without re-reading the table.
				t.Errorf("expiresSoon(%v, %v) = %t, want %t",
					tc.lifetime, tc.untilAfter, got, tc.expiresSoon)
			}
		})
	}
}

View File

@ -30,17 +30,19 @@ func tlsCertExpirationMonitor(c *tlsutil.Configurator, logger hclog.Logger) cons
return consul.CertExpirationMonitor{ return consul.CertExpirationMonitor{
Key: metricsKeyAgentTLSCertExpiry, Key: metricsKeyAgentTLSCertExpiry,
Logger: logger, Logger: logger,
Query: func() (time.Duration, error) { Query: func() (time.Duration, time.Duration, error) {
raw := c.Cert() raw := c.Cert()
if raw == nil { if raw == nil {
return 0, fmt.Errorf("tls not enabled") return 0, 0, fmt.Errorf("tls not enabled")
} }
cert, err := x509.ParseCertificate(raw.Certificate[0]) cert, err := x509.ParseCertificate(raw.Certificate[0])
if err != nil { if err != nil {
return 0, fmt.Errorf("failed to parse agent tls cert: %w", err) return 0, 0, fmt.Errorf("failed to parse agent tls cert: %w", err)
} }
return time.Until(cert.NotAfter), nil
lifetime := time.Since(cert.NotBefore) + time.Until(cert.NotAfter)
return lifetime, time.Until(cert.NotAfter), nil
}, },
} }
} }