mirror of https://github.com/status-im/consul.git
log warning about certificate expiring sooner and with more details
The old setting of 24 hours was not enough time to deal with an expiring certificates. This change ups it to 28 days OR 40% of the full cert duration, whichever is shorter. It also adds details to the log message to indicate which certificate it is logging about and a suggested action.
This commit is contained in:
parent
d9c02c5761
commit
97173725b7
|
@ -19,8 +19,10 @@ import (
|
|||
"github.com/hashicorp/consul/logging"
|
||||
)
|
||||
|
||||
var metricsKeyMeshRootCAExpiry = []string{"mesh", "active-root-ca", "expiry"}
|
||||
var metricsKeyMeshActiveSigningCAExpiry = []string{"mesh", "active-signing-ca", "expiry"}
|
||||
var (
|
||||
metricsKeyMeshRootCAExpiry = []string{"mesh", "active-root-ca", "expiry"}
|
||||
metricsKeyMeshActiveSigningCAExpiry = []string{"mesh", "active-signing-ca", "expiry"}
|
||||
)
|
||||
|
||||
var LeaderCertExpirationGauges = []prometheus.GaugeDefinition{
|
||||
{
|
||||
|
@ -37,30 +39,31 @@ func rootCAExpiryMonitor(s *Server) CertExpirationMonitor {
|
|||
return CertExpirationMonitor{
|
||||
Key: metricsKeyMeshRootCAExpiry,
|
||||
Logger: s.logger.Named(logging.Connect),
|
||||
Query: func() (time.Duration, error) {
|
||||
Query: func() (time.Duration, time.Duration, error) {
|
||||
return getRootCAExpiry(s)
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func getRootCAExpiry(s *Server) (time.Duration, error) {
|
||||
func getRootCAExpiry(s *Server) (time.Duration, time.Duration, error) {
|
||||
state := s.fsm.State()
|
||||
_, root, err := state.CARootActive(nil)
|
||||
switch {
|
||||
case err != nil:
|
||||
return 0, fmt.Errorf("failed to retrieve root CA: %w", err)
|
||||
return 0, 0, fmt.Errorf("failed to retrieve root CA: %w", err)
|
||||
case root == nil:
|
||||
return 0, fmt.Errorf("no active root CA")
|
||||
return 0, 0, fmt.Errorf("no active root CA")
|
||||
}
|
||||
|
||||
return time.Until(root.NotAfter), nil
|
||||
lifetime := time.Since(root.NotBefore) + time.Until(root.NotAfter)
|
||||
return lifetime, time.Until(root.NotAfter), nil
|
||||
}
|
||||
|
||||
func signingCAExpiryMonitor(s *Server) CertExpirationMonitor {
|
||||
return CertExpirationMonitor{
|
||||
Key: metricsKeyMeshActiveSigningCAExpiry,
|
||||
Logger: s.logger.Named(logging.Connect),
|
||||
Query: func() (time.Duration, error) {
|
||||
Query: func() (time.Duration, time.Duration, error) {
|
||||
if s.caManager.isIntermediateUsedToSignLeaf() {
|
||||
return getActiveIntermediateExpiry(s)
|
||||
}
|
||||
|
@ -69,26 +72,28 @@ func signingCAExpiryMonitor(s *Server) CertExpirationMonitor {
|
|||
}
|
||||
}
|
||||
|
||||
func getActiveIntermediateExpiry(s *Server) (time.Duration, error) {
|
||||
func getActiveIntermediateExpiry(s *Server) (time.Duration, time.Duration, error) {
|
||||
state := s.fsm.State()
|
||||
_, root, err := state.CARootActive(nil)
|
||||
switch {
|
||||
case err != nil:
|
||||
return 0, fmt.Errorf("failed to retrieve root CA: %w", err)
|
||||
return 0, 0, fmt.Errorf("failed to retrieve root CA: %w", err)
|
||||
case root == nil:
|
||||
return 0, fmt.Errorf("no active root CA")
|
||||
return 0, 0, fmt.Errorf("no active root CA")
|
||||
}
|
||||
|
||||
// the CA used in a secondary DC is the active intermediate,
|
||||
// which is the last in the IntermediateCerts stack
|
||||
if len(root.IntermediateCerts) == 0 {
|
||||
return 0, errors.New("no intermediate available")
|
||||
return 0, 0, errors.New("no intermediate available")
|
||||
}
|
||||
cert, err := connect.ParseCert(root.IntermediateCerts[len(root.IntermediateCerts)-1])
|
||||
if err != nil {
|
||||
return 0, err
|
||||
return 0, 0, err
|
||||
}
|
||||
return time.Until(cert.NotAfter), nil
|
||||
|
||||
lifetime := time.Since(cert.NotBefore) + time.Until(cert.NotAfter)
|
||||
return lifetime, time.Until(cert.NotAfter), nil
|
||||
}
|
||||
|
||||
type CertExpirationMonitor struct {
|
||||
|
@ -99,9 +104,11 @@ type CertExpirationMonitor struct {
|
|||
// then the metrics will expire before they are emitted again.
|
||||
Labels []metrics.Label
|
||||
Logger hclog.Logger
|
||||
// Query is called at each interval. It should return the duration until the
|
||||
// certificate expires, or an error if the query failed.
|
||||
Query func() (time.Duration, error)
|
||||
// Query is called at each interval. It should return 2 durations, the full
|
||||
// lifespan of the certificate (NotBefore -> NotAfter) and the duration
|
||||
// until the certificate expires (Now -> NotAfter), or an error if the
|
||||
// query failed.
|
||||
Query func() (time.Duration, time.Duration, error)
|
||||
}
|
||||
|
||||
const certExpirationMonitorInterval = time.Hour
|
||||
|
@ -113,18 +120,37 @@ func (m CertExpirationMonitor) Monitor(ctx context.Context) error {
|
|||
logger := m.Logger.With("metric", strings.Join(m.Key, "."))
|
||||
|
||||
emitMetric := func() {
|
||||
d, err := m.Query()
|
||||
lifetime, untilAfter, err := m.Query()
|
||||
if err != nil {
|
||||
logger.Warn("failed to emit certificate expiry metric", "error", err)
|
||||
return
|
||||
}
|
||||
|
||||
if d < 24*time.Hour {
|
||||
logger.Warn("certificate will expire soon",
|
||||
"time_to_expiry", d, "expiration", time.Now().Add(d))
|
||||
if expiresSoon(lifetime, untilAfter) {
|
||||
key := strings.Join(m.Key, ":")
|
||||
switch key {
|
||||
case "mesh:active-root-ca:expiry":
|
||||
logger.Warn("root certificate will expire soon",
|
||||
"time_to_expiry", untilAfter,
|
||||
"expiration", time.Now().Add(untilAfter),
|
||||
"suggested_action", "manually rotate the root certificate",
|
||||
)
|
||||
case "mesh:active-signing-ca:expiry":
|
||||
logger.Warn("signing (intermediate) certificate will expire soon",
|
||||
"time_to_expiry", untilAfter,
|
||||
"expiration", time.Now().Add(untilAfter),
|
||||
"suggested_action", "check consul logs for rotation issues",
|
||||
)
|
||||
case "agent:tls:cert:expiry":
|
||||
logger.Warn("agent TLS certificate will expire soon",
|
||||
"time_to_expiry", untilAfter,
|
||||
"expiration", time.Now().Add(untilAfter),
|
||||
"suggested_action", "manually rotate this agent's certificate",
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
expiry := d / time.Second
|
||||
expiry := untilAfter / time.Second
|
||||
metrics.SetGaugeWithLabels(m.Key, float32(expiry), m.Labels)
|
||||
}
|
||||
|
||||
|
@ -153,3 +179,19 @@ func initLeaderMetrics() {
|
|||
metrics.SetGaugeWithLabels(g.Name, float32(math.NaN()), g.ConstLabels)
|
||||
}
|
||||
}
|
||||
|
||||
// expiresSoon checks to see if we are close enough to the cert expiring that
|
||||
// we should send out a WARN log message.
|
||||
// It returns true if the cert will expire within 28 days or 40% of the
|
||||
// certificate's total duration (whichever is shorter).
|
||||
func expiresSoon(lifetime, untilAfter time.Duration) bool {
|
||||
defaultPeriod := 28 * (24 * time.Hour) // 28 days
|
||||
fortyPercent := (lifetime / 10) * 4 // 40% of total duration
|
||||
|
||||
warningPeriod := defaultPeriod
|
||||
if fortyPercent < defaultPeriod {
|
||||
warningPeriod = fortyPercent
|
||||
}
|
||||
|
||||
return untilAfter < warningPeriod
|
||||
}
|
||||
|
|
|
@ -0,0 +1,37 @@
|
|||
package consul
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
const (
|
||||
day = time.Hour * 24
|
||||
year = day * 365
|
||||
)
|
||||
|
||||
func TestExpiresSoon(t *testing.T) {
|
||||
// ExpiresSoon() should return true if 'untilAfter' is <= 28 days
|
||||
// OR if 40% of lifetime if it is less than 28 days
|
||||
testCases := []struct {
|
||||
name string
|
||||
lifetime, untilAfter time.Duration
|
||||
expiresSoon bool
|
||||
}{
|
||||
{name: "base-pass", lifetime: year, untilAfter: year, expiresSoon: false},
|
||||
{name: "base-expire", lifetime: year, untilAfter: (day * 27), expiresSoon: true},
|
||||
{name: "expires", lifetime: (day * 70), untilAfter: (day * 20), expiresSoon: true},
|
||||
{name: "passes", lifetime: (day * 70), untilAfter: (day * 50), expiresSoon: false},
|
||||
{name: "just-expires", lifetime: (day * 70), untilAfter: (day * 27), expiresSoon: true},
|
||||
{name: "just-passes", lifetime: (day * 70), untilAfter: (day * 43), expiresSoon: false},
|
||||
{name: "40%-expire", lifetime: (day * 30), untilAfter: (day * 10), expiresSoon: true},
|
||||
{name: "40%-pass", lifetime: (day * 30), untilAfter: (day * 12), expiresSoon: false},
|
||||
}
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
if expiresSoon(tc.lifetime, tc.untilAfter) != tc.expiresSoon {
|
||||
t.Errorf("test case failed, should return `%t`", tc.expiresSoon)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
|
@ -30,17 +30,19 @@ func tlsCertExpirationMonitor(c *tlsutil.Configurator, logger hclog.Logger) cons
|
|||
return consul.CertExpirationMonitor{
|
||||
Key: metricsKeyAgentTLSCertExpiry,
|
||||
Logger: logger,
|
||||
Query: func() (time.Duration, error) {
|
||||
Query: func() (time.Duration, time.Duration, error) {
|
||||
raw := c.Cert()
|
||||
if raw == nil {
|
||||
return 0, fmt.Errorf("tls not enabled")
|
||||
return 0, 0, fmt.Errorf("tls not enabled")
|
||||
}
|
||||
|
||||
cert, err := x509.ParseCertificate(raw.Certificate[0])
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("failed to parse agent tls cert: %w", err)
|
||||
return 0, 0, fmt.Errorf("failed to parse agent tls cert: %w", err)
|
||||
}
|
||||
return time.Until(cert.NotAfter), nil
|
||||
|
||||
lifetime := time.Since(cert.NotBefore) + time.Until(cert.NotAfter)
|
||||
return lifetime, time.Until(cert.NotAfter), nil
|
||||
},
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue