mirror of https://github.com/status-im/consul.git
Merge pull request #10771 from hashicorp/dnephin/emit-telemetry-metrics-immediately
telemetry: improve cert expiry metrics
This commit is contained in:
commit
b57cae94de
|
@ -667,7 +667,7 @@ func (a *Agent) Start(ctx context.Context) error {
|
|||
}
|
||||
|
||||
if a.tlsConfigurator.Cert() != nil {
|
||||
m := consul.AgentTLSCertExpirationMonitor(a.tlsConfigurator, a.logger, a.config.Datacenter)
|
||||
m := tlsCertExpirationMonitor(a.tlsConfigurator, a.logger)
|
||||
go m.Monitor(&lib.StopChannelContext{StopCh: a.shutdownCh})
|
||||
}
|
||||
|
||||
|
|
|
@ -2,9 +2,9 @@ package consul
|
|||
|
||||
import (
|
||||
"context"
|
||||
"crypto/x509"
|
||||
"errors"
|
||||
"fmt"
|
||||
"math"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
|
@ -15,13 +15,12 @@ import (
|
|||
"github.com/hashicorp/consul/agent/connect"
|
||||
"github.com/hashicorp/consul/agent/connect/ca"
|
||||
"github.com/hashicorp/consul/logging"
|
||||
"github.com/hashicorp/consul/tlsutil"
|
||||
)
|
||||
|
||||
var metricsKeyMeshRootCAExpiry = []string{"mesh", "active-root-ca", "expiry"}
|
||||
var metricsKeyMeshActiveSigningCAExpiry = []string{"mesh", "active-signing-ca", "expiry"}
|
||||
|
||||
var CertExpirationGauges = []prometheus.GaugeDefinition{
|
||||
var LeaderCertExpirationGauges = []prometheus.GaugeDefinition{
|
||||
{
|
||||
Name: metricsKeyMeshRootCAExpiry,
|
||||
Help: "Seconds until the service mesh root certificate expires. Updated every hour",
|
||||
|
@ -30,18 +29,11 @@ var CertExpirationGauges = []prometheus.GaugeDefinition{
|
|||
Name: metricsKeyMeshActiveSigningCAExpiry,
|
||||
Help: "Seconds until the service mesh signing certificate expires. Updated every hour",
|
||||
},
|
||||
{
|
||||
Name: metricsKeyAgentTLSCertExpiry,
|
||||
Help: "Seconds until the agent tls certificate expires. Updated every hour",
|
||||
},
|
||||
}
|
||||
|
||||
func rootCAExpiryMonitor(s *Server) CertExpirationMonitor {
|
||||
return CertExpirationMonitor{
|
||||
Key: metricsKeyMeshRootCAExpiry,
|
||||
Labels: []metrics.Label{
|
||||
{Name: "datacenter", Value: s.config.Datacenter},
|
||||
},
|
||||
Key: metricsKeyMeshRootCAExpiry,
|
||||
Logger: s.logger.Named(logging.Connect),
|
||||
Query: func() (time.Duration, error) {
|
||||
return getRootCAExpiry(s)
|
||||
|
@ -66,10 +58,7 @@ func signingCAExpiryMonitor(s *Server) CertExpirationMonitor {
|
|||
isPrimary := s.config.Datacenter == s.config.PrimaryDatacenter
|
||||
if isPrimary {
|
||||
return CertExpirationMonitor{
|
||||
Key: metricsKeyMeshActiveSigningCAExpiry,
|
||||
Labels: []metrics.Label{
|
||||
{Name: "datacenter", Value: s.config.Datacenter},
|
||||
},
|
||||
Key: metricsKeyMeshActiveSigningCAExpiry,
|
||||
Logger: s.logger.Named(logging.Connect),
|
||||
Query: func() (time.Duration, error) {
|
||||
provider, _ := s.caManager.getCAProvider()
|
||||
|
@ -83,10 +72,7 @@ func signingCAExpiryMonitor(s *Server) CertExpirationMonitor {
|
|||
}
|
||||
|
||||
return CertExpirationMonitor{
|
||||
Key: metricsKeyMeshActiveSigningCAExpiry,
|
||||
Labels: []metrics.Label{
|
||||
{Name: "datacenter", Value: s.config.Datacenter},
|
||||
},
|
||||
Key: metricsKeyMeshActiveSigningCAExpiry,
|
||||
Logger: s.logger.Named(logging.Connect),
|
||||
Query: func() (time.Duration, error) {
|
||||
return getActiveIntermediateExpiry(s)
|
||||
|
@ -97,8 +83,11 @@ func signingCAExpiryMonitor(s *Server) CertExpirationMonitor {
|
|||
func getActiveIntermediateExpiry(s *Server) (time.Duration, error) {
|
||||
state := s.fsm.State()
|
||||
_, root, err := state.CARootActive(nil)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
switch {
|
||||
case err != nil:
|
||||
return 0, fmt.Errorf("failed to retrieve root CA: %w", err)
|
||||
case root == nil:
|
||||
return 0, fmt.Errorf("no active root CA")
|
||||
}
|
||||
|
||||
// the CA used in a secondary DC is the active intermediate,
|
||||
|
@ -114,7 +103,11 @@ func getActiveIntermediateExpiry(s *Server) (time.Duration, error) {
|
|||
}
|
||||
|
||||
type CertExpirationMonitor struct {
|
||||
Key []string
|
||||
Key []string
|
||||
// Labels to be emitted along with the metric. It is very important that these
|
||||
// labels be included in the pre-declaration as well. Otherwise, if
|
||||
// telemetry.prometheus_retention_time is less than certExpirationMonitorInterval
|
||||
// then the metrics will expire before they are emitted again.
|
||||
Labels []metrics.Label
|
||||
Logger hclog.Logger
|
||||
// Query is called at each interval. It should return the duration until the
|
||||
|
@ -130,51 +123,44 @@ func (m CertExpirationMonitor) Monitor(ctx context.Context) error {
|
|||
|
||||
logger := m.Logger.With("metric", strings.Join(m.Key, "."))
|
||||
|
||||
emitMetric := func() {
|
||||
d, err := m.Query()
|
||||
if err != nil {
|
||||
logger.Warn("failed to emit certificate expiry metric", "error", err)
|
||||
return
|
||||
}
|
||||
|
||||
if d < 24*time.Hour {
|
||||
logger.Warn("certificate will expire soon",
|
||||
"time_to_expiry", d, "expiration", time.Now().Add(d))
|
||||
}
|
||||
|
||||
expiry := d / time.Second
|
||||
metrics.SetGaugeWithLabels(m.Key, float32(expiry), m.Labels)
|
||||
}
|
||||
|
||||
// emit the metric immediately so that if a cert was just updated the
|
||||
// new metric will be updated to the new expiration time.
|
||||
emitMetric()
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
// "Zero-out" the metric on exit so that when prometheus scrapes this
|
||||
// metric from a non-leader, it does not get a stale value.
|
||||
metrics.SetGaugeWithLabels(m.Key, float32(math.NaN()), m.Labels)
|
||||
return nil
|
||||
case <-ticker.C:
|
||||
d, err := m.Query()
|
||||
if err != nil {
|
||||
logger.Warn("failed to emit certificate expiry metric", "error", err)
|
||||
continue
|
||||
}
|
||||
|
||||
if d < 24*time.Hour {
|
||||
logger.Warn("certificate will expire soon",
|
||||
"time_to_expiry", d, "expiration", time.Now().Add(d))
|
||||
}
|
||||
|
||||
expiry := d / time.Second
|
||||
metrics.SetGaugeWithLabels(m.Key, float32(expiry), m.Labels)
|
||||
emitMetric()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
var metricsKeyAgentTLSCertExpiry = []string{"agent", "tls", "cert", "expiry"}
|
||||
|
||||
// AgentTLSCertExpirationMonitor returns a CertExpirationMonitor which will
|
||||
// monitor the expiration of the certificate used for agent TLS.
|
||||
func AgentTLSCertExpirationMonitor(c *tlsutil.Configurator, logger hclog.Logger, dc string) CertExpirationMonitor {
|
||||
return CertExpirationMonitor{
|
||||
Key: metricsKeyAgentTLSCertExpiry,
|
||||
Labels: []metrics.Label{
|
||||
{Name: "node", Value: c.Base().NodeName},
|
||||
{Name: "datacenter", Value: dc},
|
||||
},
|
||||
Logger: logger,
|
||||
Query: func() (time.Duration, error) {
|
||||
raw := c.Cert()
|
||||
if raw == nil {
|
||||
return 0, fmt.Errorf("tls not enabled")
|
||||
}
|
||||
|
||||
cert, err := x509.ParseCertificate(raw.Certificate[0])
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("failed to parse agent tls cert: %w", err)
|
||||
}
|
||||
return time.Until(cert.NotAfter), nil
|
||||
},
|
||||
// initLeaderMetrics sets all metrics that are emitted only on leaders to a NaN
|
||||
// value so that they don't incorrectly report 0 when a server starts as a
|
||||
// follower.
|
||||
func initLeaderMetrics() {
|
||||
for _, g := range LeaderCertExpirationGauges {
|
||||
metrics.SetGaugeWithLabels(g.Name, float32(math.NaN()), g.ConstLabels)
|
||||
}
|
||||
}
|
||||
|
|
|
@ -389,6 +389,8 @@ func NewServer(config *Config, flat Deps) (*Server, error) {
|
|||
return nil, err
|
||||
}
|
||||
|
||||
initLeaderMetrics()
|
||||
|
||||
s.rpcLimiter.Store(rate.NewLimiter(config.RPCRateLimit, config.RPCMaxBurst))
|
||||
|
||||
configReplicatorConfig := ReplicatorConfig{
|
||||
|
|
|
@ -0,0 +1,43 @@
|
|||
package agent
|
||||
|
||||
import (
|
||||
"crypto/x509"
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"github.com/armon/go-metrics/prometheus"
|
||||
"github.com/hashicorp/go-hclog"
|
||||
|
||||
"github.com/hashicorp/consul/agent/consul"
|
||||
"github.com/hashicorp/consul/tlsutil"
|
||||
)
|
||||
|
||||
// CertExpirationGauges declares the gauge definition for the agent TLS
// certificate expiry metric so it can be registered with the prometheus sink.
var CertExpirationGauges = []prometheus.GaugeDefinition{
|
||||
{
|
||||
Name: metricsKeyAgentTLSCertExpiry,
|
||||
Help: "Seconds until the agent tls certificate expires. Updated every hour",
|
||||
},
|
||||
}
|
||||
|
||||
// metricsKeyAgentTLSCertExpiry is the metric key under which the agent TLS
// certificate expiry gauge is declared and emitted.
var metricsKeyAgentTLSCertExpiry = []string{"agent", "tls", "cert", "expiry"}
|
||||
|
||||
// tlsCertExpirationMonitor returns a CertExpirationMonitor which will
|
||||
// monitor the expiration of the certificate used for agent TLS.
|
||||
func tlsCertExpirationMonitor(c *tlsutil.Configurator, logger hclog.Logger) consul.CertExpirationMonitor {
|
||||
return consul.CertExpirationMonitor{
|
||||
Key: metricsKeyAgentTLSCertExpiry,
|
||||
Logger: logger,
|
||||
Query: func() (time.Duration, error) {
|
||||
raw := c.Cert()
|
||||
if raw == nil {
|
||||
return 0, fmt.Errorf("tls not enabled")
|
||||
}
|
||||
|
||||
cert, err := x509.ParseCertificate(raw.Certificate[0])
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("failed to parse agent tls cert: %w", err)
|
||||
}
|
||||
return time.Until(cert.NotAfter), nil
|
||||
},
|
||||
}
|
||||
}
|
|
@ -1,20 +1,30 @@
|
|||
package agent
|
||||
|
||||
import (
|
||||
"github.com/stretchr/testify/require"
|
||||
"crypto/x509"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/hashicorp/consul/sdk/testutil"
|
||||
"github.com/hashicorp/consul/testrpc"
|
||||
"github.com/hashicorp/consul/tlsutil"
|
||||
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func checkForShortTesting(t *testing.T) {
|
||||
// skipIfShortTesting skips the calling test when the test binary is run with
// the -short flag. It is intended for tests that are too slow for short mode.
func skipIfShortTesting(t *testing.T) {
	// Mark this function as a helper so the skip message is attributed to the
	// calling test's line rather than to this function.
	t.Helper()
	if testing.Short() {
		t.Skip("too slow for testing.Short")
	}
}
|
||||
|
||||
func recordPromMetrics(t *testing.T, a *TestAgent, respRec *httptest.ResponseRecorder) {
|
||||
t.Helper()
|
||||
req, err := http.NewRequest("GET", "/v1/agent/metrics?format=prometheus", nil)
|
||||
require.NoError(t, err, "Failed to generate new http request.")
|
||||
|
||||
|
@ -49,7 +59,7 @@ func assertMetricNotExists(t *testing.T, respRec *httptest.ResponseRecorder, met
|
|||
// TestHTTPHandlers_AgentMetrics_ConsulAutopilot_Prometheus adds testing around
|
||||
// the published autopilot metrics on https://www.consul.io/docs/agent/telemetry#autopilot
|
||||
func TestHTTPHandlers_AgentMetrics_ConsulAutopilot_Prometheus(t *testing.T) {
|
||||
checkForShortTesting(t)
|
||||
skipIfShortTesting(t)
|
||||
// This test cannot use t.Parallel() since we modify global state, ie the global metrics instance
|
||||
|
||||
t.Run("Check consul_autopilot_* are not emitted metrics on clients", func(t *testing.T) {
|
||||
|
@ -95,3 +105,108 @@ func TestHTTPHandlers_AgentMetrics_ConsulAutopilot_Prometheus(t *testing.T) {
|
|||
assertMetricExistsWithValue(t, respRec, "agent_2_autopilot_failure_tolerance", "NaN")
|
||||
})
|
||||
}
|
||||
|
||||
// TestHTTPHandlers_AgentMetrics_TLSCertExpiry_Prometheus starts a test agent
// configured with a freshly generated CA and a certificate valid for 20 days,
// then checks that the prometheus metrics output contains the agent TLS
// certificate expiry gauge. 20 days is 1,728,000 seconds, which is presumably
// why the assertion only matches the "1.7" prefix of the emitted value.
func TestHTTPHandlers_AgentMetrics_TLSCertExpiry_Prometheus(t *testing.T) {
|
||||
skipIfShortTesting(t)
|
||||
// This test cannot use t.Parallel() since we modify global state, ie the global metrics instance
|
||||
|
||||
dir := testutil.TempDir(t, "ca")
|
||||
caPEM, caPK, err := tlsutil.GenerateCA(tlsutil.CAOpts{Days: 20, Domain: "consul"})
|
||||
require.NoError(t, err)
|
||||
|
||||
caPath := filepath.Join(dir, "ca.pem")
|
||||
err = ioutil.WriteFile(caPath, []byte(caPEM), 0600)
|
||||
require.NoError(t, err)
|
||||
|
||||
signer, err := tlsutil.ParseSigner(caPK)
|
||||
require.NoError(t, err)
|
||||
|
||||
pem, key, err := tlsutil.GenerateCert(tlsutil.CertOpts{
|
||||
Signer: signer,
|
||||
CA: caPEM,
|
||||
Name: "server.dc1.consul",
|
||||
Days: 20,
|
||||
ExtKeyUsage: []x509.ExtKeyUsage{x509.ExtKeyUsageClientAuth},
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
certPath := filepath.Join(dir, "cert.pem")
|
||||
err = ioutil.WriteFile(certPath, []byte(pem), 0600)
|
||||
require.NoError(t, err)
|
||||
|
||||
keyPath := filepath.Join(dir, "cert.key")
|
||||
err = ioutil.WriteFile(keyPath, []byte(key), 0600)
|
||||
require.NoError(t, err)
|
||||
|
||||
hcl := fmt.Sprintf(`
|
||||
telemetry = {
|
||||
prometheus_retention_time = "5s",
|
||||
disable_hostname = true
|
||||
metrics_prefix = "agent_3"
|
||||
}
|
||||
ca_file = "%s"
|
||||
cert_file = "%s"
|
||||
key_file = "%s"
|
||||
`, caPath, certPath, keyPath)
|
||||
|
||||
a := StartTestAgent(t, TestAgent{HCL: hcl})
|
||||
defer a.Shutdown()
|
||||
|
||||
respRec := httptest.NewRecorder()
|
||||
recordPromMetrics(t, a, respRec)
|
||||
|
||||
require.Contains(t, respRec.Body.String(), "agent_3_agent_tls_cert_expiry 1.7")
|
||||
}
|
||||
|
||||
// TestHTTPHandlers_AgentMetrics_CACertExpiry_Prometheus checks the mesh root
// and signing CA expiry gauges in the prometheus metrics output for two
// scenarios: an agent started with bootstrap = false, where both gauges must
// read NaN, and an agent that has waited for a leader in dc1, where both
// gauges must carry a value beginning with 3.15.
func TestHTTPHandlers_AgentMetrics_CACertExpiry_Prometheus(t *testing.T) {
|
||||
skipIfShortTesting(t)
|
||||
// This test cannot use t.Parallel() since we modify global state, ie the global metrics instance
|
||||
|
||||
t.Run("non-leader emits NaN", func(t *testing.T) {
|
||||
hcl := `
|
||||
telemetry = {
|
||||
prometheus_retention_time = "5s",
|
||||
disable_hostname = true
|
||||
metrics_prefix = "agent_4"
|
||||
}
|
||||
connect {
|
||||
enabled = true
|
||||
}
|
||||
bootstrap = false
|
||||
`
|
||||
|
||||
a := StartTestAgent(t, TestAgent{HCL: hcl})
|
||||
defer a.Shutdown()
|
||||
|
||||
respRec := httptest.NewRecorder()
|
||||
recordPromMetrics(t, a, respRec)
|
||||
|
||||
require.Contains(t, respRec.Body.String(), "agent_4_mesh_active_root_ca_expiry NaN")
|
||||
require.Contains(t, respRec.Body.String(), "agent_4_mesh_active_signing_ca_expiry NaN")
|
||||
})
|
||||
|
||||
t.Run("leader emits a value", func(t *testing.T) {
|
||||
hcl := `
|
||||
telemetry = {
|
||||
prometheus_retention_time = "5s",
|
||||
disable_hostname = true
|
||||
metrics_prefix = "agent_5"
|
||||
}
|
||||
connect {
|
||||
enabled = true
|
||||
}
|
||||
`
|
||||
|
||||
a := StartTestAgent(t, TestAgent{HCL: hcl})
|
||||
defer a.Shutdown()
|
||||
testrpc.WaitForLeader(t, a.RPC, "dc1")
|
||||
|
||||
respRec := httptest.NewRecorder()
|
||||
recordPromMetrics(t, a, respRec)
|
||||
|
||||
out := respRec.Body.String()
|
||||
require.Contains(t, out, "agent_5_mesh_active_root_ca_expiry 3.15")
|
||||
require.Contains(t, out, "agent_5_mesh_active_signing_ca_expiry 3.15")
|
||||
})
|
||||
|
||||
}
|
||||
|
|
|
@ -211,14 +211,16 @@ func getPrometheusDefs(cfg lib.TelemetryConfig, isServer bool) ([]prometheus.Gau
|
|||
xds.StatsGauges,
|
||||
usagemetrics.Gauges,
|
||||
consul.ReplicationGauges,
|
||||
consul.CertExpirationGauges,
|
||||
CertExpirationGauges,
|
||||
Gauges,
|
||||
raftGauges,
|
||||
}
|
||||
|
||||
// TODO(ffmmm): conditionally add only leader specific metrics to gauges, counters, summaries, etc
|
||||
if isServer {
|
||||
gauges = append(gauges, consul.AutopilotGauges)
|
||||
gauges = append(gauges,
|
||||
consul.AutopilotGauges,
|
||||
consul.LeaderCertExpirationGauges)
|
||||
}
|
||||
|
||||
// Flatten definitions
|
||||
|
|
Loading…
Reference in New Issue