Merge pull request #10771 from hashicorp/dnephin/emit-telemetry-metrics-immediately

telemetry: improve cert expiry metrics
Daniel Nephin 2021-11-01 18:31:03 -04:00 committed by GitHub
commit b57cae94de
6 changed files with 213 additions and 65 deletions


@@ -667,7 +667,7 @@ func (a *Agent) Start(ctx context.Context) error {
}
if a.tlsConfigurator.Cert() != nil {
m := consul.AgentTLSCertExpirationMonitor(a.tlsConfigurator, a.logger, a.config.Datacenter)
m := tlsCertExpirationMonitor(a.tlsConfigurator, a.logger)
go m.Monitor(&lib.StopChannelContext{StopCh: a.shutdownCh})
}


@@ -2,9 +2,9 @@ package consul
import (
"context"
"crypto/x509"
"errors"
"fmt"
"math"
"strings"
"time"
@@ -15,13 +15,12 @@ import (
"github.com/hashicorp/consul/agent/connect"
"github.com/hashicorp/consul/agent/connect/ca"
"github.com/hashicorp/consul/logging"
"github.com/hashicorp/consul/tlsutil"
)
var metricsKeyMeshRootCAExpiry = []string{"mesh", "active-root-ca", "expiry"}
var metricsKeyMeshActiveSigningCAExpiry = []string{"mesh", "active-signing-ca", "expiry"}
var CertExpirationGauges = []prometheus.GaugeDefinition{
var LeaderCertExpirationGauges = []prometheus.GaugeDefinition{
{
Name: metricsKeyMeshRootCAExpiry,
Help: "Seconds until the service mesh root certificate expires. Updated every hour",
@@ -30,18 +29,11 @@ var CertExpirationGauges = []prometheus.GaugeDefinition{
Name: metricsKeyMeshActiveSigningCAExpiry,
Help: "Seconds until the service mesh signing certificate expires. Updated every hour",
},
{
Name: metricsKeyAgentTLSCertExpiry,
Help: "Seconds until the agent tls certificate expires. Updated every hour",
},
}
func rootCAExpiryMonitor(s *Server) CertExpirationMonitor {
return CertExpirationMonitor{
Key: metricsKeyMeshRootCAExpiry,
Labels: []metrics.Label{
{Name: "datacenter", Value: s.config.Datacenter},
},
Logger: s.logger.Named(logging.Connect),
Query: func() (time.Duration, error) {
return getRootCAExpiry(s)
@@ -67,9 +59,6 @@ func signingCAExpiryMonitor(s *Server) CertExpirationMonitor {
if isPrimary {
return CertExpirationMonitor{
Key: metricsKeyMeshActiveSigningCAExpiry,
Labels: []metrics.Label{
{Name: "datacenter", Value: s.config.Datacenter},
},
Logger: s.logger.Named(logging.Connect),
Query: func() (time.Duration, error) {
provider, _ := s.caManager.getCAProvider()
@@ -84,9 +73,6 @@ func signingCAExpiryMonitor(s *Server) CertExpirationMonitor {
return CertExpirationMonitor{
Key: metricsKeyMeshActiveSigningCAExpiry,
Labels: []metrics.Label{
{Name: "datacenter", Value: s.config.Datacenter},
},
Logger: s.logger.Named(logging.Connect),
Query: func() (time.Duration, error) {
return getActiveIntermediateExpiry(s)
@@ -97,8 +83,11 @@ func signingCAExpiryMonitor(s *Server) CertExpirationMonitor {
func getActiveIntermediateExpiry(s *Server) (time.Duration, error) {
state := s.fsm.State()
_, root, err := state.CARootActive(nil)
if err != nil {
return 0, err
switch {
case err != nil:
return 0, fmt.Errorf("failed to retrieve root CA: %w", err)
case root == nil:
return 0, fmt.Errorf("no active root CA")
}
// the CA used in a secondary DC is the active intermediate,
@@ -115,6 +104,10 @@ func getActiveIntermediateExpiry(s *Server) (time.Duration, error) {
type CertExpirationMonitor struct {
Key []string
// Labels to be emitted along with the metric. It is very important that these
// labels be included in the pre-declaration as well. Otherwise, if
// telemetry.prometheus_retention_time is less than certExpirationMonitorInterval
// then the metrics will expire before they are emitted again.
Labels []metrics.Label
Logger hclog.Logger
// Query is called at each interval. It should return the duration until the
@@ -130,15 +123,11 @@ func (m CertExpirationMonitor) Monitor(ctx context.Context) error {
logger := m.Logger.With("metric", strings.Join(m.Key, "."))
for {
select {
case <-ctx.Done():
return nil
case <-ticker.C:
emitMetric := func() {
d, err := m.Query()
if err != nil {
logger.Warn("failed to emit certificate expiry metric", "error", err)
continue
return
}
if d < 24*time.Hour {
@@ -149,32 +138,29 @@ func (m CertExpirationMonitor) Monitor(ctx context.Context) error {
expiry := d / time.Second
metrics.SetGaugeWithLabels(m.Key, float32(expiry), m.Labels)
}
// emit the metric immediately so that if a cert was just updated the
// new metric will be updated to the new expiration time.
emitMetric()
for {
select {
case <-ctx.Done():
// "Zero-out" the metric on exit so that when prometheus scrapes this
// metric from a non-leader, it does not get a stale value.
metrics.SetGaugeWithLabels(m.Key, float32(math.NaN()), m.Labels)
return nil
case <-ticker.C:
emitMetric()
}
}
}
var metricsKeyAgentTLSCertExpiry = []string{"agent", "tls", "cert", "expiry"}
// AgentTLSCertExpirationMonitor returns a CertExpirationMonitor which will
// monitor the expiration of the certificate used for agent TLS.
func AgentTLSCertExpirationMonitor(c *tlsutil.Configurator, logger hclog.Logger, dc string) CertExpirationMonitor {
return CertExpirationMonitor{
Key: metricsKeyAgentTLSCertExpiry,
Labels: []metrics.Label{
{Name: "node", Value: c.Base().NodeName},
{Name: "datacenter", Value: dc},
},
Logger: logger,
Query: func() (time.Duration, error) {
raw := c.Cert()
if raw == nil {
return 0, fmt.Errorf("tls not enabled")
}
cert, err := x509.ParseCertificate(raw.Certificate[0])
if err != nil {
return 0, fmt.Errorf("failed to parse agent tls cert: %w", err)
}
return time.Until(cert.NotAfter), nil
},
// initLeaderMetrics sets all metrics that are emitted only on leaders to a NaN
// value so that they don't incorrectly report 0 when a server starts as a
// follower.
func initLeaderMetrics() {
for _, g := range LeaderCertExpirationGauges {
metrics.SetGaugeWithLabels(g.Name, float32(math.NaN()), g.ConstLabels)
}
}
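To make the Labels comment on CertExpirationMonitor concrete: a monitor's Labels must mirror the ConstLabels of the gauge it was pre-declared with, otherwise the labeled series can expire from the prometheus sink between emissions when telemetry.prometheus_retention_time is shorter than the monitor interval. The sketch below is an illustration only, not part of this change; the metric key, the "datacenter"/"dc1" label, and the example* names are hypothetical.

package example

import (
	"time"

	"github.com/armon/go-metrics"
	"github.com/armon/go-metrics/prometheus"
	"github.com/hashicorp/go-hclog"

	"github.com/hashicorp/consul/agent/consul"
)

// exampleCertExpiry is a hypothetical metric key used only for illustration.
var exampleCertExpiry = []string{"example", "cert", "expiry"}

// The pre-declared gauge must carry every label the monitor emits with.
// Otherwise, when telemetry.prometheus_retention_time is shorter than the
// monitor interval, the labeled series expires from the prometheus sink
// before the next emission and scrapes in between see no value.
var exampleGauges = []prometheus.GaugeDefinition{
	{
		Name:        exampleCertExpiry,
		Help:        "Seconds until the example certificate expires.",
		ConstLabels: []metrics.Label{{Name: "datacenter", Value: "dc1"}},
	},
}

func exampleExpiryMonitor(logger hclog.Logger, query func() (time.Duration, error)) consul.CertExpirationMonitor {
	return consul.CertExpirationMonitor{
		Key: exampleCertExpiry,
		// Must mirror ConstLabels in exampleGauges above.
		Labels: []metrics.Label{{Name: "datacenter", Value: "dc1"}},
		Logger: logger,
		Query:  query,
	}
}

In the agent, such gauge definitions take effect by being included in one of the lists that getPrometheusDefs hands to the prometheus sink (see the last hunk below).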


@@ -389,6 +389,8 @@ func NewServer(config *Config, flat Deps) (*Server, error) {
return nil, err
}
initLeaderMetrics()
s.rpcLimiter.Store(rate.NewLimiter(config.RPCRateLimit, config.RPCMaxBurst))
configReplicatorConfig := ReplicatorConfig{

agent/metrics.go (new file, 43 lines)

@@ -0,0 +1,43 @@
package agent
import (
"crypto/x509"
"fmt"
"time"
"github.com/armon/go-metrics/prometheus"
"github.com/hashicorp/go-hclog"
"github.com/hashicorp/consul/agent/consul"
"github.com/hashicorp/consul/tlsutil"
)
var CertExpirationGauges = []prometheus.GaugeDefinition{
{
Name: metricsKeyAgentTLSCertExpiry,
Help: "Seconds until the agent tls certificate expires. Updated every hour",
},
}
var metricsKeyAgentTLSCertExpiry = []string{"agent", "tls", "cert", "expiry"}
// tlsCertExpirationMonitor returns a CertExpirationMonitor which will
// monitor the expiration of the certificate used for agent TLS.
func tlsCertExpirationMonitor(c *tlsutil.Configurator, logger hclog.Logger) consul.CertExpirationMonitor {
return consul.CertExpirationMonitor{
Key: metricsKeyAgentTLSCertExpiry,
Logger: logger,
Query: func() (time.Duration, error) {
raw := c.Cert()
if raw == nil {
return 0, fmt.Errorf("tls not enabled")
}
cert, err := x509.ParseCertificate(raw.Certificate[0])
if err != nil {
return 0, fmt.Errorf("failed to parse agent tls cert: %w", err)
}
return time.Until(cert.NotAfter), nil
},
}
}
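For context, Agent.Start (first hunk above) only starts this monitor when a TLS certificate is configured. Below is a minimal usage sketch with a cancellable context in place of the agent's shutdown channel; the helper name is hypothetical and not part of this change.

package agent

import (
	"context"

	"github.com/hashicorp/go-hclog"

	"github.com/hashicorp/consul/tlsutil"
)

// startTLSCertExpiryMonitor is a hypothetical helper shown for illustration.
func startTLSCertExpiryMonitor(ctx context.Context, c *tlsutil.Configurator, logger hclog.Logger) {
	if c.Cert() == nil {
		// No agent TLS certificate is configured, so there is nothing to monitor.
		return
	}
	m := tlsCertExpirationMonitor(c, logger)
	// Monitor emits the gauge once immediately, then on every tick, and sets
	// it to NaN when ctx is cancelled so scrapes do not report a stale value.
	go m.Monitor(ctx)
}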


@@ -1,20 +1,30 @@
package agent
import (
"github.com/stretchr/testify/require"
"crypto/x509"
"fmt"
"io/ioutil"
"net/http"
"net/http/httptest"
"path/filepath"
"strings"
"testing"
"github.com/hashicorp/consul/sdk/testutil"
"github.com/hashicorp/consul/testrpc"
"github.com/hashicorp/consul/tlsutil"
"github.com/stretchr/testify/require"
)
func checkForShortTesting(t *testing.T) {
func skipIfShortTesting(t *testing.T) {
if testing.Short() {
t.Skip("too slow for testing.Short")
}
}
func recordPromMetrics(t *testing.T, a *TestAgent, respRec *httptest.ResponseRecorder) {
t.Helper()
req, err := http.NewRequest("GET", "/v1/agent/metrics?format=prometheus", nil)
require.NoError(t, err, "Failed to generate new http request.")
@@ -49,7 +59,7 @@ func assertMetricNotExists(t *testing.T, respRec *httptest.ResponseRecorder, met
// TestHTTPHandlers_AgentMetrics_ConsulAutopilot_Prometheus adds testing around
// the published autopilot metrics on https://www.consul.io/docs/agent/telemetry#autopilot
func TestHTTPHandlers_AgentMetrics_ConsulAutopilot_Prometheus(t *testing.T) {
checkForShortTesting(t)
skipIfShortTesting(t)
// This test cannot use t.Parallel() since we modify global state, ie the global metrics instance
t.Run("Check consul_autopilot_* are not emitted metrics on clients", func(t *testing.T) {
@@ -95,3 +105,108 @@ func TestHTTPHandlers_AgentMetrics_ConsulAutopilot_Prometheus(t *testing.T) {
assertMetricExistsWithValue(t, respRec, "agent_2_autopilot_failure_tolerance", "NaN")
})
}
func TestHTTPHandlers_AgentMetrics_TLSCertExpiry_Prometheus(t *testing.T) {
skipIfShortTesting(t)
// This test cannot use t.Parallel() since we modify global state, ie the global metrics instance
dir := testutil.TempDir(t, "ca")
caPEM, caPK, err := tlsutil.GenerateCA(tlsutil.CAOpts{Days: 20, Domain: "consul"})
require.NoError(t, err)
caPath := filepath.Join(dir, "ca.pem")
err = ioutil.WriteFile(caPath, []byte(caPEM), 0600)
require.NoError(t, err)
signer, err := tlsutil.ParseSigner(caPK)
require.NoError(t, err)
pem, key, err := tlsutil.GenerateCert(tlsutil.CertOpts{
Signer: signer,
CA: caPEM,
Name: "server.dc1.consul",
Days: 20,
ExtKeyUsage: []x509.ExtKeyUsage{x509.ExtKeyUsageClientAuth},
})
require.NoError(t, err)
certPath := filepath.Join(dir, "cert.pem")
err = ioutil.WriteFile(certPath, []byte(pem), 0600)
require.NoError(t, err)
keyPath := filepath.Join(dir, "cert.key")
err = ioutil.WriteFile(keyPath, []byte(key), 0600)
require.NoError(t, err)
hcl := fmt.Sprintf(`
telemetry = {
prometheus_retention_time = "5s",
disable_hostname = true
metrics_prefix = "agent_3"
}
ca_file = "%s"
cert_file = "%s"
key_file = "%s"
`, caPath, certPath, keyPath)
a := StartTestAgent(t, TestAgent{HCL: hcl})
defer a.Shutdown()
respRec := httptest.NewRecorder()
recordPromMetrics(t, a, respRec)
require.Contains(t, respRec.Body.String(), "agent_3_agent_tls_cert_expiry 1.7")
}
func TestHTTPHandlers_AgentMetrics_CACertExpiry_Prometheus(t *testing.T) {
skipIfShortTesting(t)
// This test cannot use t.Parallel() since we modify global state, ie the global metrics instance
t.Run("non-leader emits NaN", func(t *testing.T) {
hcl := `
telemetry = {
prometheus_retention_time = "5s",
disable_hostname = true
metrics_prefix = "agent_4"
}
connect {
enabled = true
}
bootstrap = false
`
a := StartTestAgent(t, TestAgent{HCL: hcl})
defer a.Shutdown()
respRec := httptest.NewRecorder()
recordPromMetrics(t, a, respRec)
require.Contains(t, respRec.Body.String(), "agent_4_mesh_active_root_ca_expiry NaN")
require.Contains(t, respRec.Body.String(), "agent_4_mesh_active_signing_ca_expiry NaN")
})
t.Run("leader emits a value", func(t *testing.T) {
hcl := `
telemetry = {
prometheus_retention_time = "5s",
disable_hostname = true
metrics_prefix = "agent_5"
}
connect {
enabled = true
}
`
a := StartTestAgent(t, TestAgent{HCL: hcl})
defer a.Shutdown()
testrpc.WaitForLeader(t, a.RPC, "dc1")
respRec := httptest.NewRecorder()
recordPromMetrics(t, a, respRec)
out := respRec.Body.String()
require.Contains(t, out, "agent_5_mesh_active_root_ca_expiry 3.15")
require.Contains(t, out, "agent_5_mesh_active_signing_ca_expiry 3.15")
})
}
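A note on the literal prefixes asserted in these tests (inferred from the configuration, not stated in the diff): the agent TLS certificate is generated with Days: 20, roughly 20 * 86400 = 1,728,000 seconds until expiry, so the rendered gauge value begins with 1.7, matching the "1.7" prefix. With the default Connect CA certificate TTLs (on the order of 10 years for the root and 1 year for the signing certificate), the expiries are roughly 3.1536e+08 and 3.1536e+07 seconds, which is why both CA gauges are expected to begin with "3.15".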


@@ -211,14 +211,16 @@ func getPrometheusDefs(cfg lib.TelemetryConfig, isServer bool) ([]prometheus.Gau
xds.StatsGauges,
usagemetrics.Gauges,
consul.ReplicationGauges,
consul.CertExpirationGauges,
CertExpirationGauges,
Gauges,
raftGauges,
}
// TODO(ffmmm): conditionally add only leader specific metrics to gauges, counters, summaries, etc
if isServer {
gauges = append(gauges, consul.AutopilotGauges)
gauges = append(gauges,
consul.AutopilotGauges,
consul.LeaderCertExpirationGauges)
}
// Flatten definitions