agent: move agent tls metric monitor to a more appropriate place

And add a test for it
This commit is contained in:
Daniel Nephin 2021-10-27 15:23:29 -04:00
parent c92513ec16
commit a8e2e1c365
5 changed files with 109 additions and 37 deletions

View File

@ -667,7 +667,7 @@ func (a *Agent) Start(ctx context.Context) error {
} }
if a.tlsConfigurator.Cert() != nil { if a.tlsConfigurator.Cert() != nil {
m := consul.AgentTLSCertExpirationMonitor(a.tlsConfigurator, a.logger) m := tlsCertExpirationMonitor(a.tlsConfigurator, a.logger)
go m.Monitor(&lib.StopChannelContext{StopCh: a.shutdownCh}) go m.Monitor(&lib.StopChannelContext{StopCh: a.shutdownCh})
} }

View File

@ -2,7 +2,6 @@ package consul
import ( import (
"context" "context"
"crypto/x509"
"errors" "errors"
"fmt" "fmt"
"math" "math"
@ -16,7 +15,6 @@ import (
"github.com/hashicorp/consul/agent/connect" "github.com/hashicorp/consul/agent/connect"
"github.com/hashicorp/consul/agent/connect/ca" "github.com/hashicorp/consul/agent/connect/ca"
"github.com/hashicorp/consul/logging" "github.com/hashicorp/consul/logging"
"github.com/hashicorp/consul/tlsutil"
) )
var metricsKeyMeshRootCAExpiry = []string{"mesh", "active-root-ca", "expiry"} var metricsKeyMeshRootCAExpiry = []string{"mesh", "active-root-ca", "expiry"}
@ -33,13 +31,6 @@ var LeaderCertExpirationGauges = []prometheus.GaugeDefinition{
}, },
} }
// AgentCertExpirationGauges lists the Prometheus gauge definition for the
// agent TLS certificate expiry metric, keyed by metricsKeyAgentTLSCertExpiry.
var AgentCertExpirationGauges = []prometheus.GaugeDefinition{
{
Name: metricsKeyAgentTLSCertExpiry,
Help: "Seconds until the agent tls certificate expires. Updated every hour",
},
}
func rootCAExpiryMonitor(s *Server) CertExpirationMonitor { func rootCAExpiryMonitor(s *Server) CertExpirationMonitor {
return CertExpirationMonitor{ return CertExpirationMonitor{
Key: metricsKeyMeshRootCAExpiry, Key: metricsKeyMeshRootCAExpiry,
@ -165,29 +156,6 @@ func (m CertExpirationMonitor) Monitor(ctx context.Context) error {
} }
} }
// metricsKeyAgentTLSCertExpiry is the metric key used for the agent TLS
// certificate expiry monitor.
var metricsKeyAgentTLSCertExpiry = []string{"agent", "tls", "cert", "expiry"}

// AgentTLSCertExpirationMonitor returns a CertExpirationMonitor which will
// monitor the expiration of the certificate used for agent TLS.
//
// The monitor's Query fetches the current certificate from c on every call
// and returns the time remaining until the leaf certificate's NotAfter. It
// returns an error when c has no certificate, when the certificate carries no
// DER data, or when the leaf certificate cannot be parsed.
func AgentTLSCertExpirationMonitor(c *tlsutil.Configurator, logger hclog.Logger) CertExpirationMonitor {
	return CertExpirationMonitor{
		Key:    metricsKeyAgentTLSCertExpiry,
		Logger: logger,
		Query: func() (time.Duration, error) {
			raw := c.Cert()
			if raw == nil {
				return 0, fmt.Errorf("tls not enabled")
			}
			// Guard the index below so an empty certificate chain is reported
			// as an error instead of panicking inside the monitor goroutine.
			if len(raw.Certificate) == 0 {
				return 0, fmt.Errorf("agent tls cert contains no certificate data")
			}

			// The first entry of the chain is the leaf certificate.
			cert, err := x509.ParseCertificate(raw.Certificate[0])
			if err != nil {
				return 0, fmt.Errorf("failed to parse agent tls cert: %w", err)
			}
			return time.Until(cert.NotAfter), nil
		},
	}
}
// initLeaderMetrics sets all metrics that are emitted only on leaders to a NaN // initLeaderMetrics sets all metrics that are emitted only on leaders to a NaN
// value so that they don't incorrectly report 0 when a server starts as a // value so that they don't incorrectly report 0 when a server starts as a
// follower. // follower.

43
agent/metrics.go Normal file
View File

@ -0,0 +1,43 @@
package agent
import (
"crypto/x509"
"fmt"
"time"
"github.com/armon/go-metrics/prometheus"
"github.com/hashicorp/go-hclog"
"github.com/hashicorp/consul/agent/consul"
"github.com/hashicorp/consul/tlsutil"
)
// CertExpirationGauges lists the Prometheus gauge definition for the agent
// TLS certificate expiry metric, keyed by metricsKeyAgentTLSCertExpiry.
var CertExpirationGauges = []prometheus.GaugeDefinition{
{
Name: metricsKeyAgentTLSCertExpiry,
Help: "Seconds until the agent tls certificate expires. Updated every hour",
},
}
// metricsKeyAgentTLSCertExpiry is the metric key shared by the gauge
// definition above and the agent TLS certificate expiry monitor.
var metricsKeyAgentTLSCertExpiry = []string{"agent", "tls", "cert", "expiry"}
// tlsCertExpirationMonitor returns a CertExpirationMonitor which will
// monitor the expiration of the certificate used for agent TLS.
//
// The monitor's Query fetches the current certificate from c on every call
// and returns the time remaining until the leaf certificate's NotAfter. It
// returns an error when c has no certificate, when the certificate carries no
// DER data, or when the leaf certificate cannot be parsed.
func tlsCertExpirationMonitor(c *tlsutil.Configurator, logger hclog.Logger) consul.CertExpirationMonitor {
	return consul.CertExpirationMonitor{
		Key:    metricsKeyAgentTLSCertExpiry,
		Logger: logger,
		Query: func() (time.Duration, error) {
			raw := c.Cert()
			if raw == nil {
				return 0, fmt.Errorf("tls not enabled")
			}
			// Guard the index below so an empty certificate chain is reported
			// as an error instead of panicking inside the monitor goroutine.
			if len(raw.Certificate) == 0 {
				return 0, fmt.Errorf("agent tls cert contains no certificate data")
			}

			// The first entry of the chain is the leaf certificate.
			cert, err := x509.ParseCertificate(raw.Certificate[0])
			if err != nil {
				return 0, fmt.Errorf("failed to parse agent tls cert: %w", err)
			}
			return time.Until(cert.NotAfter), nil
		},
	}
}

View File

@ -1,20 +1,29 @@
package agent package agent
import ( import (
"github.com/stretchr/testify/require" "crypto/x509"
"fmt"
"io/ioutil"
"net/http" "net/http"
"net/http/httptest" "net/http/httptest"
"path/filepath"
"strings" "strings"
"testing" "testing"
"github.com/hashicorp/consul/sdk/testutil"
"github.com/hashicorp/consul/tlsutil"
"github.com/stretchr/testify/require"
) )
func checkForShortTesting(t *testing.T) { func skipIfShortTesting(t *testing.T) {
if testing.Short() { if testing.Short() {
t.Skip("too slow for testing.Short") t.Skip("too slow for testing.Short")
} }
} }
func recordPromMetrics(t *testing.T, a *TestAgent, respRec *httptest.ResponseRecorder) { func recordPromMetrics(t *testing.T, a *TestAgent, respRec *httptest.ResponseRecorder) {
t.Helper()
req, err := http.NewRequest("GET", "/v1/agent/metrics?format=prometheus", nil) req, err := http.NewRequest("GET", "/v1/agent/metrics?format=prometheus", nil)
require.NoError(t, err, "Failed to generate new http request.") require.NoError(t, err, "Failed to generate new http request.")
@ -49,7 +58,7 @@ func assertMetricNotExists(t *testing.T, respRec *httptest.ResponseRecorder, met
// TestHTTPHandlers_AgentMetrics_ConsulAutopilot_Prometheus adds testing around // TestHTTPHandlers_AgentMetrics_ConsulAutopilot_Prometheus adds testing around
// the published autopilot metrics on https://www.consul.io/docs/agent/telemetry#autopilot // the published autopilot metrics on https://www.consul.io/docs/agent/telemetry#autopilot
func TestHTTPHandlers_AgentMetrics_ConsulAutopilot_Prometheus(t *testing.T) { func TestHTTPHandlers_AgentMetrics_ConsulAutopilot_Prometheus(t *testing.T) {
checkForShortTesting(t) skipIfShortTesting(t)
// This test cannot use t.Parallel() since we modify global state, ie the global metrics instance // This test cannot use t.Parallel() since we modify global state, ie the global metrics instance
t.Run("Check consul_autopilot_* are not emitted metrics on clients", func(t *testing.T) { t.Run("Check consul_autopilot_* are not emitted metrics on clients", func(t *testing.T) {
@ -95,3 +104,55 @@ func TestHTTPHandlers_AgentMetrics_ConsulAutopilot_Prometheus(t *testing.T) {
assertMetricExistsWithValue(t, respRec, "agent_2_autopilot_failure_tolerance", "NaN") assertMetricExistsWithValue(t, respRec, "agent_2_autopilot_failure_tolerance", "NaN")
}) })
} }
// TestHTTPHandlers_AgentMetrics_TLSCertExpiry_Prometheus starts a test agent
// configured with a freshly generated CA and certificate, then checks that the
// Prometheus metrics endpoint reports the agent TLS certificate expiry gauge.
func TestHTTPHandlers_AgentMetrics_TLSCertExpiry_Prometheus(t *testing.T) {
	skipIfShortTesting(t)
	// This test cannot use t.Parallel() since we modify global state, ie the global metrics instance

	tmpDir := testutil.TempDir(t, "ca")

	// writeTLSFile stores data under tmpDir with owner-only permissions and
	// returns the resulting path, failing the test if the write fails.
	writeTLSFile := func(name string, data []byte) string {
		target := filepath.Join(tmpDir, name)
		writeErr := ioutil.WriteFile(target, data, 0600)
		require.NoError(t, writeErr)
		return target
	}

	// Generate a CA valid for 20 days and persist its certificate.
	caPEM, caPK, err := tlsutil.GenerateCA(tlsutil.CAOpts{Days: 20, Domain: "consul"})
	require.NoError(t, err)
	caPath := writeTLSFile("ca.pem", []byte(caPEM))

	signer, err := tlsutil.ParseSigner(caPK)
	require.NoError(t, err)

	// Issue the agent certificate from that CA, also valid for 20 days.
	certOpts := tlsutil.CertOpts{
		Signer:      signer,
		CA:          caPEM,
		Name:        "server.dc1.consul",
		Days:        20,
		ExtKeyUsage: []x509.ExtKeyUsage{x509.ExtKeyUsageClientAuth},
	}
	certPEM, certKey, err := tlsutil.GenerateCert(certOpts)
	require.NoError(t, err)
	certPath := writeTLSFile("cert.pem", []byte(certPEM))
	keyPath := writeTLSFile("cert.key", []byte(certKey))

	hcl := fmt.Sprintf(`
telemetry = {
prometheus_retention_time = "5s",
disable_hostname = true
metrics_prefix = "agent_3"
}
ca_file = "%s"
cert_file = "%s"
key_file = "%s"
`, caPath, certPath, keyPath)

	testAgent := StartTestAgent(t, TestAgent{HCL: hcl})
	defer testAgent.Shutdown()

	recorder := httptest.NewRecorder()
	recordPromMetrics(t, testAgent, recorder)

	// 20 days is 1,728,000 seconds, so the reported value is expected to start
	// with "1.7" (presumably rendered as 1.7xe+06 by the exposition format).
	metricsBody := recorder.Body.String()
	require.Contains(t, metricsBody, "agent_3_agent_tls_cert_expiry 1.7")
}

View File

@ -211,7 +211,7 @@ func getPrometheusDefs(cfg lib.TelemetryConfig, isServer bool) ([]prometheus.Gau
xds.StatsGauges, xds.StatsGauges,
usagemetrics.Gauges, usagemetrics.Gauges,
consul.ReplicationGauges, consul.ReplicationGauges,
consul.AgentCertExpirationGauges, CertExpirationGauges,
Gauges, Gauges,
raftGauges, raftGauges,
} }