From 88b013be99d65aad46c13372c5548f21d4ee941a Mon Sep 17 00:00:00 2001 From: Kit Patella Date: Mon, 16 Nov 2020 15:54:50 -0800 Subject: [PATCH] Merge pull request #9198 from hashicorp/mkcp/telemetry/add-all-metric-definitions Add metric definitions for all metrics known at Consul start --- .changelog/9198.txt | 3 + agent/agent_endpoint.go | 2 +- agent/cache/cache.go | 31 ++++++ agent/catalog_endpoint.go | 118 +++++++++++++++++++- agent/config/builder.go | 5 +- agent/config/runtime_test.go | 15 ++- agent/consul/acl.go | 29 ++++- agent/consul/acl_endpoint.go | 70 +++++++++++- agent/consul/acl_endpoint_legacy.go | 8 ++ agent/consul/autopilot.go | 12 +++ agent/consul/catalog_endpoint.go | 47 ++++++++ agent/consul/client.go | 16 +++ agent/consul/config_endpoint.go | 29 +++++ agent/consul/federation_state_endpoint.go | 22 +++- agent/consul/fsm/commands_oss.go | 93 +++++++++++++++- agent/consul/fsm/snapshot.go | 8 ++ agent/consul/intention_endpoint.go | 12 +++ agent/consul/kvs_endpoint.go | 8 ++ agent/consul/leader.go | 16 +++ agent/consul/prepared_query_endpoint.go | 20 ++++ agent/consul/rpc.go | 42 ++++++++ agent/consul/segment_oss.go | 8 ++ agent/consul/server.go | 4 +- agent/consul/session_endpoint.go | 12 +++ agent/consul/session_ttl.go | 23 ++++ agent/consul/txn_endpoint.go | 12 +++ agent/consul/usagemetrics/usagemetrics.go | 17 +++ agent/dns.go | 20 ++++ agent/grpc/stats.go | 38 +++++++ agent/http.go | 8 ++ agent/local/state.go | 31 +++++- agent/setup.go | 126 ++++++++++++++++++++++ connect/proxy/proxy.go | 2 + lib/telemetry.go | 99 ++++------------- lib/telemetry_test.go | 9 ++ 35 files changed, 922 insertions(+), 93 deletions(-) create mode 100644 .changelog/9198.txt diff --git a/.changelog/9198.txt b/.changelog/9198.txt new file mode 100644 index 0000000000..3f68c3b4b8 --- /dev/null +++ b/.changelog/9198.txt @@ -0,0 +1,3 @@ +```release-note:improvement +agent: All metrics should be present and available to prometheus scrapers when Consul starts. 
If any non-deprecated metrics are missing, please submit an issue with their names. +``` diff --git a/agent/agent_endpoint.go b/agent/agent_endpoint.go index 73e0f53640..49721f9125 100644 --- a/agent/agent_endpoint.go +++ b/agent/agent_endpoint.go @@ -136,7 +136,7 @@ func (s *HTTPHandlers) AgentMetrics(resp http.ResponseWriter, req *http.Request) return nil, acl.ErrPermissionDenied } if enablePrometheusOutput(req) { - if s.agent.config.Telemetry.PrometheusRetentionTime < 1 { + if s.agent.config.Telemetry.PrometheusOpts.Expiration < 1 { resp.WriteHeader(http.StatusUnsupportedMediaType) fmt.Fprint(resp, "Prometheus is not enabled since its retention time is not positive") return nil, nil diff --git a/agent/cache/cache.go b/agent/cache/cache.go index 1a5193792b..62dc8619ba 100644 --- a/agent/cache/cache.go +++ b/agent/cache/cache.go @@ -24,6 +24,7 @@ import ( "time" "github.com/armon/go-metrics" + "github.com/armon/go-metrics/prometheus" "golang.org/x/time/rate" "github.com/hashicorp/consul/lib" @@ -32,6 +33,34 @@ import ( //go:generate mockery -all -inpkg +// TODO(kit): remove the namespace from these once the metrics themselves change +var Gauges = []prometheus.GaugeDefinition{ + { + Name: []string{"consul", "cache", "entries_count"}, + Help: "", + }, +} + +// TODO(kit): remove the namespace from these once the metrics themselves change +var Counters = []prometheus.CounterDefinition{ + { + Name: []string{"consul", "cache", "bypass"}, + Help: "", + }, + { + Name: []string{"consul", "cache", "fetch_success"}, + Help: "", + }, + { + Name: []string{"consul", "cache", "fetch_error"}, + Help: "", + }, + { + Name: []string{"consul", "cache", "evict_expired"}, + Help: "", + }, +} + // Constants related to refresh backoff. We probably don't ever need to // make these configurable knobs since they primarily exist to lower load. 
const ( @@ -629,6 +658,7 @@ func (c *Cache) fetch(key string, r getOptions, allowNew bool, attempt uint, ign // Error handling if err == nil { labels := []metrics.Label{{Name: "result_not_modified", Value: strconv.FormatBool(result.NotModified)}} + // TODO(kit): move tEntry.Name to a label on the first write here and deprecate the second write metrics.IncrCounterWithLabels([]string{"consul", "cache", "fetch_success"}, 1, labels) metrics.IncrCounterWithLabels([]string{"consul", "cache", tEntry.Name, "fetch_success"}, 1, labels) @@ -658,6 +688,7 @@ func (c *Cache) fetch(key string, r getOptions, allowNew bool, attempt uint, ign newEntry.RefreshLostContact = time.Time{} } } else { + // TODO(kit): Add tEntry.Name to label on fetch_error and deprecate second write metrics.IncrCounter([]string{"consul", "cache", "fetch_error"}, 1) metrics.IncrCounter([]string{"consul", "cache", tEntry.Name, "fetch_error"}, 1) diff --git a/agent/catalog_endpoint.go b/agent/catalog_endpoint.go index 60c5fc3449..188c1bfb20 100644 --- a/agent/catalog_endpoint.go +++ b/agent/catalog_endpoint.go @@ -5,11 +5,127 @@ import ( "net/http" "strings" - metrics "github.com/armon/go-metrics" + "github.com/armon/go-metrics" + "github.com/armon/go-metrics/prometheus" cachetype "github.com/hashicorp/consul/agent/cache-types" "github.com/hashicorp/consul/agent/structs" ) +var CatalogCounters = []prometheus.CounterDefinition{ + { + Name: []string{"client", "api", "catalog_register"}, + Help: "Increments whenever a Consul agent receives a catalog register request.", + }, + { + Name: []string{"client", "rpc", "error", "catalog_register"}, + Help: "Increments whenever a Consul agent receives an RPC error for a catalog register request.", + }, + { + Name: []string{"client", "api", "success", "catalog_register"}, + Help: "Increments whenever a Consul agent successfully responds to a catalog register request.", + }, + { + Name: []string{"client", "api", "catalog_deregister"}, + Help: "Increments whenever a Consul 
agent receives a catalog deregister request.", + }, + { + Name: []string{"client", "api", "catalog_datacenters"}, + Help: "Increments whenever a Consul agent receives a request to list datacenters in the catalog.", + }, + { + Name: []string{"client", "rpc", "error", "catalog_deregister"}, + Help: "Increments whenever a Consul agent receives an RPC error for a catalog deregister request.", + }, + { + Name: []string{"client", "api", "success", "catalog_nodes"}, + Help: "Increments whenever a Consul agent successfully responds to a request to list nodes.", + }, + { + Name: []string{"client", "rpc", "error", "catalog_nodes"}, + Help: "Increments whenever a Consul agent receives an RPC error for a request to list nodes.", + }, + { + Name: []string{"client", "api", "success", "catalog_deregister"}, + Help: "Increments whenever a Consul agent successfully responds to a catalog deregister request.", + }, + { + Name: []string{"client", "rpc", "error", "catalog_datacenters"}, + Help: "Increments whenever a Consul agent receives an RPC error for a request to list datacenters.", + }, + { + Name: []string{"client", "api", "success", "catalog_datacenters"}, + Help: "Increments whenever a Consul agent successfully responds to a request to list datacenters.", + }, + { + Name: []string{"client", "api", "catalog_nodes"}, + Help: "Increments whenever a Consul agent receives a request to list nodes from the catalog.", + }, + { + Name: []string{"client", "api", "catalog_services"}, + Help: "Increments whenever a Consul agent receives a request to list services from the catalog.", + }, + { + Name: []string{"client", "rpc", "error", "catalog_services"}, + Help: "Increments whenever a Consul agent receives an RPC error for a request to list services.", + }, + { + Name: []string{"client", "api", "success", "catalog_services"}, + Help: "Increments whenever a Consul agent successfully responds to a request to list services.", + }, + { + Name: []string{"client", "api", 
"catalog_service_nodes"}, + Help: "Increments whenever a Consul agent receives a request to list nodes offering a service.", + }, + { + Name: []string{"client", "rpc", "error", "catalog_service_nodes"}, + Help: "Increments whenever a Consul agent receives an RPC error for a request to list nodes offering a service.", + }, + { + Name: []string{"client", "api", "success", "catalog_service_nodes"}, + Help: "Increments whenever a Consul agent successfully responds to a request to list nodes offering a service.", + }, + { + Name: []string{"client", "api", "error", "catalog_service_nodes"}, + Help: "", + }, + { + Name: []string{"client", "api", "catalog_node_services"}, + Help: "Increments whenever a Consul agent receives a request to list services in a node.", + }, + { + Name: []string{"client", "api", "success", "catalog_node_services"}, + Help: "Increments whenever a Consul agent successfully responds to a request to list services in a node.", + }, + { + Name: []string{"client", "rpc", "error", "catalog_node_services"}, + Help: "Increments whenever a Consul agent receives an RPC error for a request to list services in a node.", + }, + { + Name: []string{"client", "api", "catalog_node_service_list"}, + Help: "", + }, + { + Name: []string{"client", "rpc", "error", "catalog_node_service_list"}, + Help: "", + }, + { + Name: []string{"client", "api", "success", "catalog_node_service_list"}, + Help: "", + }, + { + Name: []string{"client", "api", "catalog_gateway_services"}, + Help: "Increments whenever a Consul agent receives a request to list services associated with a gateway.", + }, + { + Name: []string{"client", "rpc", "error", "catalog_gateway_services"}, + Help: "Increments whenever a Consul agent receives an RPC error for a request to list services associated with a gateway.", + }, + { + Name: []string{"client", "api", "success", "catalog_gateway_services"}, + Help: "Increments whenever a Consul agent successfully responds to a request to list 
services associated with a gateway.", + }, +} + func (s *HTTPHandlers) CatalogRegister(resp http.ResponseWriter, req *http.Request) (interface{}, error) { metrics.IncrCounterWithLabels([]string{"client", "api", "catalog_register"}, 1, []metrics.Label{{Name: "node", Value: s.nodeName()}}) diff --git a/agent/config/builder.go b/agent/config/builder.go index 1c1798dc76..062fb440f7 100644 --- a/agent/config/builder.go +++ b/agent/config/builder.go @@ -17,6 +17,7 @@ import ( "strings" "time" + "github.com/armon/go-metrics/prometheus" "github.com/hashicorp/go-bexpr" "github.com/hashicorp/go-hclog" "github.com/hashicorp/go-multierror" @@ -942,13 +943,15 @@ func (b *Builder) Build() (rt RuntimeConfig, err error) { DisableHostname: b.boolVal(c.Telemetry.DisableHostname), DogstatsdAddr: b.stringVal(c.Telemetry.DogstatsdAddr), DogstatsdTags: c.Telemetry.DogstatsdTags, - PrometheusRetentionTime: b.durationVal("prometheus_retention_time", c.Telemetry.PrometheusRetentionTime), FilterDefault: b.boolVal(c.Telemetry.FilterDefault), AllowedPrefixes: telemetryAllowedPrefixes, BlockedPrefixes: telemetryBlockedPrefixes, MetricsPrefix: b.stringVal(c.Telemetry.MetricsPrefix), StatsdAddr: b.stringVal(c.Telemetry.StatsdAddr), StatsiteAddr: b.stringVal(c.Telemetry.StatsiteAddr), + PrometheusOpts: prometheus.PrometheusOpts{ + Expiration: b.durationVal("prometheus_retention_time", c.Telemetry.PrometheusRetentionTime), + }, }, // Agent diff --git a/agent/config/runtime_test.go b/agent/config/runtime_test.go index aba609ec16..2dff70d1bf 100644 --- a/agent/config/runtime_test.go +++ b/agent/config/runtime_test.go @@ -18,6 +18,7 @@ import ( "testing" "time" + "github.com/armon/go-metrics/prometheus" "github.com/stretchr/testify/require" "github.com/hashicorp/consul/agent/cache" @@ -7103,9 +7104,11 @@ func TestFullConfig(t *testing.T) { AllowedPrefixes: []string{"oJotS8XJ"}, BlockedPrefixes: []string{"cazlEhGn"}, MetricsPrefix: "ftO6DySn", - PrometheusRetentionTime: 15 * time.Second, StatsdAddr: 
"drce87cy", StatsiteAddr: "HpFwKB8R", + PrometheusOpts: prometheus.PrometheusOpts{ + Expiration: 15 * time.Second, + }, }, TLSCipherSuites: []uint16{tls.TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA, tls.TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256}, TLSMinVersion: "pAOWafkR", @@ -7814,9 +7817,15 @@ func TestSanitize(t *testing.T) { "DogstatsdTags": [], "FilterDefault": false, "MetricsPrefix": "", - "PrometheusRetentionTime": "0s", "StatsdAddr": "", - "StatsiteAddr": "" + "StatsiteAddr": "", + "PrometheusOpts": { + "Expiration": "0s", + "Registerer": null, + "GaugeDefinitions": [], + "CounterDefinitions": [], + "SummaryDefinitions": [] + } }, "TranslateWANAddrs": false, "TxnMaxReqLen": 5678000000000000, diff --git a/agent/consul/acl.go b/agent/consul/acl.go index 7796c3756c..1ec7bc4193 100644 --- a/agent/consul/acl.go +++ b/agent/consul/acl.go @@ -6,7 +6,8 @@ import ( "sync" "time" - metrics "github.com/armon/go-metrics" + "github.com/armon/go-metrics" + "github.com/armon/go-metrics/prometheus" "github.com/hashicorp/consul/acl" "github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/consul/logging" @@ -15,6 +16,32 @@ import ( "golang.org/x/time/rate" ) +var ACLCounters = []prometheus.CounterDefinition{ + { + Name: []string{"acl", "token", "cache_hit"}, + Help: "", + }, + { + Name: []string{"acl", "token", "cache_miss"}, + Help: "", + }, +} + +var ACLSummaries = []prometheus.SummaryDefinition{ + { + Name: []string{"acl", "resolveTokenLegacy"}, + Help: "", + }, + { + Name: []string{"acl", "ResolveToken"}, + Help: "", + }, + { + Name: []string{"acl", "ResolveTokenToIdentity"}, + Help: "", + }, +} + // These must be kept in sync with the constants in command/agent/acl.go. 
const ( // anonymousToken is the token ID we re-write to if there is no token ID diff --git a/agent/consul/acl_endpoint.go b/agent/consul/acl_endpoint.go index ccc9e1b2af..b8ba08e0b2 100644 --- a/agent/consul/acl_endpoint.go +++ b/agent/consul/acl_endpoint.go @@ -11,7 +11,8 @@ import ( "regexp" "time" - metrics "github.com/armon/go-metrics" + "github.com/armon/go-metrics" + "github.com/armon/go-metrics/prometheus" "github.com/hashicorp/consul/acl" "github.com/hashicorp/consul/agent/consul/authmethod" "github.com/hashicorp/consul/agent/consul/state" @@ -30,6 +31,73 @@ const ( aclBootstrapReset = "acl-bootstrap-reset" ) +var ACLEndpointSummaries = []prometheus.SummaryDefinition{ + { + Name: []string{"acl", "token", "clone"}, + Help: "", + }, + { + Name: []string{"acl", "token", "upsert"}, + Help: "", + }, + { + Name: []string{"acl", "token", "delete"}, + Help: "", + }, + { + Name: []string{"acl", "policy", "upsert"}, + Help: "", + }, + { + Name: []string{"acl", "policy", "delete"}, + Help: "", + }, + { + Name: []string{"acl", "policy", "delete"}, + Help: "", + }, + { + Name: []string{"acl", "role", "upsert"}, + Help: "", + }, + { + Name: []string{"acl", "role", "delete"}, + Help: "", + }, + { + Name: []string{"acl", "bindingrule", "upsert"}, + Help: "", + }, + { + Name: []string{"acl", "bindingrule", "delete"}, + Help: "", + }, + { + Name: []string{"acl", "authmethod", "upsert"}, + Help: "", + }, + { + Name: []string{"acl", "authmethod", "delete"}, + Help: "", + }, + { + Name: []string{"acl", "login"}, + Help: "", + }, + { + Name: []string{"acl", "login"}, + Help: "", + }, + { + Name: []string{"acl", "logout"}, + Help: "", + }, + { + Name: []string{"acl", "logout"}, + Help: "", + }, +} + // Regex for matching var ( validPolicyName = regexp.MustCompile(`^[A-Za-z0-9\-_]{1,128}$`) diff --git a/agent/consul/acl_endpoint_legacy.go b/agent/consul/acl_endpoint_legacy.go index 22838aca0d..9cdfba668b 100644 --- a/agent/consul/acl_endpoint_legacy.go +++ 
b/agent/consul/acl_endpoint_legacy.go @@ -5,6 +5,7 @@ import ( "time" "github.com/armon/go-metrics" + "github.com/armon/go-metrics/prometheus" "github.com/hashicorp/consul/acl" "github.com/hashicorp/consul/agent/consul/state" "github.com/hashicorp/consul/agent/structs" @@ -12,6 +13,13 @@ import ( "github.com/hashicorp/go-memdb" ) +var ACLEndpointLegacySummaries = []prometheus.SummaryDefinition{ + { + Name: []string{"acl", "apply"}, + Help: "Measures the time it takes to complete an update to the ACL store.", + }, +} + // Bootstrap is used to perform a one-time ACL bootstrap operation on // a cluster to get the first management token. func (a *ACL) Bootstrap(args *structs.DCSpecificRequest, reply *structs.ACL) error { diff --git a/agent/consul/autopilot.go b/agent/consul/autopilot.go index dc5aa5da70..cc6cf62302 100644 --- a/agent/consul/autopilot.go +++ b/agent/consul/autopilot.go @@ -5,6 +5,7 @@ import ( "fmt" "github.com/armon/go-metrics" + "github.com/armon/go-metrics/prometheus" "github.com/hashicorp/consul/agent/metadata" "github.com/hashicorp/consul/types" "github.com/hashicorp/raft" @@ -12,6 +13,17 @@ import ( "github.com/hashicorp/serf/serf" ) +var AutopilotGauges = []prometheus.GaugeDefinition{ + { + Name: []string{"autopilot", "failure_tolerance"}, + Help: "Tracks the number of voting servers that the cluster can lose while continuing to function.", + }, + { + Name: []string{"autopilot", "healthy"}, + Help: "Tracks the overall health of the local server cluster. 1 if all servers are healthy, 0 if one or more are unhealthy.", + }, +} + // AutopilotDelegate is a Consul delegate for autopilot operations. 
type AutopilotDelegate struct { server *Server diff --git a/agent/consul/catalog_endpoint.go b/agent/consul/catalog_endpoint.go index 04be323cb5..f5d5b5633d 100644 --- a/agent/consul/catalog_endpoint.go +++ b/agent/consul/catalog_endpoint.go @@ -6,6 +6,7 @@ import ( "time" "github.com/armon/go-metrics" + "github.com/armon/go-metrics/prometheus" "github.com/hashicorp/consul/acl" "github.com/hashicorp/consul/agent/consul/state" "github.com/hashicorp/consul/agent/structs" @@ -17,6 +18,52 @@ import ( "github.com/hashicorp/go-uuid" ) +var CatalogCounters = []prometheus.CounterDefinition{ + { + Name: []string{"catalog", "service", "query"}, + Help: "Increments for each catalog query for the given service.", + }, + { + Name: []string{"catalog", "connect", "query"}, + Help: "", + }, + { + Name: []string{"catalog", "service", "query-tag"}, + Help: "Increments for each catalog query for the given service with the given tag.", + }, + { + Name: []string{"catalog", "connect", "query-tag"}, + Help: "", + }, + { + Name: []string{"catalog", "service", "query-tags"}, + Help: "Increments for each catalog query for the given service with the given tags.", + }, + { + Name: []string{"catalog", "connect", "query-tags"}, + Help: "", + }, + { + Name: []string{"catalog", "service", "not-found"}, + Help: "Increments for each catalog query where the given service could not be found.", + }, + { + Name: []string{"catalog", "connect", "not-found"}, + Help: "", + }, +} + +var CatalogSummaries = []prometheus.SummaryDefinition{ + { + Name: []string{"catalog", "deregister"}, + Help: "Measures the time it takes to complete a catalog deregister operation.", + }, + { + Name: []string{"catalog", "register"}, + Help: "Measures the time it takes to complete a catalog register operation.", + }, +} + // Catalog endpoint is used to manipulate the service catalog type Catalog struct { srv *Server diff --git a/agent/consul/client.go b/agent/consul/client.go index b4cf90759b..d2ae9a1edd 100644 --- 
a/agent/consul/client.go +++ b/agent/consul/client.go @@ -9,6 +9,7 @@ import ( "time" "github.com/armon/go-metrics" + "github.com/armon/go-metrics/prometheus" "github.com/hashicorp/consul/agent/pool" "github.com/hashicorp/consul/agent/router" "github.com/hashicorp/consul/agent/structs" @@ -21,6 +22,21 @@ import ( "golang.org/x/time/rate" ) +var ClientCounters = []prometheus.CounterDefinition{ + { + Name: []string{"client", "rpc"}, + Help: "Increments whenever a Consul agent in client mode makes an RPC request to a Consul server.", + }, + { + Name: []string{"client", "rpc", "exceeded"}, + Help: "Increments whenever a Consul agent in client mode makes an RPC request to a Consul server gets rate limited by that agent's limits configuration.", + }, + { + Name: []string{"client", "rpc", "failed"}, + Help: "Increments whenever a Consul agent in client mode makes an RPC request to a Consul server and fails.", + }, +} + const ( // serfEventBacklog is the maximum number of unprocessed Serf Events // that will be held in queue before new serf events block. 
A diff --git a/agent/consul/config_endpoint.go b/agent/consul/config_endpoint.go index 3d3c0ff354..b2529133d5 100644 --- a/agent/consul/config_endpoint.go +++ b/agent/consul/config_endpoint.go @@ -4,6 +4,8 @@ import ( "fmt" "time" + "github.com/armon/go-metrics/prometheus" + metrics "github.com/armon/go-metrics" "github.com/hashicorp/consul/acl" "github.com/hashicorp/consul/agent/consul/state" @@ -12,6 +14,33 @@ import ( "github.com/mitchellh/copystructure" ) +var ConfigSummaries = []prometheus.SummaryDefinition{ + { + Name: []string{"config_entry", "apply"}, + Help: "", + }, + { + Name: []string{"config_entry", "get"}, + Help: "", + }, + { + Name: []string{"config_entry", "list"}, + Help: "", + }, + { + Name: []string{"config_entry", "listAll"}, + Help: "", + }, + { + Name: []string{"config_entry", "delete"}, + Help: "", + }, + { + Name: []string{"config_entry", "resolve_service_config"}, + Help: "", + }, +} + // The ConfigEntry endpoint is used to query centralized config information type ConfigEntry struct { srv *Server diff --git a/agent/consul/federation_state_endpoint.go b/agent/consul/federation_state_endpoint.go index a98ab83e8f..88111364c1 100644 --- a/agent/consul/federation_state_endpoint.go +++ b/agent/consul/federation_state_endpoint.go @@ -5,13 +5,33 @@ import ( "fmt" "time" - metrics "github.com/armon/go-metrics" + "github.com/armon/go-metrics" + "github.com/armon/go-metrics/prometheus" "github.com/hashicorp/consul/acl" "github.com/hashicorp/consul/agent/consul/state" "github.com/hashicorp/consul/agent/structs" memdb "github.com/hashicorp/go-memdb" ) +var FederationStateSummaries = []prometheus.SummaryDefinition{ + { + Name: []string{"federation_state", "apply"}, + Help: "", + }, + { + Name: []string{"federation_state", "get"}, + Help: "", + }, + { + Name: []string{"federation_state", "list"}, + Help: "", + }, + { + Name: []string{"federation_state", "list_mesh_gateways"}, + Help: "", + }, +} + var ( errFederationStatesNotEnabled = 
errors.New("Federation states are currently disabled until all servers in the datacenter support the feature") ) diff --git a/agent/consul/fsm/commands_oss.go b/agent/consul/fsm/commands_oss.go index a914009a4d..fae5eb1a81 100644 --- a/agent/consul/fsm/commands_oss.go +++ b/agent/consul/fsm/commands_oss.go @@ -4,11 +4,102 @@ import ( "fmt" "time" - metrics "github.com/armon/go-metrics" + "github.com/armon/go-metrics" + "github.com/armon/go-metrics/prometheus" "github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/consul/api" ) +var CommandsSummaries = []prometheus.SummaryDefinition{ + { + Name: []string{"fsm", "register"}, + Help: "Measures the time it takes to apply a catalog register operation to the FSM.", + }, + { + Name: []string{"fsm", "deregister"}, + Help: "Measures the time it takes to apply a catalog deregister operation to the FSM.", + }, + { + Name: []string{"fsm", "kvs"}, + Help: "Measures the time it takes to apply the given KV operation to the FSM.", + }, + { + Name: []string{"fsm", "session"}, + Help: "Measures the time it takes to apply the given session operation to the FSM.", + }, + { + Name: []string{"fsm", "acl"}, + Help: "Measures the time it takes to apply the given ACL operation to the FSM.", + }, + { + Name: []string{"fsm", "tombstone"}, + Help: "Measures the time it takes to apply the given tombstone operation to the FSM.", + }, + { + Name: []string{"fsm", "coordinate", "batch-update"}, + Help: "Measures the time it takes to apply the given batch coordinate update to the FSM.", + }, + { + Name: []string{"fsm", "prepared-query"}, + Help: "Measures the time it takes to apply the given prepared query update operation to the FSM.", + }, + { + Name: []string{"fsm", "txn"}, + Help: "Measures the time it takes to apply the given transaction update to the FSM.", + }, + { + Name: []string{"fsm", "autopilot"}, + Help: "Measures the time it takes to apply the given autopilot update to the FSM.", + }, + { + Name: []string{"consul", "fsm", 
"intention"}, + Help: "", + }, + { + Name: []string{"fsm", "intention"}, + Help: "", + }, + { + Name: []string{"consul", "fsm", "ca"}, + Help: "", + }, + { + Name: []string{"fsm", "ca", "leaf"}, + Help: "", + }, + { + Name: []string{"fsm", "acl", "token"}, + Help: "", + }, + { + Name: []string{"fsm", "ca", "leaf"}, + Help: "", + }, + { + Name: []string{"fsm", "acl", "policy"}, + Help: "", + }, + { + Name: []string{"fsm", "acl", "bindingrule"}, + Help: "", + }, + { + Name: []string{"fsm", "acl", "authmethod"}, + Help: "", + }, + { + Name: []string{"fsm", "system_metadata"}, + Help: "", + }, + // TODO(kit): We generate the config-entry fsm summaries by reading off of the request. It is + // possible to statically declare these when we know all of the names, but I didn't get to it + // in this patch. Config-entries are known though and we should add these in the future. + // { + // Name: []string{"fsm", "config_entry", req.Entry.GetKind()}, + // Help: "", + // }, +} + func init() { registerCommand(structs.RegisterRequestType, (*FSM).applyRegister) registerCommand(structs.DeregisterRequestType, (*FSM).applyDeregister) diff --git a/agent/consul/fsm/snapshot.go b/agent/consul/fsm/snapshot.go index e4c9c0bb45..696ca56453 100644 --- a/agent/consul/fsm/snapshot.go +++ b/agent/consul/fsm/snapshot.go @@ -5,6 +5,7 @@ import ( "time" "github.com/armon/go-metrics" + "github.com/armon/go-metrics/prometheus" "github.com/hashicorp/consul/agent/consul/state" "github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/go-msgpack/codec" @@ -12,6 +13,13 @@ import ( "github.com/hashicorp/raft" ) +var SnapshotSummaries = []prometheus.SummaryDefinition{ + { + Name: []string{"fsm", "persist"}, + Help: "Measures the time it takes to persist the FSM to a raft snapshot.", + }, +} + // snapshot is used to provide a snapshot of the current // state in a way that can be accessed concurrently with operations // that may modify the live state. 
diff --git a/agent/consul/intention_endpoint.go b/agent/consul/intention_endpoint.go index 9b1931e00f..d96e17c268 100644 --- a/agent/consul/intention_endpoint.go +++ b/agent/consul/intention_endpoint.go @@ -6,6 +6,7 @@ import ( "time" "github.com/armon/go-metrics" + "github.com/armon/go-metrics/prometheus" "github.com/hashicorp/consul/acl" "github.com/hashicorp/consul/agent/connect" "github.com/hashicorp/consul/agent/consul/state" @@ -16,6 +17,17 @@ import ( "github.com/hashicorp/go-memdb" ) +var IntentionSummaries = []prometheus.SummaryDefinition{ + { + Name: []string{"consul", "intention", "apply"}, + Help: "", + }, + { + Name: []string{"intention", "apply"}, + Help: "", + }, +} + var ( // ErrIntentionNotFound is returned if the intention lookup failed. ErrIntentionNotFound = errors.New("Intention not found") diff --git a/agent/consul/kvs_endpoint.go b/agent/consul/kvs_endpoint.go index 04dee57b62..c6aee93805 100644 --- a/agent/consul/kvs_endpoint.go +++ b/agent/consul/kvs_endpoint.go @@ -6,6 +6,7 @@ import ( "time" "github.com/armon/go-metrics" + "github.com/armon/go-metrics/prometheus" "github.com/hashicorp/consul/acl" "github.com/hashicorp/consul/agent/consul/state" "github.com/hashicorp/consul/agent/structs" @@ -14,6 +15,13 @@ import ( "github.com/hashicorp/go-memdb" ) +var KVSummaries = []prometheus.SummaryDefinition{ + { + Name: []string{"kvs", "apply"}, + Help: "Measures the time it takes to complete an update to the KV store.", + }, +} + // KVS endpoint is used to manipulate the Key-Value store type KVS struct { srv *Server diff --git a/agent/consul/leader.go b/agent/consul/leader.go index a1d90131ae..d050e297b4 100644 --- a/agent/consul/leader.go +++ b/agent/consul/leader.go @@ -11,6 +11,7 @@ import ( "time" "github.com/armon/go-metrics" + "github.com/armon/go-metrics/prometheus" "github.com/hashicorp/consul/acl" "github.com/hashicorp/consul/agent/metadata" "github.com/hashicorp/consul/agent/structs" @@ -27,6 +28,21 @@ import ( "golang.org/x/time/rate" ) 
+var LeaderSummaries = []prometheus.SummaryDefinition{ + { + Name: []string{"leader", "barrier"}, + Help: "Measures the time spent waiting for the raft barrier upon gaining leadership.", + }, + { + Name: []string{"leader", "reconcileMember"}, + Help: "Measures the time spent updating the raft store for a single serf member's information.", + }, + { + Name: []string{"leader", "reapTombstones"}, + Help: "Measures the time spent clearing tombstones.", + }, +} + const ( newLeaderEvent = "consul:new-leader" barrierWriteTimeout = 2 * time.Minute diff --git a/agent/consul/prepared_query_endpoint.go b/agent/consul/prepared_query_endpoint.go index bb13ff3cb3..360c80b9b7 100644 --- a/agent/consul/prepared_query_endpoint.go +++ b/agent/consul/prepared_query_endpoint.go @@ -6,6 +6,7 @@ import ( "time" "github.com/armon/go-metrics" + "github.com/armon/go-metrics/prometheus" "github.com/hashicorp/consul/acl" "github.com/hashicorp/consul/agent/consul/state" "github.com/hashicorp/consul/agent/structs" @@ -15,6 +16,25 @@ import ( "github.com/hashicorp/go-uuid" ) +var PreparedQuerySummaries = []prometheus.SummaryDefinition{ + { + Name: []string{"prepared-query", "apply"}, + Help: "Measures the time it takes to apply a prepared query update.", + }, + { + Name: []string{"prepared-query", "explain"}, + Help: "Measures the time it takes to process a prepared query explain request.", + }, + { + Name: []string{"prepared-query", "execute"}, + Help: "Measures the time it takes to process a prepared query execute request.", + }, + { + Name: []string{"prepared-query", "execute_remote"}, + Help: "Measures the time it takes to process a prepared query execute request that was forwarded to another datacenter.", + }, +} + // PreparedQuery manages the prepared query endpoint. 
type PreparedQuery struct { srv *Server diff --git a/agent/consul/rpc.go b/agent/consul/rpc.go index ac1096292b..82a656a3a4 100644 --- a/agent/consul/rpc.go +++ b/agent/consul/rpc.go @@ -13,6 +13,7 @@ import ( "time" "github.com/armon/go-metrics" + "github.com/armon/go-metrics/prometheus" "github.com/hashicorp/consul/acl" "github.com/hashicorp/consul/agent/consul/state" "github.com/hashicorp/consul/agent/consul/wanfed" @@ -31,6 +32,47 @@ import ( "github.com/hashicorp/yamux" ) +var RPCCounters = []prometheus.CounterDefinition{ + { + Name: []string{"rpc", "accept_conn"}, + Help: "Increments when a server accepts an RPC connection.", + }, + { + Name: []string{"rpc", "raft_handoff"}, + Help: "Increments when a server accepts a Raft-related RPC connection.", + }, + { + Name: []string{"rpc", "request_error"}, + Help: "Increments when a server returns an error from an RPC request.", + }, + { + Name: []string{"rpc", "request"}, + Help: "Increments when a server receives a Consul-related RPC request.", + }, + { + Name: []string{"rpc", "cross-dc"}, + Help: "Increments when a server sends a (potentially blocking) cross datacenter RPC query.", + }, + { + Name: []string{"rpc", "query"}, + Help: "Increments when a server receives a new blocking RPC request, indicating the rate of new blocking query calls.", + }, +} + +var RPCGauges = []prometheus.GaugeDefinition{ + { + Name: []string{"rpc", "queries_blocking"}, + Help: "Shows the current number of in-flight blocking queries the server is handling.", + }, +} + +var RPCSummaries = []prometheus.SummaryDefinition{ + { + Name: []string{"rpc", "consistentRead"}, + Help: "Measures the time spent confirming that a consistent read can be performed.", + }, +} + const ( // jitterFraction is a the limit to the amount of jitter we apply // to a user specified MaxQueryTime. 
We divide the specified time by diff --git a/agent/consul/segment_oss.go b/agent/consul/segment_oss.go index 11b06b6959..690132c347 100644 --- a/agent/consul/segment_oss.go +++ b/agent/consul/segment_oss.go @@ -7,10 +7,18 @@ import ( "time" "github.com/armon/go-metrics" + "github.com/armon/go-metrics/prometheus" "github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/serf/serf" ) +var SegmentOSSSummaries = []prometheus.SummaryDefinition{ + { + Name: []string{"leader", "reconcile"}, + Help: "Measures the time spent updating the raft store from the serf member information.", + }, +} + // LANMembersAllSegments returns members from all segments. func (s *Server) LANMembersAllSegments() ([]serf.Member, error) { return s.LANMembers(), nil diff --git a/agent/consul/server.go b/agent/consul/server.go index 13fece4060..5db589d3a1 100644 --- a/agent/consul/server.go +++ b/agent/consul/server.go @@ -17,7 +17,7 @@ import ( "sync/atomic" "time" - metrics "github.com/armon/go-metrics" + "github.com/armon/go-metrics" connlimit "github.com/hashicorp/go-connlimit" "github.com/hashicorp/go-hclog" "github.com/hashicorp/go-memdb" @@ -50,6 +50,8 @@ import ( "github.com/hashicorp/consul/types" ) +// NOTE The "consul.client.rpc" and "consul.client.rpc.exceeded" counters are defined in consul/client.go + // These are the protocol versions that Consul can _understand_. These are // Consul-level protocol versions, that are used to configure the Serf // protocol versions. 
diff --git a/agent/consul/session_endpoint.go b/agent/consul/session_endpoint.go index 3ac8b41dc0..d3d3604883 100644 --- a/agent/consul/session_endpoint.go +++ b/agent/consul/session_endpoint.go @@ -5,6 +5,7 @@ import ( "time" "github.com/armon/go-metrics" + "github.com/armon/go-metrics/prometheus" "github.com/hashicorp/consul/acl" "github.com/hashicorp/consul/agent/consul/state" "github.com/hashicorp/consul/agent/structs" @@ -13,6 +14,17 @@ import ( "github.com/hashicorp/go-uuid" ) +var SessionEndpointSummaries = []prometheus.SummaryDefinition{ + { + Name: []string{"session", "apply"}, + Help: "Measures the time spent applying a session update.", + }, + { + Name: []string{"session", "renew"}, + Help: "Measures the time spent renewing a session.", + }, +} + // Session endpoint is used to manipulate sessions for KV type Session struct { srv *Server diff --git a/agent/consul/session_ttl.go b/agent/consul/session_ttl.go index 4afdc0e382..15c77a24a2 100644 --- a/agent/consul/session_ttl.go +++ b/agent/consul/session_ttl.go @@ -5,9 +5,32 @@ import ( "time" "github.com/armon/go-metrics" + "github.com/armon/go-metrics/prometheus" "github.com/hashicorp/consul/agent/structs" ) +var SessionGauges = []prometheus.GaugeDefinition{ + { + Name: []string{"session_ttl", "active"}, + Help: "Tracks the active number of sessions being tracked.", + }, + { + Name: []string{"raft", "applied_index"}, + Help: "", + }, + { + Name: []string{"raft", "last_index"}, + Help: "", + }, +} + +var SessionSummaries = []prometheus.SummaryDefinition{ + { + Name: []string{"session_ttl", "invalidate"}, + Help: "Measures the time spent invalidating an expired session.", + }, +} + const ( // maxInvalidateAttempts limits how many invalidate attempts are made maxInvalidateAttempts = 6 diff --git a/agent/consul/txn_endpoint.go b/agent/consul/txn_endpoint.go index 9819d63704..9febc8b89f 100644 --- a/agent/consul/txn_endpoint.go +++ b/agent/consul/txn_endpoint.go @@ -5,12 +5,24 @@ import ( "time" 
"github.com/armon/go-metrics" + "github.com/armon/go-metrics/prometheus" "github.com/hashicorp/consul/acl" "github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/consul/api" "github.com/hashicorp/go-hclog" ) +var TxnSummaries = []prometheus.SummaryDefinition{ + { + Name: []string{"txn", "apply"}, + Help: "Measures the time spent applying a transaction operation.", + }, + { + Name: []string{"txn", "read"}, + Help: "Measures the time spent returning a read transaction.", + }, +} + // Txn endpoint is used to perform multi-object atomic transactions. type Txn struct { srv *Server diff --git a/agent/consul/usagemetrics/usagemetrics.go b/agent/consul/usagemetrics/usagemetrics.go index 259c6646e1..da09890e5f 100644 --- a/agent/consul/usagemetrics/usagemetrics.go +++ b/agent/consul/usagemetrics/usagemetrics.go @@ -5,12 +5,29 @@ import ( "errors" "time" + "github.com/armon/go-metrics/prometheus" + "github.com/armon/go-metrics" "github.com/hashicorp/consul/agent/consul/state" "github.com/hashicorp/consul/logging" "github.com/hashicorp/go-hclog" ) +var Gauges = []prometheus.GaugeDefinition{ + { + Name: []string{"consul", "state", "nodes"}, + Help: "Measures the current number of nodes registered with Consul. It is only emitted by Consul servers. Added in v1.9.0.", + }, + { + Name: []string{"consul", "state", "services"}, + Help: "Measures the current number of unique services registered with Consul, based on service name. It is only emitted by Consul servers. Added in v1.9.0.", + }, + { + Name: []string{"consul", "state", "service_instances"}, + Help: "Measures the current number of unique service instances registered with Consul. It is only emitted by Consul servers.
Added in v1.9.0.", + }, +} + // Config holds the settings for various parameters for the // UsageMetricsReporter type Config struct { diff --git a/agent/dns.go b/agent/dns.go index a9063e26f4..d8e20003dd 100644 --- a/agent/dns.go +++ b/agent/dns.go @@ -10,6 +10,8 @@ import ( "sync/atomic" "time" + "github.com/armon/go-metrics/prometheus" + metrics "github.com/armon/go-metrics" radix "github.com/armon/go-radix" "github.com/coredns/coredns/plugin/pkg/dnsutil" @@ -26,6 +28,24 @@ import ( "github.com/hashicorp/consul/logging" ) +var DNSCounters = []prometheus.CounterDefinition{ + { + Name: []string{"dns", "stale_queries"}, + Help: "Increments when an agent serves a query within the allowed stale threshold.", + }, +} + +var DNSSummaries = []prometheus.SummaryDefinition{ + { + Name: []string{"dns", "ptr_query"}, + Help: "Measures the time spent handling a reverse DNS query for the given node.", + }, + { + Name: []string{"dns", "domain_query"}, + Help: "Measures the time spent handling a domain query for the given node.", + }, +} + const ( // UDP can fit ~25 A records in a 512B response, and ~14 AAAA // records. 
Limit further to prevent unintentional configuration diff --git a/agent/grpc/stats.go b/agent/grpc/stats.go index eeb8eb379d..7ba96f91f4 100644 --- a/agent/grpc/stats.go +++ b/agent/grpc/stats.go @@ -5,10 +5,48 @@ import ( "sync/atomic" "github.com/armon/go-metrics" + "github.com/armon/go-metrics/prometheus" "google.golang.org/grpc" "google.golang.org/grpc/stats" ) +var StatsGauges = []prometheus.GaugeDefinition{ + { + Name: []string{"grpc", "server", "connections"}, + Help: "Measures the number of active gRPC connections open on the server.", + }, + { + Name: []string{"grpc", "client", "connections"}, + Help: "Measures the number of active gRPC connections open from the client agent to any Consul servers.", + }, + { + Name: []string{"grpc", "server", "streams"}, + Help: "Measures the number of active gRPC streams handled by the server.", + }, +} +var StatsCounters = []prometheus.CounterDefinition{ + { + Name: []string{"grpc", "client", "request", "count"}, + Help: "Counts the number of gRPC requests made by the client agent to a Consul server.", + }, + { + Name: []string{"grpc", "server", "request", "count"}, + Help: "Counts the number of gRPC requests received by the server.", + }, + { + Name: []string{"grpc", "client", "connection", "count"}, + Help: "Counts the number of new gRPC connections opened by the client agent to a Consul server.", + }, + { + Name: []string{"grpc", "server", "connection", "count"}, + Help: "Counts the number of new gRPC connections received by the server.", + }, + { + Name: []string{"grpc", "server", "stream", "count"}, + Help: "Counts the number of new gRPC streams received by the server.", + }, +} + var defaultMetrics = metrics.Default // statsHandler is a grpc/stats.StatsHandler which emits connection and diff --git a/agent/http.go b/agent/http.go index c1c2b5b1a9..93c6cdc8f7 100644 --- a/agent/http.go +++ b/agent/http.go @@ -17,6 +17,7 @@ import ( "github.com/NYTimes/gziphandler" "github.com/armon/go-metrics" + 
"github.com/armon/go-metrics/prometheus" "github.com/hashicorp/consul/acl" "github.com/hashicorp/consul/agent/cache" "github.com/hashicorp/consul/agent/config" @@ -31,6 +32,13 @@ import ( "github.com/pkg/errors" ) +var HTTPSummaries = []prometheus.SummaryDefinition{ + { + Name: []string{"api", "http"}, + Help: "Samples how long it takes to service the given HTTP request for the given verb and path.", + }, +} + // MethodNotAllowedError should be returned by a handler when the HTTP method is not allowed. type MethodNotAllowedError struct { Method string diff --git a/agent/local/state.go b/agent/local/state.go index be0c481f3d..b4414e9109 100644 --- a/agent/local/state.go +++ b/agent/local/state.go @@ -9,8 +9,8 @@ import ( "sync/atomic" "time" - metrics "github.com/armon/go-metrics" - + "github.com/armon/go-metrics" + "github.com/armon/go-metrics/prometheus" "github.com/hashicorp/consul/acl" "github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/consul/agent/token" @@ -20,6 +20,33 @@ import ( "github.com/hashicorp/go-hclog" ) +var StateCounters = []prometheus.CounterDefinition{ + { + Name: []string{"acl", "blocked", "service", "registration"}, + Help: "Increments whenever a registration fails for a service (blocked by an ACL)", + }, + { + Name: []string{"acl", "blocked", "service", "deregistration"}, + Help: "Increments whenever a deregistration fails for a service (blocked by an ACL)", + }, + { + Name: []string{"acl", "blocked", "check", "registration"}, + Help: "Increments whenever a registration fails for a check (blocked by an ACL)", + }, + { + Name: []string{"acl", "blocked", "check", "deregistration"}, + Help: "Increments whenever a deregistration fails for a check (blocked by an ACL)", + }, + { + Name: []string{"acl", "blocked", "node", "registration"}, + Help: "Increments whenever a registration fails for a node (blocked by an ACL)", + }, + { + Name: []string{"acl", "blocked", "node", "deregistration"}, + Help: "Increments whenever a deregistration 
fails for a node (blocked by an ACL)", + }, +} + const fullSyncReadMaxStale = 2 * time.Second // Config is the configuration for the State. diff --git a/agent/setup.go b/agent/setup.go index 96265ef24a..9efc565c92 100644 --- a/agent/setup.go +++ b/agent/setup.go @@ -8,6 +8,12 @@ import ( "sync" "time" + "github.com/hashicorp/consul/agent/consul/fsm" + + "github.com/armon/go-metrics/prometheus" + "github.com/hashicorp/consul/agent/consul/usagemetrics" + "github.com/hashicorp/consul/agent/local" + "github.com/hashicorp/go-hclog" "google.golang.org/grpc/grpclog" grpcresolver "google.golang.org/grpc/resolver" @@ -72,6 +78,10 @@ func NewBaseDeps(configLoader ConfigLoader, logOut io.Writer) (BaseDeps, error) return d, fmt.Errorf("failed to setup node ID: %w", err) } + gauges, counters, summaries := getPrometheusDefs(cfg.Telemetry) + cfg.Telemetry.PrometheusOpts.GaugeDefinitions = gauges + cfg.Telemetry.PrometheusOpts.CounterDefinitions = counters + cfg.Telemetry.PrometheusOpts.SummaryDefinitions = summaries d.MetricsHandler, err = lib.InitTelemetry(cfg.Telemetry) if err != nil { return d, fmt.Errorf("failed to initialize telemetry: %w", err) @@ -177,3 +187,119 @@ func registerWithGRPC(b grpcresolver.Builder) { defer registerLock.Unlock() grpcresolver.Register(b) } + +// getPrometheusDefs reaches into every slice of prometheus defs we've defined in each part of the agent, and appends +// all of our slices into one nice slice of definitions per metric type for the Consul agent to pass to go-metrics. 
+func getPrometheusDefs(cfg lib.TelemetryConfig) ([]prometheus.GaugeDefinition, []prometheus.CounterDefinition, []prometheus.SummaryDefinition) { + // Build slice of slices for all gauge definitions + var gauges = [][]prometheus.GaugeDefinition{ + cache.Gauges, + consul.AutopilotGauges, + consul.RPCGauges, + consul.SessionGauges, + grpc.StatsGauges, + usagemetrics.Gauges, + } + // Flatten definitions + // NOTE(kit): Do we actually want to create a set here so we can ensure definition names are unique? + var gaugeDefs []prometheus.GaugeDefinition + for _, g := range gauges { + // Set Consul to each definition's namespace + // TODO(kit): Prepending the service to each definition should be handled by go-metrics + var withService []prometheus.GaugeDefinition + for _, gauge := range g { + gauge.Name = append([]string{cfg.MetricsPrefix}, gauge.Name...) + withService = append(withService, gauge) + } + gaugeDefs = append(gaugeDefs, withService...) + } + + raftCounters := []prometheus.CounterDefinition{ + // TODO(kit): "raft..." metrics come from the raft lib and we should migrate these to a telemetry + // package within. In the mean time, we're going to define a few here because they're key to monitoring Consul. + { + Name: []string{"raft", "apply"}, + Help: "This counts the number of Raft transactions occurring over the interval.", + }, + { + Name: []string{"raft", "state", "candidate"}, + Help: "This increments whenever a Consul server starts an election.", + }, + { + Name: []string{"raft", "state", "leader"}, + Help: "This increments whenever a Consul server becomes a leader.", + }, + } + + var counters = [][]prometheus.CounterDefinition{ + CatalogCounters, + cache.Counters, + consul.ACLCounters, + consul.CatalogCounters, + consul.ClientCounters, + consul.RPCCounters, + grpc.StatsCounters, + local.StateCounters, + raftCounters, + } + // Flatten definitions + // NOTE(kit): Do we actually want to create a set here so we can ensure definition names are unique? 
+ var counterDefs []prometheus.CounterDefinition + for _, c := range counters { + // TODO(kit): Prepending the service to each definition should be handled by go-metrics + var withService []prometheus.CounterDefinition + for _, counter := range c { + counter.Name = append([]string{cfg.MetricsPrefix}, counter.Name...) + withService = append(withService, counter) + } + counterDefs = append(counterDefs, withService...) + } + + raftSummaries := []prometheus.SummaryDefinition{ + // TODO(kit): "raft..." metrics come from the raft lib and we should migrate these to a telemetry + // package within. In the mean time, we're going to define a few here because they're key to monitoring Consul. + { + Name: []string{"raft", "commitTime"}, + Help: "This measures the time it takes to commit a new entry to the Raft log on the leader.", + }, + { + Name: []string{"raft", "leader", "lastContact"}, + Help: "Measures the time since the leader was last able to contact the follower nodes when checking its leader lease.", + }, + } + + var summaries = [][]prometheus.SummaryDefinition{ + HTTPSummaries, + consul.ACLSummaries, + consul.ACLEndpointSummaries, + consul.ACLEndpointLegacySummaries, + consul.CatalogSummaries, + consul.FederationStateSummaries, + consul.IntentionSummaries, + consul.KVSummaries, + consul.LeaderSummaries, + consul.PreparedQuerySummaries, + consul.RPCSummaries, + consul.SegmentOSSSummaries, + consul.SessionSummaries, + consul.SessionEndpointSummaries, + consul.TxnSummaries, + fsm.CommandsSummaries, + fsm.SnapshotSummaries, + raftSummaries, + } + // Flatten definitions + // NOTE(kit): Do we actually want to create a set here so we can ensure definition names are unique? 
+ var summaryDefs []prometheus.SummaryDefinition + for _, s := range summaries { + // TODO(kit): Prepending the service to each definition should be handled by go-metrics + var withService []prometheus.SummaryDefinition + for _, summary := range s { + summary.Name = append([]string{cfg.MetricsPrefix}, summary.Name...) + withService = append(withService, summary) + } + summaryDefs = append(summaryDefs, withService...) + } + + return gaugeDefs, counterDefs, summaryDefs +} diff --git a/connect/proxy/proxy.go b/connect/proxy/proxy.go index 9dc27a06fe..a29cf352e8 100644 --- a/connect/proxy/proxy.go +++ b/connect/proxy/proxy.go @@ -54,6 +54,8 @@ func (p *Proxy) Serve() error { // Initial setup // Setup telemetry if configured + // NOTE(kit): As far as I can tell, all of the metrics in the proxy are generated at runtime, so we + // don't have any static metrics we initialize at start. _, err := lib.InitTelemetry(newCfg.Telemetry) if err != nil { p.logger.Error("proxy telemetry config error", "error", err) diff --git a/lib/telemetry.go b/lib/telemetry.go index 33f7d21008..d85e51d45d 100644 --- a/lib/telemetry.go +++ b/lib/telemetry.go @@ -4,7 +4,7 @@ import ( "reflect" "time" - metrics "github.com/armon/go-metrics" + "github.com/armon/go-metrics" "github.com/armon/go-metrics/circonus" "github.com/armon/go-metrics/datadog" "github.com/armon/go-metrics/prometheus" @@ -154,14 +154,6 @@ type TelemetryConfig struct { // hcl: telemetry { dogstatsd_tags = []string } DogstatsdTags []string `json:"dogstatsd_tags,omitempty" mapstructure:"dogstatsd_tags"` - // PrometheusRetentionTime is the retention time for prometheus metrics if greater than 0. - // A value of 0 disable Prometheus support. Regarding Prometheus, it is considered a good - // practice to put large values here (such as a few days), and at least the interval between - // prometheus requests. 
- // - // hcl: telemetry { prometheus_retention_time = "duration" } - PrometheusRetentionTime time.Duration `json:"prometheus_retention_time,omitempty" mapstructure:"prometheus_retention_time"` - // FilterDefault is the default for whether to allow a metric that's not // covered by the filter. // @@ -199,10 +191,18 @@ type TelemetryConfig struct { // // hcl: telemetry { statsite_address = string } StatsiteAddr string `json:"statsite_address,omitempty" mapstructure:"statsite_address"` + + // PrometheusOpts provides configuration for the PrometheusSink. Currently the only configuration + // we acquire from hcl is the retention time. We also use definition slices that are set in agent setup + // before being passed to InitTelemetry. + // + // hcl: telemetry { prometheus_retention_time = "duration" } + PrometheusOpts prometheus.PrometheusOpts } // MergeDefaults copies any non-zero field from defaults into the current // config. +// TODO(kit): We no longer use this function and can probably delete it func (c *TelemetryConfig) MergeDefaults(defaults *TelemetryConfig) { if defaults == nil { return @@ -221,6 +221,10 @@ func (c *TelemetryConfig) MergeDefaults(defaults *TelemetryConfig) { // implementing this for the types we actually have for now. Test failure // should catch the case where we add new types later.
switch f.Kind() { + case reflect.Struct: + if f.Type() == reflect.TypeOf(prometheus.PrometheusOpts{}) { + continue + } case reflect.Slice: if !f.IsNil() { continue @@ -277,80 +281,12 @@ func dogstatdSink(cfg TelemetryConfig, hostname string) (metrics.MetricSink, err } func prometheusSink(cfg TelemetryConfig, hostname string) (metrics.MetricSink, error) { - if cfg.PrometheusRetentionTime.Nanoseconds() < 1 { + + if cfg.PrometheusOpts.Expiration.Nanoseconds() < 1 { return nil, nil } - // TODO(kit) define these in vars in the package/file they're used - gaugeDefs := []prometheus.GaugeDefinition{ - { - Name: []string{"consul", "autopilot", "healthy"}, - Help: "This tracks the overall health of the local server cluster. 1 if all servers are healthy, 0 if one or more are unhealthy.", - }, - } - - // TODO(kit) define these in vars in the package/file they're used - counterDefs := []prometheus.CounterDefinition{ - { - Name: []string{"consul", "raft", "apply"}, - Help: "This counts the number of Raft transactions occurring over the interval.", - }, - { - Name: []string{"consul", "raft", "state", "candidate"}, - Help: "This increments whenever a Consul server starts an election.", - }, - { - Name: []string{"consul", "raft", "state", "leader"}, - Help: "This increments whenever a Consul server becomes a leader.", - }, - { - Name: []string{"consul", "client", "api", "catalog_register"}, - Help: "Increments whenever a Consul agent receives a catalog register request.", - }, - { - Name: []string{"consul", "runtime", "total_gc_pause_ns"}, - Help: "Number of nanoseconds consumed by stop-the-world garbage collection (GC) pauses since Consul started.", - }, - { - Name: []string{"consul", "client", "rpc"}, - Help: "Increments whenever a Consul agent in client mode makes an RPC request to a Consul server.", - }, - { - Name: []string{"consul", "client", "rpc", "exceeded"}, - Help: "Increments whenever a Consul agent in client mode makes an RPC request to a Consul server gets rate 
limited by that agent's limits configuration.", - }, - { - Name: []string{"consul", "client", "rpc", "failed"}, - Help: "Increments whenever a Consul agent in client mode makes an RPC request to a Consul server and fails.", - }, - } - - // TODO(kit) define these in vars in the package/file they're used - summaryDefs := []prometheus.SummaryDefinition{ - { - Name: []string{"consul", "kvs", "apply"}, - Help: "This measures the time it takes to complete an update to the KV store.", - }, - { - Name: []string{"consul", "txn", "apply"}, - Help: "This measures the time spent applying a transaction operation.", - }, - { - Name: []string{"consul", "raft", "commitTime"}, - Help: "This measures the time it takes to commit a new entry to the Raft log on the leader.", - }, - { - Name: []string{"consul", "raft", "leader", "lastContact"}, - Help: "Measures the time since the leader was last able to contact the follower nodes when checking its leader lease.", - }, - } - prometheusOpts := prometheus.PrometheusOpts{ - Expiration: cfg.PrometheusRetentionTime, - GaugeDefinitions: gaugeDefs, - CounterDefinitions: counterDefs, - SummaryDefinitions: summaryDefs, - } - sink, err := prometheus.NewPrometheusSinkFrom(prometheusOpts) + sink, err := prometheus.NewPrometheusSinkFrom(cfg.PrometheusOpts) if err != nil { return nil, err } diff --git a/lib/telemetry_test.go b/lib/telemetry_test.go index f81b7b5c1a..4ee012f1ec 100644 --- a/lib/telemetry_test.go +++ b/lib/telemetry_test.go @@ -5,11 +5,14 @@ import ( "testing" "time" + "github.com/armon/go-metrics/prometheus" + "github.com/stretchr/testify/require" ) func makeFullTelemetryConfig(t *testing.T) TelemetryConfig { var ( + promOpts =
prometheus.PrometheusOpts{} strSliceVal = []string{"foo"} strVal = "foo" intVal = int64(1 * time.Second) @@ -27,6 +30,12 @@ func makeFullTelemetryConfig(t *testing.T) TelemetryConfig { // now for brevity but will fail the test if a new field type is added since // this is likely not implemented in MergeDefaults either. switch f.Kind() { + case reflect.Struct: + if f.Type() != reflect.TypeOf(promOpts) { + t.Fatalf("unknown struct type in TelemetryConfig: actual %v, expected: %v", f.Type(), reflect.TypeOf(promOpts)) + } + // TODO(kit): This should delve into the fields and set them individually rather than using an empty struct + f.Set(reflect.ValueOf(promOpts)) case reflect.Slice: if f.Type() != reflect.TypeOf(strSliceVal) { t.Fatalf("unknown slice type in TelemetryConfig." +