From 5c09dc322ea05b2e292157ef6b7acc689e597486 Mon Sep 17 00:00:00 2001 From: Kit Patella Date: Thu, 19 Nov 2020 13:29:44 -0800 Subject: [PATCH] add telemetry and definition help entries for missing catalog and acl metrics --- agent/catalog_endpoint.go | 8 +-- agent/consul/acl.go | 6 +- website/pages/docs/agent/telemetry.mdx | 97 ++++++++++++++------------ 3 files changed, 59 insertions(+), 52 deletions(-) diff --git a/agent/catalog_endpoint.go b/agent/catalog_endpoint.go index 188c1bfb20..f906954b4b 100644 --- a/agent/catalog_endpoint.go +++ b/agent/catalog_endpoint.go @@ -86,7 +86,7 @@ var CatalogCounters = []prometheus.CounterDefinition{ }, { Name: []string{"client", "api", "error", "catalog_service_nodes"}, - Help: "", + Help: "Increments whenever a Consul agent receives an RPC error for request to list nodes offering a service.", }, { Name: []string{"client", "api", "catalog_node_services"}, @@ -102,15 +102,15 @@ var CatalogCounters = []prometheus.CounterDefinition{ }, { Name: []string{"client", "api", "catalog_node_service_list"}, - Help: "", + Help: "Increments whenever a Consul agent receives a request to list a node's registered services.", }, { Name: []string{"client", "rpc", "error", "catalog_node_service_list"}, - Help: "", + Help: "Increments whenever a Consul agent receives an RPC error for request to list a node's registered services.", }, { Name: []string{"client", "api", "success", "catalog_node_service_list"}, - Help: "", + Help: "Increments whenever a Consul agent successfully responds to a request to list a node's registered services.", }, { Name: []string{"client", "api", "catalog_gateway_services"}, diff --git a/agent/consul/acl.go b/agent/consul/acl.go index f51cd6e183..8e73b78040 100644 --- a/agent/consul/acl.go +++ b/agent/consul/acl.go @@ -19,11 +19,11 @@ import ( var ACLCounters = []prometheus.CounterDefinition{ { Name: []string{"acl", "token", "cache_hit"}, - Help: "", + Help: "Increments if Consul is able to resolve a token's identity, or a legacy token, from the cache.", }, { Name: []string{"acl", "token", "cache_miss"}, - Help: "", + Help: "Increments if Consul cannot resolve a token's identity, or a legacy token, from the cache.", }, } @@ -38,7 +38,7 @@ var ACLSummaries = []prometheus.SummaryDefinition{ }, { Name: []string{"acl", "ResolveTokenToIdentity"}, - Help: "", + Help: "This measures the time it takes to resolve an ACL token to an Identity.", }, } diff --git a/website/pages/docs/agent/telemetry.mdx b/website/pages/docs/agent/telemetry.mdx index ce1fc9d93e..2ed0254e01 100644 --- a/website/pages/docs/agent/telemetry.mdx +++ b/website/pages/docs/agent/telemetry.mdx @@ -137,48 +137,52 @@ when retrieving metrics from the built-in store using the above described signal This is a full list of metrics emitted by Consul. -| Metric | Description | Unit | Type | -| ----------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------------- | ------- | -| `consul.acl.blocked.service.registration` | This increments whenever a deregistration fails for a service (blocked by an ACL) | requests | counter | -| `consul.acl.blocked..registration` | This increments whenever a registration fails for an entity (check, node or service) is blocked by an ACL | requests | counter | -| `consul.api.http` | Migrated from consul.http.. this samples how long it takes to service the given HTTP request for the given verb and path. Includes labels for `path` and `method`. `path` does not include details like service or key names, for these an underscore will be present as a placeholder (eg. path=`v1.kv._`) | ms | timer | -| `consul.client.rpc` | This increments whenever a Consul agent in client mode makes an RPC request to a Consul server. This gives a measure of how much a given agent is loading the Consul servers. Currently, this is only generated by agents in client mode, not Consul servers. | requests | counter | -| `consul.client.rpc.exceeded` | This increments whenever a Consul agent in client mode makes an RPC request to a Consul server gets rate limited by that agent's [`limits`](/docs/agent/options#limits) configuration. This gives an indication that there's an abusive application making too many requests on the agent, or that the rate limit needs to be increased. Currently, this only applies to agents in client mode, not Consul servers. | rejected requests | counter | -| `consul.client.rpc.failed` | This increments whenever a Consul agent in client mode makes an RPC request to a Consul server and fails. | requests | counter | -| `consul.client.api.catalog_register.` | This increments whenever a Consul agent receives a catalog register request. | requests | counter | -| `consul.client.api.success.catalog_register.` | This increments whenever a Consul agent successfully responds to a catalog register request. | requests | counter | -| `consul.client.rpc.error.catalog_register.` | This increments whenever a Consul agent receives an RPC error for a catalog register request. | errors | counter | -| `consul.client.api.catalog_deregister.` | This increments whenever a Consul agent receives a catalog deregister request. | requests | counter | -| `consul.client.api.success.catalog_deregister.` | This increments whenever a Consul agent successfully responds to a catalog deregister request. | requests | counter | -| `consul.client.rpc.error.catalog_deregister.` | This increments whenever a Consul agent receives an RPC error for a catalog deregister request. | errors | counter | -| `consul.client.api.catalog_datacenters.` | This increments whenever a Consul agent receives a request to list datacenters in the catalog. | requests | counter | -| `consul.client.api.success.catalog_datacenters.` | This increments whenever a Consul agent successfully responds to a request to list datacenters. | requests | counter | -| `consul.client.rpc.error.catalog_datacenters.` | This increments whenever a Consul agent receives an RPC error for a request to list datacenters. | errors | counter | -| `consul.client.api.catalog_nodes.` | This increments whenever a Consul agent receives a request to list nodes from the catalog. | requests | counter | -| `consul.client.api.success.catalog_nodes.` | This increments whenever a Consul agent successfully responds to a request to list nodes. | requests | counter | -| `consul.client.rpc.error.catalog_nodes.` | This increments whenever a Consul agent receives an RPC error for a request to list nodes. | errors | counter | -| `consul.client.api.catalog_services.` | This increments whenever a Consul agent receives a request to list services from the catalog. | requests | counter | -| `consul.client.api.success.catalog_services.` | This increments whenever a Consul agent successfully responds to a request to list services. | requests | counter | -| `consul.client.rpc.error.catalog_services.` | This increments whenever a Consul agent receives an RPC error for a request to list services. | errors | counter | -| `consul.client.api.catalog_service_nodes.` | This increments whenever a Consul agent receives a request to list nodes offering a service. | requests | counter | -| `consul.client.api.success.catalog_service_nodes.` | This increments whenever a Consul agent successfully responds to a request to list nodes offering a service. | requests | counter | -| `consul.client.rpc.error.catalog_service_nodes.` | This increments whenever a Consul agent receives an RPC error for a request to list nodes offering a service. | errors | counter | -| `consul.client.api.catalog_node_services.` | This increments whenever a Consul agent receives a request to list services registered in a node. | requests | counter | -| `consul.client.api.success.catalog_node_services.` | This increments whenever a Consul agent successfully responds to a request to list services in a node. | requests | counter | -| `consul.client.rpc.error.catalog_node_services.` | This increments whenever a Consul agent receives an RPC error for a request to list services in a node. | errors | counter | -| `consul.client.api.catalog_gateway_services.` | This increments whenever a Consul agent receives a request to list services associated with a gateway. | requests | counter | -| `consul.client.api.success.catalog_gateway_services.` | This increments whenever a Consul agent successfully responds to a request to list services associated with a gateway. | requests | counter | -| `consul.client.rpc.error.catalog_gateway_services.` | This increments whenever a Consul agent receives an RPC error for a request to list services associated with a gateway. | errors | counter | -| `consul.runtime.num_goroutines` | This tracks the number of running goroutines and is a general load pressure indicator. This may burst from time to time but should return to a steady state value. | number of goroutines | gauge | -| `consul.runtime.alloc_bytes` | This measures the number of bytes allocated by the Consul process. This may burst from time to time but should return to a steady state value. | bytes | gauge | -| `consul.runtime.heap_objects` | This measures the number of objects allocated on the heap and is a general memory pressure indicator. This may burst from time to time but should return to a steady state value. | number of objects | gauge | -| `consul.state.nodes` | This measures the current number of nodes registered with Consul. It is only emitted by Consul servers. Added in v1.9.0. | number of objects | gauge | -| `consul.state.services` | This measures the current number of unique services registered with Consul, based on service name. It is only emitted by Consul servers. Added in v1.9.0. | number of objects | gauge | -| `consul.state.service_instances` | This measures the current number of unique service instances registered with Consul. It is only emitted by Consul servers. Added in v1.9.0. | number of objects | gauge | -| `consul.dns.stale_queries` | This increments when an agent serves a query within the allowed stale threshold. | queries | counter | -| `consul.dns.ptr_query.` | This measures the time spent handling a reverse DNS query for the given node. | ms | timer | -| `consul.dns.domain_query.` | This measures the time spent handling a domain query for the given node. | ms | timer | -| `consul.http...` | DEPRECATED IN 1.9: This tracks how long it takes to service the given HTTP request for the given verb and path. Paths do not include details like service or key names, for these an underscore will be present as a placeholder (eg. `consul.http.GET.v1.kv._`) | ms | timer | +| Metric | Description | Unit | Type | +| -------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------------- | ------- | +| `consul.acl.blocked.{check,node,service}.deregistration` | This increments whenever a deregistration fails for an entity (check, node or service) is blocked by an ACL. | requests | counter | +| `consul.acl.blocked.{check,node,service}.registration` | This increments whenever a registration fails for an entity (check, node or service) is blocked by an ACL. | requests | counter | +| `consul.api.http` | Migrated from consul.http.. this samples how long it takes to service the given HTTP request for the given verb and path. Includes labels for `path` and `method`. `path` does not include details like service or key names, for these an underscore will be present as a placeholder (eg. path=`v1.kv._`) | ms | timer | +| `consul.client.rpc` | This increments whenever a Consul agent in client mode makes an RPC request to a Consul server. This gives a measure of how much a given agent is loading the Consul servers. Currently, this is only generated by agents in client mode, not Consul servers. | requests | counter | +| `consul.client.rpc.exceeded` | This increments whenever a Consul agent in client mode makes an RPC request to a Consul server gets rate limited by that agent's [`limits`](/docs/agent/options#limits) configuration. This gives an indication that there's an abusive application making too many requests on the agent, or that the rate limit needs to be increased. Currently, this only applies to agents in client mode, not Consul servers. | rejected requests | counter | +| `consul.client.rpc.failed` | This increments whenever a Consul agent in client mode makes an RPC request to a Consul server and fails. | requests | counter | +| `consul.client.api.catalog_register.` | This increments whenever a Consul agent receives a catalog register request. | requests | counter | +| `consul.client.api.success.catalog_register.` | This increments whenever a Consul agent successfully responds to a catalog register request. | requests | counter | +| `consul.client.rpc.error.catalog_register.` | This increments whenever a Consul agent receives an RPC error for a catalog register request. | errors | counter | +| `consul.client.api.catalog_deregister.` | This increments whenever a Consul agent receives a catalog deregister request. | requests | counter | +| `consul.client.api.success.catalog_deregister.` | This increments whenever a Consul agent successfully responds to a catalog deregister request. | requests | counter | +| `consul.client.rpc.error.catalog_deregister.` | This increments whenever a Consul agent receives an RPC error for a catalog deregister request. | errors | counter | +| `consul.client.api.catalog_datacenters.` | This increments whenever a Consul agent receives a request to list datacenters in the catalog. | requests | counter | +| `consul.client.api.success.catalog_datacenters.` | This increments whenever a Consul agent successfully responds to a request to list datacenters. | requests | counter | +| `consul.client.rpc.error.catalog_datacenters.` | This increments whenever a Consul agent receives an RPC error for a request to list datacenters. | errors | counter | +| `consul.client.api.catalog_nodes.` | This increments whenever a Consul agent receives a request to list nodes from the catalog. | requests | counter | +| `consul.client.api.success.catalog_nodes.` | This increments whenever a Consul agent successfully responds to a request to list nodes. | requests | counter | +| `consul.client.rpc.error.catalog_nodes.` | This increments whenever a Consul agent receives an RPC error for a request to list nodes. | errors | counter | +| `consul.client.api.catalog_services.` | This increments whenever a Consul agent receives a request to list services from the catalog. | requests | counter | +| `consul.client.api.success.catalog_services.` | This increments whenever a Consul agent successfully responds to a request to list services. | requests | counter | +| `consul.client.rpc.error.catalog_services.` | This increments whenever a Consul agent receives an RPC error for a request to list services. | errors | counter | +| `consul.client.api.catalog_service_nodes.` | This increments whenever a Consul agent receives a request to list nodes offering a service. | requests | counter | +| `consul.client.api.success.catalog_service_nodes.` | This increments whenever a Consul agent successfully responds to a request to list nodes offering a service. | requests | counter | +| `consul.client.api.error.catalog_service_nodes.` | Increments whenever a Consul agent receives an RPC error for request to list nodes offering a service. | requests | counter | +| `consul.client.rpc.error.catalog_service_nodes.` | This increments whenever a Consul agent receives an RPC error for a request to list nodes offering a service. | errors | counter | +| `consul.client.api.catalog_node_services.` | This increments whenever a Consul agent receives a request to list services registered in a node. | requests | counter | +| `consul.client.api.success.catalog_node_services.` | This increments whenever a Consul agent successfully responds to a request to list services in a node. | requests | counter | +| `consul.client.rpc.error.catalog_node_services.` | This increments whenever a Consul agent receives an RPC error for a request to list services in a node. | errors | counter | +| `consul.client.api.catalog_node_service_list` | Increments whenever a Consul agent receives a request to list a node's registered services. | requests | counter | +| `consul.client.rpc.error.catalog_node_service_list` | Increments whenever a Consul agent receives an RPC error for request to list a node's registered services. | errors | counter | +| `consul.client.api.success.catalog_node_service_list` | Increments whenever a Consul agent successfully responds to a request to list a node's registered services. | requests | counter | +| `consul.client.api.catalog_gateway_services.` | This increments whenever a Consul agent receives a request to list services associated with a gateway. | requests | counter | +| `consul.client.api.success.catalog_gateway_services.` | This increments whenever a Consul agent successfully responds to a request to list services associated with a gateway. | requests | counter | +| `consul.client.rpc.error.catalog_gateway_services.` | This increments whenever a Consul agent receives an RPC error for a request to list services associated with a gateway. | errors | counter | +| `consul.runtime.num_goroutines` | This tracks the number of running goroutines and is a general load pressure indicator. This may burst from time to time but should return to a steady state value. | number of goroutines | gauge | +| `consul.runtime.alloc_bytes` | This measures the number of bytes allocated by the Consul process. This may burst from time to time but should return to a steady state value. | bytes | gauge | +| `consul.runtime.heap_objects` | This measures the number of objects allocated on the heap and is a general memory pressure indicator. This may burst from time to time but should return to a steady state value. | number of objects | gauge | +| `consul.state.nodes` | This measures the current number of nodes registered with Consul. It is only emitted by Consul servers. Added in v1.9.0. | number of objects | gauge | +| `consul.state.services` | This measures the current number of unique services registered with Consul, based on service name. It is only emitted by Consul servers. Added in v1.9.0. | number of objects | gauge | +| `consul.state.service_instances` | This measures the current number of unique service instances registered with Consul. It is only emitted by Consul servers. Added in v1.9.0. | number of objects | gauge | +| `consul.dns.stale_queries` | This increments when an agent serves a query within the allowed stale threshold. | queries | counter | +| `consul.dns.ptr_query.` | This measures the time spent handling a reverse DNS query for the given node. | ms | timer | +| `consul.dns.domain_query.` | This measures the time spent handling a domain query for the given node. | ms | timer | +| `consul.http...` | DEPRECATED IN 1.9: This tracks how long it takes to service the given HTTP request for the given verb and path. Paths do not include details like service or key names, for these an underscore will be present as a placeholder (eg. `consul.http.GET.v1.kv._`) | ms | timer | ## Server Health @@ -186,6 +190,12 @@ These metrics are used to monitor the health of the Consul servers. | Metric | Description | Unit | Type | | ----------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------- | ------- | +| `consul.acl.apply` | This measures the time it takes to complete an update to the ACL store. | ms | timer | +| `consul.acl.resolveTokenLegacy` | This measures the time it takes to resolve an ACL token using the legacy ACL system. | ms | timer | +| `consul.acl.ResolveToken` | This measures the time it takes to resolve an ACL token. | ms | timer | +| `consul.acl.ResolveTokenToIdentity` | This measures the time it takes to resolve an ACL token to an Identity. | ms | timer | +| `consul.acl.token.cache_hit` | Increments if Consul is able to resolve a token's identity, or a legacy token, from the cache. | cache read op | counter | +| `consul.acl.token.cache_miss` | Increments if Consul cannot resolve a token's identity, or a legacy token, from the cache. | cache read op | counter | | `consul.raft.fsm.snapshot` | This metric measures the time taken by the FSM to record the current state for the snapshot. | ms | timer | | `consul.raft.fsm.apply` | This metric gives the number of logs committed since the last interval. | commit logs / interval | counter | | `consul.raft.commitNumLogs` | This metric measures the count of logs processed for application to the FSM in a single batch. | logs | gauge | @@ -219,9 +229,6 @@ These metrics are used to monitor the health of the Consul servers. | `consul.raft.replication.appendEntries.rpc` | This metric measures the time taken by the append entries RFC, to replicate the log entries of a leader agent onto its follower agent(s) | ms | timer | | `consul.raft.replication.appendEntries.logs` | This metric measures the number of logs replicated to an agent, to bring it up to speed with the leader's logs. | logs appended/ interval | counter | | `consul.raft.leader.lastContact` | This will only be emitted by the Raft leader and measures the time since the leader was last able to contact the follower nodes when checking its leader lease. It can be used as a measure for how stable the Raft timing is and how close the leader is to timing out its lease.The lease timeout is 500 ms times the [`raft_multiplier` configuration](/docs/agent/options#raft_multiplier), so this telemetry value should not be getting close to that configured value, otherwise the Raft timing is marginal and might need to be tuned, or more powerful servers might be needed. See the [Server Performance](/docs/install/performance) guide for more details. | ms | timer | -| `consul.acl.apply` | This measures the time it takes to complete an update to the ACL store. | ms | timer | -| `consul.acl.resolveTokenLegacy` | This measures the time it takes to resolve an ACL token using the legacy ACL system. | ms | timer | -| `consul.acl.ResolveToken` | This measures the time it takes to resolve an ACL token. | ms | timer | | `consul.rpc.accept_conn` | This increments when a server accepts an RPC connection. | connections | counter | | `consul.catalog.register` | This measures the time it takes to complete a catalog register operation. | ms | timer | | `consul.catalog.deregister` | This measures the time it takes to complete a catalog deregister operation. | ms | timer |