Merge pull request #9198 from hashicorp/mkcp/telemetry/add-all-metric-definitions

Add metric definitions for all metrics known at Consul start
This commit is contained in:
Kit Patella 2020-11-16 15:54:50 -08:00
parent dd857bfa37
commit 88b013be99
35 changed files with 922 additions and 93 deletions

3
.changelog/9198.txt Normal file
View File

@ -0,0 +1,3 @@
```release-note:improvement
agent: All metrics should be present and available to Prometheus scrapers when Consul starts. If any non-deprecated metric is missing, please submit an issue with its name.
```

View File

@ -136,7 +136,7 @@ func (s *HTTPHandlers) AgentMetrics(resp http.ResponseWriter, req *http.Request)
return nil, acl.ErrPermissionDenied return nil, acl.ErrPermissionDenied
} }
if enablePrometheusOutput(req) { if enablePrometheusOutput(req) {
if s.agent.config.Telemetry.PrometheusRetentionTime < 1 { if s.agent.config.Telemetry.PrometheusOpts.Expiration < 1 {
resp.WriteHeader(http.StatusUnsupportedMediaType) resp.WriteHeader(http.StatusUnsupportedMediaType)
fmt.Fprint(resp, "Prometheus is not enabled since its retention time is not positive") fmt.Fprint(resp, "Prometheus is not enabled since its retention time is not positive")
return nil, nil return nil, nil

31
agent/cache/cache.go vendored
View File

@ -24,6 +24,7 @@ import (
"time" "time"
"github.com/armon/go-metrics" "github.com/armon/go-metrics"
"github.com/armon/go-metrics/prometheus"
"golang.org/x/time/rate" "golang.org/x/time/rate"
"github.com/hashicorp/consul/lib" "github.com/hashicorp/consul/lib"
@ -32,6 +33,34 @@ import (
//go:generate mockery -all -inpkg //go:generate mockery -all -inpkg
// Gauges lists the Prometheus gauge definitions for the cache metrics.
//
// TODO(kit): remove the namespace from these once the metrics themselves change
var Gauges = []prometheus.GaugeDefinition{
	{Name: []string{"consul", "cache", "entries_count"}, Help: ""},
}
// Counters lists the Prometheus counter definitions for the cache metrics:
// bypasses, fetch outcomes, and expiry evictions.
//
// TODO(kit): remove the namespace from these once the metrics themselves change
var Counters = []prometheus.CounterDefinition{
	{Name: []string{"consul", "cache", "bypass"}, Help: ""},
	{Name: []string{"consul", "cache", "fetch_success"}, Help: ""},
	{Name: []string{"consul", "cache", "fetch_error"}, Help: ""},
	{Name: []string{"consul", "cache", "evict_expired"}, Help: ""},
}
// Constants related to refresh backoff. We probably don't ever need to // Constants related to refresh backoff. We probably don't ever need to
// make these configurable knobs since they primarily exist to lower load. // make these configurable knobs since they primarily exist to lower load.
const ( const (
@ -629,6 +658,7 @@ func (c *Cache) fetch(key string, r getOptions, allowNew bool, attempt uint, ign
// Error handling // Error handling
if err == nil { if err == nil {
labels := []metrics.Label{{Name: "result_not_modified", Value: strconv.FormatBool(result.NotModified)}} labels := []metrics.Label{{Name: "result_not_modified", Value: strconv.FormatBool(result.NotModified)}}
// TODO(kit): move tEntry.Name to a label on the first write here and deprecate the second write
metrics.IncrCounterWithLabels([]string{"consul", "cache", "fetch_success"}, 1, labels) metrics.IncrCounterWithLabels([]string{"consul", "cache", "fetch_success"}, 1, labels)
metrics.IncrCounterWithLabels([]string{"consul", "cache", tEntry.Name, "fetch_success"}, 1, labels) metrics.IncrCounterWithLabels([]string{"consul", "cache", tEntry.Name, "fetch_success"}, 1, labels)
@ -658,6 +688,7 @@ func (c *Cache) fetch(key string, r getOptions, allowNew bool, attempt uint, ign
newEntry.RefreshLostContact = time.Time{} newEntry.RefreshLostContact = time.Time{}
} }
} else { } else {
// TODO(kit): Add tEntry.Name to label on fetch_error and deprecate second write
metrics.IncrCounter([]string{"consul", "cache", "fetch_error"}, 1) metrics.IncrCounter([]string{"consul", "cache", "fetch_error"}, 1)
metrics.IncrCounter([]string{"consul", "cache", tEntry.Name, "fetch_error"}, 1) metrics.IncrCounter([]string{"consul", "cache", tEntry.Name, "fetch_error"}, 1)

View File

@ -5,11 +5,127 @@ import (
"net/http" "net/http"
"strings" "strings"
metrics "github.com/armon/go-metrics" "github.com/armon/go-metrics"
"github.com/armon/go-metrics/prometheus"
cachetype "github.com/hashicorp/consul/agent/cache-types" cachetype "github.com/hashicorp/consul/agent/cache-types"
"github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/consul/agent/structs"
) )
// CatalogCounters lists the Prometheus counter definitions for the catalog
// HTTP endpoints. Each endpoint generally has three counters: one for requests
// received ("client", "api", <endpoint>), one for RPC errors
// ("client", "rpc", "error", <endpoint>), and one for successful responses
// ("client", "api", "success", <endpoint>).
var CatalogCounters = []prometheus.CounterDefinition{
	{
		Name: []string{"client", "api", "catalog_register"},
		Help: "Increments whenever a Consul agent receives a catalog register request.",
	},
	{
		Name: []string{"client", "rpc", "error", "catalog_register"},
		Help: "Increments whenever a Consul agent receives an RPC error for a catalog register request.",
	},
	{
		Name: []string{"client", "api", "success", "catalog_register"},
		Help: "Increments whenever a Consul agent successfully responds to a catalog register request.",
	},
	{
		Name: []string{"client", "api", "catalog_deregister"},
		Help: "Increments whenever a Consul agent receives a catalog deregister request.",
	},
	{
		Name: []string{"client", "api", "catalog_datacenters"},
		Help: "Increments whenever a Consul agent receives a request to list datacenters in the catalog.",
	},
	{
		Name: []string{"client", "rpc", "error", "catalog_deregister"},
		Help: "Increments whenever a Consul agent receives an RPC error for a catalog deregister request.",
	},
	{
		Name: []string{"client", "api", "success", "catalog_nodes"},
		Help: "Increments whenever a Consul agent successfully responds to a request to list nodes.",
	},
	{
		Name: []string{"client", "rpc", "error", "catalog_nodes"},
		Help: "Increments whenever a Consul agent receives an RPC error for a request to list nodes.",
	},
	{
		Name: []string{"client", "api", "success", "catalog_deregister"},
		Help: "Increments whenever a Consul agent successfully responds to a catalog deregister request.",
	},
	{
		Name: []string{"client", "rpc", "error", "catalog_datacenters"},
		Help: "Increments whenever a Consul agent receives an RPC error for a request to list datacenters.",
	},
	{
		Name: []string{"client", "api", "success", "catalog_datacenters"},
		Help: "Increments whenever a Consul agent successfully responds to a request to list datacenters.",
	},
	{
		Name: []string{"client", "api", "catalog_nodes"},
		Help: "Increments whenever a Consul agent receives a request to list nodes from the catalog.",
	},
	{
		Name: []string{"client", "api", "catalog_services"},
		Help: "Increments whenever a Consul agent receives a request to list services from the catalog.",
	},
	{
		Name: []string{"client", "rpc", "error", "catalog_services"},
		Help: "Increments whenever a Consul agent receives an RPC error for a request to list services.",
	},
	{
		Name: []string{"client", "api", "success", "catalog_services"},
		Help: "Increments whenever a Consul agent successfully responds to a request to list services.",
	},
	{
		Name: []string{"client", "api", "catalog_service_nodes"},
		Help: "Increments whenever a Consul agent receives a request to list nodes offering a service.",
	},
	{
		Name: []string{"client", "rpc", "error", "catalog_service_nodes"},
		Help: "Increments whenever a Consul agent receives an RPC error for a request to list nodes offering a service.",
	},
	{
		Name: []string{"client", "api", "success", "catalog_service_nodes"},
		Help: "Increments whenever a Consul agent successfully responds to a request to list nodes offering a service.",
	},
	{
		Name: []string{"client", "api", "error", "catalog_service_nodes"},
		Help: "",
	},
	{
		// This is the "request received" counter; the success counter for
		// the same endpoint follows it.
		Name: []string{"client", "api", "catalog_node_services"},
		Help: "Increments whenever a Consul agent receives a request to list services in a node.",
	},
	{
		Name: []string{"client", "api", "success", "catalog_node_services"},
		Help: "Increments whenever a Consul agent successfully responds to a request to list services in a node.",
	},
	{
		Name: []string{"client", "rpc", "error", "catalog_node_services"},
		Help: "Increments whenever a Consul agent receives an RPC error for a request to list services in a node.",
	},
	{
		Name: []string{"client", "api", "catalog_node_service_list"},
		Help: "",
	},
	{
		Name: []string{"client", "rpc", "error", "catalog_node_service_list"},
		Help: "",
	},
	{
		Name: []string{"client", "api", "success", "catalog_node_service_list"},
		Help: "",
	},
	{
		Name: []string{"client", "api", "catalog_gateway_services"},
		Help: "Increments whenever a Consul agent receives a request to list services associated with a gateway.",
	},
	{
		Name: []string{"client", "rpc", "error", "catalog_gateway_services"},
		Help: "Increments whenever a Consul agent receives an RPC error for a request to list services associated with a gateway.",
	},
	{
		Name: []string{"client", "api", "success", "catalog_gateway_services"},
		Help: "Increments whenever a Consul agent successfully responds to a request to list services associated with a gateway.",
	},
}
func (s *HTTPHandlers) CatalogRegister(resp http.ResponseWriter, req *http.Request) (interface{}, error) { func (s *HTTPHandlers) CatalogRegister(resp http.ResponseWriter, req *http.Request) (interface{}, error) {
metrics.IncrCounterWithLabels([]string{"client", "api", "catalog_register"}, 1, metrics.IncrCounterWithLabels([]string{"client", "api", "catalog_register"}, 1,
[]metrics.Label{{Name: "node", Value: s.nodeName()}}) []metrics.Label{{Name: "node", Value: s.nodeName()}})

View File

@ -17,6 +17,7 @@ import (
"strings" "strings"
"time" "time"
"github.com/armon/go-metrics/prometheus"
"github.com/hashicorp/go-bexpr" "github.com/hashicorp/go-bexpr"
"github.com/hashicorp/go-hclog" "github.com/hashicorp/go-hclog"
"github.com/hashicorp/go-multierror" "github.com/hashicorp/go-multierror"
@ -942,13 +943,15 @@ func (b *Builder) Build() (rt RuntimeConfig, err error) {
DisableHostname: b.boolVal(c.Telemetry.DisableHostname), DisableHostname: b.boolVal(c.Telemetry.DisableHostname),
DogstatsdAddr: b.stringVal(c.Telemetry.DogstatsdAddr), DogstatsdAddr: b.stringVal(c.Telemetry.DogstatsdAddr),
DogstatsdTags: c.Telemetry.DogstatsdTags, DogstatsdTags: c.Telemetry.DogstatsdTags,
PrometheusRetentionTime: b.durationVal("prometheus_retention_time", c.Telemetry.PrometheusRetentionTime),
FilterDefault: b.boolVal(c.Telemetry.FilterDefault), FilterDefault: b.boolVal(c.Telemetry.FilterDefault),
AllowedPrefixes: telemetryAllowedPrefixes, AllowedPrefixes: telemetryAllowedPrefixes,
BlockedPrefixes: telemetryBlockedPrefixes, BlockedPrefixes: telemetryBlockedPrefixes,
MetricsPrefix: b.stringVal(c.Telemetry.MetricsPrefix), MetricsPrefix: b.stringVal(c.Telemetry.MetricsPrefix),
StatsdAddr: b.stringVal(c.Telemetry.StatsdAddr), StatsdAddr: b.stringVal(c.Telemetry.StatsdAddr),
StatsiteAddr: b.stringVal(c.Telemetry.StatsiteAddr), StatsiteAddr: b.stringVal(c.Telemetry.StatsiteAddr),
PrometheusOpts: prometheus.PrometheusOpts{
Expiration: b.durationVal("prometheus_retention_time", c.Telemetry.PrometheusRetentionTime),
},
}, },
// Agent // Agent

View File

@ -18,6 +18,7 @@ import (
"testing" "testing"
"time" "time"
"github.com/armon/go-metrics/prometheus"
"github.com/stretchr/testify/require" "github.com/stretchr/testify/require"
"github.com/hashicorp/consul/agent/cache" "github.com/hashicorp/consul/agent/cache"
@ -7103,9 +7104,11 @@ func TestFullConfig(t *testing.T) {
AllowedPrefixes: []string{"oJotS8XJ"}, AllowedPrefixes: []string{"oJotS8XJ"},
BlockedPrefixes: []string{"cazlEhGn"}, BlockedPrefixes: []string{"cazlEhGn"},
MetricsPrefix: "ftO6DySn", MetricsPrefix: "ftO6DySn",
PrometheusRetentionTime: 15 * time.Second,
StatsdAddr: "drce87cy", StatsdAddr: "drce87cy",
StatsiteAddr: "HpFwKB8R", StatsiteAddr: "HpFwKB8R",
PrometheusOpts: prometheus.PrometheusOpts{
Expiration: 15 * time.Second,
},
}, },
TLSCipherSuites: []uint16{tls.TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA, tls.TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256}, TLSCipherSuites: []uint16{tls.TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA, tls.TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256},
TLSMinVersion: "pAOWafkR", TLSMinVersion: "pAOWafkR",
@ -7814,9 +7817,15 @@ func TestSanitize(t *testing.T) {
"DogstatsdTags": [], "DogstatsdTags": [],
"FilterDefault": false, "FilterDefault": false,
"MetricsPrefix": "", "MetricsPrefix": "",
"PrometheusRetentionTime": "0s",
"StatsdAddr": "", "StatsdAddr": "",
"StatsiteAddr": "" "StatsiteAddr": "",
"PrometheusOpts": {
"Expiration": "0s",
"Registerer": null,
"GaugeDefinitions": [],
"CounterDefinitions": [],
"SummaryDefinitions": []
}
}, },
"TranslateWANAddrs": false, "TranslateWANAddrs": false,
"TxnMaxReqLen": 5678000000000000, "TxnMaxReqLen": 5678000000000000,

View File

@ -6,7 +6,8 @@ import (
"sync" "sync"
"time" "time"
metrics "github.com/armon/go-metrics" "github.com/armon/go-metrics"
"github.com/armon/go-metrics/prometheus"
"github.com/hashicorp/consul/acl" "github.com/hashicorp/consul/acl"
"github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/consul/agent/structs"
"github.com/hashicorp/consul/logging" "github.com/hashicorp/consul/logging"
@ -15,6 +16,32 @@ import (
"golang.org/x/time/rate" "golang.org/x/time/rate"
) )
// ACLCounters lists the Prometheus counter definitions for ACL token cache
// hits and misses.
var ACLCounters = []prometheus.CounterDefinition{
	{Name: []string{"acl", "token", "cache_hit"}, Help: ""},
	{Name: []string{"acl", "token", "cache_miss"}, Help: ""},
}
// ACLSummaries lists the Prometheus summary definitions for the ACL token
// resolution metrics.
var ACLSummaries = []prometheus.SummaryDefinition{
	{Name: []string{"acl", "resolveTokenLegacy"}, Help: ""},
	{Name: []string{"acl", "ResolveToken"}, Help: ""},
	{Name: []string{"acl", "ResolveTokenToIdentity"}, Help: ""},
}
// These must be kept in sync with the constants in command/agent/acl.go. // These must be kept in sync with the constants in command/agent/acl.go.
const ( const (
// anonymousToken is the token ID we re-write to if there is no token ID // anonymousToken is the token ID we re-write to if there is no token ID

View File

@ -11,7 +11,8 @@ import (
"regexp" "regexp"
"time" "time"
metrics "github.com/armon/go-metrics" "github.com/armon/go-metrics"
"github.com/armon/go-metrics/prometheus"
"github.com/hashicorp/consul/acl" "github.com/hashicorp/consul/acl"
"github.com/hashicorp/consul/agent/consul/authmethod" "github.com/hashicorp/consul/agent/consul/authmethod"
"github.com/hashicorp/consul/agent/consul/state" "github.com/hashicorp/consul/agent/consul/state"
@ -30,6 +31,73 @@ const (
aclBootstrapReset = "acl-bootstrap-reset" aclBootstrapReset = "acl-bootstrap-reset"
) )
// ACLEndpointSummaries lists the Prometheus summary definitions for the ACL
// endpoint metrics (tokens, policies, roles, binding rules, auth methods,
// login and logout). Each metric name appears exactly once; a name only needs
// one definition no matter how many call sites emit it.
var ACLEndpointSummaries = []prometheus.SummaryDefinition{
	{
		Name: []string{"acl", "token", "clone"},
		Help: "",
	},
	{
		Name: []string{"acl", "token", "upsert"},
		Help: "",
	},
	{
		Name: []string{"acl", "token", "delete"},
		Help: "",
	},
	{
		Name: []string{"acl", "policy", "upsert"},
		Help: "",
	},
	{
		Name: []string{"acl", "policy", "delete"},
		Help: "",
	},
	{
		Name: []string{"acl", "role", "upsert"},
		Help: "",
	},
	{
		Name: []string{"acl", "role", "delete"},
		Help: "",
	},
	{
		Name: []string{"acl", "bindingrule", "upsert"},
		Help: "",
	},
	{
		Name: []string{"acl", "bindingrule", "delete"},
		Help: "",
	},
	{
		Name: []string{"acl", "authmethod", "upsert"},
		Help: "",
	},
	{
		Name: []string{"acl", "authmethod", "delete"},
		Help: "",
	},
	{
		Name: []string{"acl", "login"},
		Help: "",
	},
	{
		Name: []string{"acl", "logout"},
		Help: "",
	},
}
// Regex for matching // Regex for matching
var ( var (
validPolicyName = regexp.MustCompile(`^[A-Za-z0-9\-_]{1,128}$`) validPolicyName = regexp.MustCompile(`^[A-Za-z0-9\-_]{1,128}$`)

View File

@ -5,6 +5,7 @@ import (
"time" "time"
"github.com/armon/go-metrics" "github.com/armon/go-metrics"
"github.com/armon/go-metrics/prometheus"
"github.com/hashicorp/consul/acl" "github.com/hashicorp/consul/acl"
"github.com/hashicorp/consul/agent/consul/state" "github.com/hashicorp/consul/agent/consul/state"
"github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/consul/agent/structs"
@ -12,6 +13,13 @@ import (
"github.com/hashicorp/go-memdb" "github.com/hashicorp/go-memdb"
) )
// ACLEndpointLegacySummaries lists the Prometheus summary definition for the
// legacy ACL apply timer.
var ACLEndpointLegacySummaries = []prometheus.SummaryDefinition{
	{Name: []string{"acl", "apply"}, Help: "Measures the time it takes to complete an update to the ACL store."},
}
// Bootstrap is used to perform a one-time ACL bootstrap operation on // Bootstrap is used to perform a one-time ACL bootstrap operation on
// a cluster to get the first management token. // a cluster to get the first management token.
func (a *ACL) Bootstrap(args *structs.DCSpecificRequest, reply *structs.ACL) error { func (a *ACL) Bootstrap(args *structs.DCSpecificRequest, reply *structs.ACL) error {

View File

@ -5,6 +5,7 @@ import (
"fmt" "fmt"
"github.com/armon/go-metrics" "github.com/armon/go-metrics"
"github.com/armon/go-metrics/prometheus"
"github.com/hashicorp/consul/agent/metadata" "github.com/hashicorp/consul/agent/metadata"
"github.com/hashicorp/consul/types" "github.com/hashicorp/consul/types"
"github.com/hashicorp/raft" "github.com/hashicorp/raft"
@ -12,6 +13,17 @@ import (
"github.com/hashicorp/serf/serf" "github.com/hashicorp/serf/serf"
) )
// AutopilotGauges lists the Prometheus gauge definitions for the autopilot
// failure-tolerance and health metrics.
var AutopilotGauges = []prometheus.GaugeDefinition{
	{Name: []string{"autopilot", "failure_tolerance"}, Help: "Tracks the number of voting servers that the cluster can lose while continuing to function."},
	{Name: []string{"autopilot", "healthy"}, Help: "Tracks the overall health of the local server cluster. 1 if all servers are healthy, 0 if one or more are unhealthy."},
}
// AutopilotDelegate is a Consul delegate for autopilot operations. // AutopilotDelegate is a Consul delegate for autopilot operations.
type AutopilotDelegate struct { type AutopilotDelegate struct {
server *Server server *Server

View File

@ -6,6 +6,7 @@ import (
"time" "time"
"github.com/armon/go-metrics" "github.com/armon/go-metrics"
"github.com/armon/go-metrics/prometheus"
"github.com/hashicorp/consul/acl" "github.com/hashicorp/consul/acl"
"github.com/hashicorp/consul/agent/consul/state" "github.com/hashicorp/consul/agent/consul/state"
"github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/consul/agent/structs"
@ -17,6 +18,52 @@ import (
"github.com/hashicorp/go-uuid" "github.com/hashicorp/go-uuid"
) )
// CatalogCounters lists the Prometheus counter definitions for catalog
// queries. Every "service" counter is paired with a "connect" counter of the
// same suffix.
var CatalogCounters = []prometheus.CounterDefinition{
	// Plain queries.
	{Name: []string{"catalog", "service", "query"}, Help: "Increments for each catalog query for the given service."},
	{Name: []string{"catalog", "connect", "query"}, Help: ""},

	// Queries filtered by a single tag.
	{Name: []string{"catalog", "service", "query-tag"}, Help: "Increments for each catalog query for the given service with the given tag."},
	{Name: []string{"catalog", "connect", "query-tag"}, Help: ""},

	// Queries filtered by multiple tags.
	{Name: []string{"catalog", "service", "query-tags"}, Help: "Increments for each catalog query for the given service with the given tags."},
	{Name: []string{"catalog", "connect", "query-tags"}, Help: ""},

	// Queries that found nothing.
	{Name: []string{"catalog", "service", "not-found"}, Help: "Increments for each catalog query where the given service could not be found."},
	{Name: []string{"catalog", "connect", "not-found"}, Help: ""},
}
// CatalogSummaries lists the Prometheus summary definitions for the catalog
// register and deregister timers.
var CatalogSummaries = []prometheus.SummaryDefinition{
	{Name: []string{"catalog", "deregister"}, Help: "Measures the time it takes to complete a catalog deregister operation."},
	{Name: []string{"catalog", "register"}, Help: "Measures the time it takes to complete a catalog register operation."},
}
// Catalog endpoint is used to manipulate the service catalog // Catalog endpoint is used to manipulate the service catalog
type Catalog struct { type Catalog struct {
srv *Server srv *Server

View File

@ -9,6 +9,7 @@ import (
"time" "time"
"github.com/armon/go-metrics" "github.com/armon/go-metrics"
"github.com/armon/go-metrics/prometheus"
"github.com/hashicorp/consul/agent/pool" "github.com/hashicorp/consul/agent/pool"
"github.com/hashicorp/consul/agent/router" "github.com/hashicorp/consul/agent/router"
"github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/consul/agent/structs"
@ -21,6 +22,21 @@ import (
"golang.org/x/time/rate" "golang.org/x/time/rate"
) )
// ClientCounters lists the Prometheus counter definitions for RPC requests
// made by an agent in client mode: total, rate limited, and failed.
var ClientCounters = []prometheus.CounterDefinition{
	{
		Name: []string{"client", "rpc"},
		Help: "Increments whenever a Consul agent in client mode makes an RPC request to a Consul server.",
	},
	{
		Name: []string{"client", "rpc", "exceeded"},
		Help: "Increments whenever a Consul agent in client mode makes an RPC request to a Consul server that gets rate limited by that agent's limits configuration.",
	},
	{
		Name: []string{"client", "rpc", "failed"},
		Help: "Increments whenever a Consul agent in client mode makes an RPC request to a Consul server and fails.",
	},
}
const ( const (
// serfEventBacklog is the maximum number of unprocessed Serf Events // serfEventBacklog is the maximum number of unprocessed Serf Events
// that will be held in queue before new serf events block. A // that will be held in queue before new serf events block. A

View File

@ -4,6 +4,8 @@ import (
"fmt" "fmt"
"time" "time"
"github.com/armon/go-metrics/prometheus"
metrics "github.com/armon/go-metrics" metrics "github.com/armon/go-metrics"
"github.com/hashicorp/consul/acl" "github.com/hashicorp/consul/acl"
"github.com/hashicorp/consul/agent/consul/state" "github.com/hashicorp/consul/agent/consul/state"
@ -12,6 +14,33 @@ import (
"github.com/mitchellh/copystructure" "github.com/mitchellh/copystructure"
) )
// ConfigSummaries lists the Prometheus summary definitions for the
// config_entry metrics.
var ConfigSummaries = []prometheus.SummaryDefinition{
	{Name: []string{"config_entry", "apply"}, Help: ""},
	{Name: []string{"config_entry", "get"}, Help: ""},
	{Name: []string{"config_entry", "list"}, Help: ""},
	{Name: []string{"config_entry", "listAll"}, Help: ""},
	{Name: []string{"config_entry", "delete"}, Help: ""},
	{Name: []string{"config_entry", "resolve_service_config"}, Help: ""},
}
// The ConfigEntry endpoint is used to query centralized config information // The ConfigEntry endpoint is used to query centralized config information
type ConfigEntry struct { type ConfigEntry struct {
srv *Server srv *Server

View File

@ -5,13 +5,33 @@ import (
"fmt" "fmt"
"time" "time"
metrics "github.com/armon/go-metrics" "github.com/armon/go-metrics"
"github.com/armon/go-metrics/prometheus"
"github.com/hashicorp/consul/acl" "github.com/hashicorp/consul/acl"
"github.com/hashicorp/consul/agent/consul/state" "github.com/hashicorp/consul/agent/consul/state"
"github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/consul/agent/structs"
memdb "github.com/hashicorp/go-memdb" memdb "github.com/hashicorp/go-memdb"
) )
// FederationStateSummaries lists the Prometheus summary definitions for the
// federation_state metrics.
var FederationStateSummaries = []prometheus.SummaryDefinition{
	{Name: []string{"federation_state", "apply"}, Help: ""},
	{Name: []string{"federation_state", "get"}, Help: ""},
	{Name: []string{"federation_state", "list"}, Help: ""},
	{Name: []string{"federation_state", "list_mesh_gateways"}, Help: ""},
}
var ( var (
errFederationStatesNotEnabled = errors.New("Federation states are currently disabled until all servers in the datacenter support the feature") errFederationStatesNotEnabled = errors.New("Federation states are currently disabled until all servers in the datacenter support the feature")
) )

View File

@ -4,11 +4,102 @@ import (
"fmt" "fmt"
"time" "time"
metrics "github.com/armon/go-metrics" "github.com/armon/go-metrics"
"github.com/armon/go-metrics/prometheus"
"github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/consul/agent/structs"
"github.com/hashicorp/consul/api" "github.com/hashicorp/consul/api"
) )
// CommandsSummaries lists the Prometheus summary definitions for the FSM
// command metrics. Each metric name appears exactly once.
//
// NOTE(review): "intention" has both a "consul"-prefixed and an unprefixed
// entry, while "ca" has only the prefixed one. The {"fsm", "ca", "leaf"} entry
// that was previously listed twice may have been meant as {"fsm", "ca"};
// verify against the call sites before adding it.
var CommandsSummaries = []prometheus.SummaryDefinition{
	{
		Name: []string{"fsm", "register"},
		Help: "Measures the time it takes to apply a catalog register operation to the FSM.",
	},
	{
		Name: []string{"fsm", "deregister"},
		Help: "Measures the time it takes to apply a catalog deregister operation to the FSM.",
	},
	{
		Name: []string{"fsm", "kvs"},
		Help: "Measures the time it takes to apply the given KV operation to the FSM.",
	},
	{
		Name: []string{"fsm", "session"},
		Help: "Measures the time it takes to apply the given session operation to the FSM.",
	},
	{
		Name: []string{"fsm", "acl"},
		Help: "Measures the time it takes to apply the given ACL operation to the FSM.",
	},
	{
		Name: []string{"fsm", "tombstone"},
		Help: "Measures the time it takes to apply the given tombstone operation to the FSM.",
	},
	{
		Name: []string{"fsm", "coordinate", "batch-update"},
		Help: "Measures the time it takes to apply the given batch coordinate update to the FSM.",
	},
	{
		Name: []string{"fsm", "prepared-query"},
		Help: "Measures the time it takes to apply the given prepared query update operation to the FSM.",
	},
	{
		Name: []string{"fsm", "txn"},
		Help: "Measures the time it takes to apply the given transaction update to the FSM.",
	},
	{
		Name: []string{"fsm", "autopilot"},
		Help: "Measures the time it takes to apply the given autopilot update to the FSM.",
	},
	{
		Name: []string{"consul", "fsm", "intention"},
		Help: "",
	},
	{
		Name: []string{"fsm", "intention"},
		Help: "",
	},
	{
		Name: []string{"consul", "fsm", "ca"},
		Help: "",
	},
	{
		Name: []string{"fsm", "ca", "leaf"},
		Help: "",
	},
	{
		Name: []string{"fsm", "acl", "token"},
		Help: "",
	},
	{
		Name: []string{"fsm", "acl", "policy"},
		Help: "",
	},
	{
		Name: []string{"fsm", "acl", "bindingrule"},
		Help: "",
	},
	{
		Name: []string{"fsm", "acl", "authmethod"},
		Help: "",
	},
	{
		Name: []string{"fsm", "system_metadata"},
		Help: "",
	},
	// TODO(kit): We generate the config-entry fsm summaries by reading off of the request. It is
	// possible to statically declare these when we know all of the names, but I didn't get to it
	// in this patch. Config-entries are known though and we should add these in the future.
	// {
	// 	Name: []string{"fsm", "config_entry", req.Entry.GetKind()},
	// 	Help: "",
	// },
}
func init() { func init() {
registerCommand(structs.RegisterRequestType, (*FSM).applyRegister) registerCommand(structs.RegisterRequestType, (*FSM).applyRegister)
registerCommand(structs.DeregisterRequestType, (*FSM).applyDeregister) registerCommand(structs.DeregisterRequestType, (*FSM).applyDeregister)

View File

@ -5,6 +5,7 @@ import (
"time" "time"
"github.com/armon/go-metrics" "github.com/armon/go-metrics"
"github.com/armon/go-metrics/prometheus"
"github.com/hashicorp/consul/agent/consul/state" "github.com/hashicorp/consul/agent/consul/state"
"github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/consul/agent/structs"
"github.com/hashicorp/go-msgpack/codec" "github.com/hashicorp/go-msgpack/codec"
@ -12,6 +13,13 @@ import (
"github.com/hashicorp/raft" "github.com/hashicorp/raft"
) )
// SnapshotSummaries lists the Prometheus summary definition for the FSM
// persist timer.
var SnapshotSummaries = []prometheus.SummaryDefinition{
	{Name: []string{"fsm", "persist"}, Help: "Measures the time it takes to persist the FSM to a raft snapshot."},
}
// snapshot is used to provide a snapshot of the current // snapshot is used to provide a snapshot of the current
// state in a way that can be accessed concurrently with operations // state in a way that can be accessed concurrently with operations
// that may modify the live state. // that may modify the live state.

View File

@ -6,6 +6,7 @@ import (
"time" "time"
"github.com/armon/go-metrics" "github.com/armon/go-metrics"
"github.com/armon/go-metrics/prometheus"
"github.com/hashicorp/consul/acl" "github.com/hashicorp/consul/acl"
"github.com/hashicorp/consul/agent/connect" "github.com/hashicorp/consul/agent/connect"
"github.com/hashicorp/consul/agent/consul/state" "github.com/hashicorp/consul/agent/consul/state"
@ -16,6 +17,17 @@ import (
"github.com/hashicorp/go-memdb" "github.com/hashicorp/go-memdb"
) )
// IntentionSummaries lists the Prometheus summary definitions for the
// intention apply timer, under both its "consul"-prefixed and unprefixed
// names.
var IntentionSummaries = []prometheus.SummaryDefinition{
	{Name: []string{"consul", "intention", "apply"}, Help: ""},
	{Name: []string{"intention", "apply"}, Help: ""},
}
var ( var (
// ErrIntentionNotFound is returned if the intention lookup failed. // ErrIntentionNotFound is returned if the intention lookup failed.
ErrIntentionNotFound = errors.New("Intention not found") ErrIntentionNotFound = errors.New("Intention not found")

View File

@ -6,6 +6,7 @@ import (
"time" "time"
"github.com/armon/go-metrics" "github.com/armon/go-metrics"
"github.com/armon/go-metrics/prometheus"
"github.com/hashicorp/consul/acl" "github.com/hashicorp/consul/acl"
"github.com/hashicorp/consul/agent/consul/state" "github.com/hashicorp/consul/agent/consul/state"
"github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/consul/agent/structs"
@ -14,6 +15,13 @@ import (
"github.com/hashicorp/go-memdb" "github.com/hashicorp/go-memdb"
) )
// KVSummaries lists the Prometheus summary definition for the KV store apply
// timer.
var KVSummaries = []prometheus.SummaryDefinition{
	{Name: []string{"kvs", "apply"}, Help: "Measures the time it takes to complete an update to the KV store."},
}
// KVS endpoint is used to manipulate the Key-Value store // KVS endpoint is used to manipulate the Key-Value store
type KVS struct { type KVS struct {
srv *Server srv *Server

View File

@ -11,6 +11,7 @@ import (
"time" "time"
"github.com/armon/go-metrics" "github.com/armon/go-metrics"
"github.com/armon/go-metrics/prometheus"
"github.com/hashicorp/consul/acl" "github.com/hashicorp/consul/acl"
"github.com/hashicorp/consul/agent/metadata" "github.com/hashicorp/consul/agent/metadata"
"github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/consul/agent/structs"
@ -27,6 +28,21 @@ import (
"golang.org/x/time/rate" "golang.org/x/time/rate"
) )
// LeaderSummaries lists the Prometheus summary definitions for the leader
// timers: the raft barrier, per-member reconciliation, and tombstone reaping.
var LeaderSummaries = []prometheus.SummaryDefinition{
	{Name: []string{"leader", "barrier"}, Help: "Measures the time spent waiting for the raft barrier upon gaining leadership."},
	{Name: []string{"leader", "reconcileMember"}, Help: "Measures the time spent updating the raft store for a single serf member's information."},
	{Name: []string{"leader", "reapTombstones"}, Help: "Measures the time spent clearing tombstones."},
}
const ( const (
newLeaderEvent = "consul:new-leader" newLeaderEvent = "consul:new-leader"
barrierWriteTimeout = 2 * time.Minute barrierWriteTimeout = 2 * time.Minute

View File

@ -6,6 +6,7 @@ import (
"time" "time"
"github.com/armon/go-metrics" "github.com/armon/go-metrics"
"github.com/armon/go-metrics/prometheus"
"github.com/hashicorp/consul/acl" "github.com/hashicorp/consul/acl"
"github.com/hashicorp/consul/agent/consul/state" "github.com/hashicorp/consul/agent/consul/state"
"github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/consul/agent/structs"
@ -15,6 +16,25 @@ import (
"github.com/hashicorp/go-uuid" "github.com/hashicorp/go-uuid"
) )
// PreparedQuerySummaries lists the Prometheus summary definitions for the
// prepared-query timers: apply, explain, execute, and remote execute.
var PreparedQuerySummaries = []prometheus.SummaryDefinition{
	{Name: []string{"prepared-query", "apply"}, Help: "Measures the time it takes to apply a prepared query update."},
	{Name: []string{"prepared-query", "explain"}, Help: "Measures the time it takes to process a prepared query explain request."},
	{Name: []string{"prepared-query", "execute"}, Help: "Measures the time it takes to process a prepared query execute request."},
	{Name: []string{"prepared-query", "execute_remote"}, Help: "Measures the time it takes to process a prepared query execute request that was forwarded to another datacenter."},
}
// PreparedQuery manages the prepared query endpoint. // PreparedQuery manages the prepared query endpoint.
type PreparedQuery struct { type PreparedQuery struct {
srv *Server srv *Server

View File

@ -13,6 +13,7 @@ import (
"time" "time"
"github.com/armon/go-metrics" "github.com/armon/go-metrics"
"github.com/armon/go-metrics/prometheus"
"github.com/hashicorp/consul/acl" "github.com/hashicorp/consul/acl"
"github.com/hashicorp/consul/agent/consul/state" "github.com/hashicorp/consul/agent/consul/state"
"github.com/hashicorp/consul/agent/consul/wanfed" "github.com/hashicorp/consul/agent/consul/wanfed"
@ -31,6 +32,47 @@ import (
"github.com/hashicorp/yamux" "github.com/hashicorp/yamux"
) )
// RPCCounters lists the Prometheus counter definitions for the server-side
// RPC metrics.
var RPCCounters = []prometheus.CounterDefinition{
	// Connection-level counters.
	{Name: []string{"rpc", "accept_conn"}, Help: "Increments when a server accepts an RPC connection."},
	{Name: []string{"rpc", "raft_handoff"}, Help: "Increments when a server accepts a Raft-related RPC connection."},

	// Request-level counters.
	{Name: []string{"rpc", "request_error"}, Help: "Increments when a server returns an error from an RPC request."},
	{Name: []string{"rpc", "request"}, Help: "Increments when a server receives a Consul-related RPC request."},
	{Name: []string{"rpc", "cross-dc"}, Help: "Increments when a server sends a (potentially blocking) cross datacenter RPC query."},
	{Name: []string{"rpc", "query"}, Help: "Increments when a server receives a new blocking RPC request, indicating the rate of new blocking query calls."},
}
// RPCGauges declares the Prometheus gauge definition (name and help text)
// for the "rpc.queries_blocking" metric.
var RPCGauges = []prometheus.GaugeDefinition{
{
Name: []string{"rpc", "queries_blocking"},
Help: "Shows the current number of in-flight blocking queries the server is handling.",
},
}
// RPCSummaries declares the Prometheus summary definition (name and help
// text) for the "rpc.consistentRead" timing metric.
var RPCSummaries = []prometheus.SummaryDefinition{
{
Name: []string{"rpc", "consistentRead"},
Help: "Measures the time spent confirming that a consistent read can be performed.",
},
}
const ( const (
// jitterFraction is a the limit to the amount of jitter we apply // jitterFraction is a the limit to the amount of jitter we apply
// to a user specified MaxQueryTime. We divide the specified time by // to a user specified MaxQueryTime. We divide the specified time by

View File

@ -7,10 +7,18 @@ import (
"time" "time"
"github.com/armon/go-metrics" "github.com/armon/go-metrics"
"github.com/armon/go-metrics/prometheus"
"github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/consul/agent/structs"
"github.com/hashicorp/serf/serf" "github.com/hashicorp/serf/serf"
) )
// SegmentOSSSummaries declares the Prometheus summary definition (name and
// help text) for the "leader.reconcile" timing metric.
var SegmentOSSSummaries = []prometheus.SummaryDefinition{
{
Name: []string{"leader", "reconcile"},
Help: "Measures the time spent updating the raft store from the serf member information.",
},
}
// LANMembersAllSegments returns members from all segments. // LANMembersAllSegments returns members from all segments.
func (s *Server) LANMembersAllSegments() ([]serf.Member, error) { func (s *Server) LANMembersAllSegments() ([]serf.Member, error) {
return s.LANMembers(), nil return s.LANMembers(), nil

View File

@ -17,7 +17,7 @@ import (
"sync/atomic" "sync/atomic"
"time" "time"
metrics "github.com/armon/go-metrics" "github.com/armon/go-metrics"
connlimit "github.com/hashicorp/go-connlimit" connlimit "github.com/hashicorp/go-connlimit"
"github.com/hashicorp/go-hclog" "github.com/hashicorp/go-hclog"
"github.com/hashicorp/go-memdb" "github.com/hashicorp/go-memdb"
@ -50,6 +50,8 @@ import (
"github.com/hashicorp/consul/types" "github.com/hashicorp/consul/types"
) )
// NOTE The "consul.client.rpc" and "consul.client.rpc.exceeded" counters are defined in consul/client.go
// These are the protocol versions that Consul can _understand_. These are // These are the protocol versions that Consul can _understand_. These are
// Consul-level protocol versions, that are used to configure the Serf // Consul-level protocol versions, that are used to configure the Serf
// protocol versions. // protocol versions.

View File

@ -5,6 +5,7 @@ import (
"time" "time"
"github.com/armon/go-metrics" "github.com/armon/go-metrics"
"github.com/armon/go-metrics/prometheus"
"github.com/hashicorp/consul/acl" "github.com/hashicorp/consul/acl"
"github.com/hashicorp/consul/agent/consul/state" "github.com/hashicorp/consul/agent/consul/state"
"github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/consul/agent/structs"
@ -13,6 +14,17 @@ import (
"github.com/hashicorp/go-uuid" "github.com/hashicorp/go-uuid"
) )
// SessionEndpointSummaries declares the Prometheus summary definitions
// (name and help text) for the "session.apply" and "session.renew" timing
// metrics.
var SessionEndpointSummaries = []prometheus.SummaryDefinition{
{
Name: []string{"session", "apply"},
Help: "Measures the time spent applying a session update.",
},
{
Name: []string{"session", "renew"},
Help: "Measures the time spent renewing a session.",
},
}
// Session endpoint is used to manipulate sessions for KV // Session endpoint is used to manipulate sessions for KV
type Session struct { type Session struct {
srv *Server srv *Server

View File

@ -5,9 +5,32 @@ import (
"time" "time"
"github.com/armon/go-metrics" "github.com/armon/go-metrics"
"github.com/armon/go-metrics/prometheus"
"github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/consul/agent/structs"
) )
// SessionGauges declares Prometheus gauge definitions (name and help text)
// for the "session_ttl.active" metric and for two raft index gauges.
//
// NOTE(review): the "raft.applied_index" and "raft.last_index" entries carry
// an empty Help string; consider filling these in.
var SessionGauges = []prometheus.GaugeDefinition{
{
Name: []string{"session_ttl", "active"},
Help: "Tracks the active number of sessions being tracked.",
},
{
Name: []string{"raft", "applied_index"},
Help: "",
},
{
Name: []string{"raft", "last_index"},
Help: "",
},
}
// SessionSummaries declares the Prometheus summary definition (name and help
// text) for the "session_ttl.invalidate" timing metric.
var SessionSummaries = []prometheus.SummaryDefinition{
{
Name: []string{"session_ttl", "invalidate"},
Help: "Measures the time spent invalidating an expired session.",
},
}
const ( const (
// maxInvalidateAttempts limits how many invalidate attempts are made // maxInvalidateAttempts limits how many invalidate attempts are made
maxInvalidateAttempts = 6 maxInvalidateAttempts = 6

View File

@ -5,12 +5,24 @@ import (
"time" "time"
"github.com/armon/go-metrics" "github.com/armon/go-metrics"
"github.com/armon/go-metrics/prometheus"
"github.com/hashicorp/consul/acl" "github.com/hashicorp/consul/acl"
"github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/consul/agent/structs"
"github.com/hashicorp/consul/api" "github.com/hashicorp/consul/api"
"github.com/hashicorp/go-hclog" "github.com/hashicorp/go-hclog"
) )
// TxnSummaries declares the Prometheus summary definitions (name and help
// text) for the "txn.apply" and "txn.read" timing metrics.
var TxnSummaries = []prometheus.SummaryDefinition{
{
Name: []string{"txn", "apply"},
Help: "Measures the time spent applying a transaction operation.",
},
{
Name: []string{"txn", "read"},
Help: "Measures the time spent returning a read transaction.",
},
}
// Txn endpoint is used to perform multi-object atomic transactions. // Txn endpoint is used to perform multi-object atomic transactions.
type Txn struct { type Txn struct {
srv *Server srv *Server

View File

@ -5,12 +5,29 @@ import (
"errors" "errors"
"time" "time"
"github.com/armon/go-metrics/prometheus"
"github.com/armon/go-metrics" "github.com/armon/go-metrics"
"github.com/hashicorp/consul/agent/consul/state" "github.com/hashicorp/consul/agent/consul/state"
"github.com/hashicorp/consul/logging" "github.com/hashicorp/consul/logging"
"github.com/hashicorp/go-hclog" "github.com/hashicorp/go-hclog"
) )
// Gauges declares the Prometheus gauge definitions (name and help text) for
// the usage metrics: the number of nodes, unique services, and service
// instances registered with Consul. The names already carry a leading
// "consul" element.
var Gauges = []prometheus.GaugeDefinition{
	{
		Name: []string{"consul", "state", "nodes"},
		Help: "Measures the current number of nodes registered with Consul. It is only emitted by Consul servers. Added in v1.9.0.",
	},
	{
		Name: []string{"consul", "state", "services"},
		Help: "Measures the current number of unique services registered with Consul, based on service name. It is only emitted by Consul servers. Added in v1.9.0.",
	},
	{
		Name: []string{"consul", "state", "service_instances"},
		// This entry counts instances, not service names; its help text must
		// not repeat the description of "services" above.
		Help: "Measures the current number of unique service instances registered with Consul. It is only emitted by Consul servers. Added in v1.9.0.",
	},
}
// Config holds the settings for various parameters for the // Config holds the settings for various parameters for the
// UsageMetricsReporter // UsageMetricsReporter
type Config struct { type Config struct {

View File

@ -10,6 +10,8 @@ import (
"sync/atomic" "sync/atomic"
"time" "time"
"github.com/armon/go-metrics/prometheus"
metrics "github.com/armon/go-metrics" metrics "github.com/armon/go-metrics"
radix "github.com/armon/go-radix" radix "github.com/armon/go-radix"
"github.com/coredns/coredns/plugin/pkg/dnsutil" "github.com/coredns/coredns/plugin/pkg/dnsutil"
@ -26,6 +28,24 @@ import (
"github.com/hashicorp/consul/logging" "github.com/hashicorp/consul/logging"
) )
// DNSCounters declares the Prometheus counter definition (name and help
// text) for the "dns.stale_queries" metric.
var DNSCounters = []prometheus.CounterDefinition{
{
Name: []string{"dns", "stale_queries"},
Help: "Increments when an agent serves a query within the allowed stale threshold.",
},
}
// DNSSummaries declares the Prometheus summary definitions (name and help
// text) for the "dns.ptr_query" and "dns.domain_query" timing metrics.
var DNSSummaries = []prometheus.SummaryDefinition{
{
Name: []string{"dns", "ptr_query"},
Help: "Measures the time spent handling a reverse DNS query for the given node.",
},
{
Name: []string{"dns", "domain_query"},
Help: "Measures the time spent handling a domain query for the given node.",
},
}
const ( const (
// UDP can fit ~25 A records in a 512B response, and ~14 AAAA // UDP can fit ~25 A records in a 512B response, and ~14 AAAA
// records. Limit further to prevent unintentional configuration // records. Limit further to prevent unintentional configuration

View File

@ -5,10 +5,48 @@ import (
"sync/atomic" "sync/atomic"
"github.com/armon/go-metrics" "github.com/armon/go-metrics"
"github.com/armon/go-metrics/prometheus"
"google.golang.org/grpc" "google.golang.org/grpc"
"google.golang.org/grpc/stats" "google.golang.org/grpc/stats"
) )
// StatsGauges declares the Prometheus gauge definitions (name and help text)
// for the gRPC connection and stream gauges.
var StatsGauges = []prometheus.GaugeDefinition{
{
Name: []string{"grpc", "server", "connections"},
Help: "Measures the number of active gRPC connections open on the server.",
},
{
Name: []string{"grpc", "client", "connections"},
Help: "Measures the number of active gRPC connections open from the client agent to any Consul servers.",
},
{
Name: []string{"grpc", "server", "streams"},
Help: "Measures the number of active gRPC streams handled by the server.",
},
}
// StatsCounters declares the Prometheus counter definitions (name and help
// text) for the gRPC request, connection, and stream counters on both the
// client and server side.
var StatsCounters = []prometheus.CounterDefinition{
{
Name: []string{"grpc", "client", "request", "count"},
Help: "Counts the number of gRPC requests made by the client agent to a Consul server.",
},
{
Name: []string{"grpc", "server", "request", "count"},
Help: "Counts the number of gRPC requests received by the server.",
},
{
Name: []string{"grpc", "client", "connection", "count"},
Help: "Counts the number of new gRPC connections opened by the client agent to a Consul server.",
},
{
Name: []string{"grpc", "server", "connection", "count"},
Help: "Counts the number of new gRPC connections received by the server.",
},
{
Name: []string{"grpc", "server", "stream", "count"},
Help: "Counts the number of new gRPC streams received by the server.",
},
}
var defaultMetrics = metrics.Default var defaultMetrics = metrics.Default
// statsHandler is a grpc/stats.StatsHandler which emits connection and // statsHandler is a grpc/stats.StatsHandler which emits connection and

View File

@ -17,6 +17,7 @@ import (
"github.com/NYTimes/gziphandler" "github.com/NYTimes/gziphandler"
"github.com/armon/go-metrics" "github.com/armon/go-metrics"
"github.com/armon/go-metrics/prometheus"
"github.com/hashicorp/consul/acl" "github.com/hashicorp/consul/acl"
"github.com/hashicorp/consul/agent/cache" "github.com/hashicorp/consul/agent/cache"
"github.com/hashicorp/consul/agent/config" "github.com/hashicorp/consul/agent/config"
@ -31,6 +32,13 @@ import (
"github.com/pkg/errors" "github.com/pkg/errors"
) )
// HTTPSummaries declares the Prometheus summary definition (name and help
// text) for the "api.http" request timing metric.
var HTTPSummaries = []prometheus.SummaryDefinition{
{
Name: []string{"api", "http"},
Help: "Samples how long it takes to service the given HTTP request for the given verb and path.",
},
}
// MethodNotAllowedError should be returned by a handler when the HTTP method is not allowed. // MethodNotAllowedError should be returned by a handler when the HTTP method is not allowed.
type MethodNotAllowedError struct { type MethodNotAllowedError struct {
Method string Method string

View File

@ -9,8 +9,8 @@ import (
"sync/atomic" "sync/atomic"
"time" "time"
metrics "github.com/armon/go-metrics" "github.com/armon/go-metrics"
"github.com/armon/go-metrics/prometheus"
"github.com/hashicorp/consul/acl" "github.com/hashicorp/consul/acl"
"github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/consul/agent/structs"
"github.com/hashicorp/consul/agent/token" "github.com/hashicorp/consul/agent/token"
@ -20,6 +20,33 @@ import (
"github.com/hashicorp/go-hclog" "github.com/hashicorp/go-hclog"
) )
// StateCounters declares the Prometheus counter definitions (name and help
// text) for the "acl.blocked" counters, one per combination of
// service/check/node and registration/deregistration.
var StateCounters = []prometheus.CounterDefinition{
{
Name: []string{"acl", "blocked", "service", "registration"},
Help: "Increments whenever a registration fails for a service (blocked by an ACL)",
},
{
Name: []string{"acl", "blocked", "service", "deregistration"},
Help: "Increments whenever a deregistration fails for a service (blocked by an ACL)",
},
{
Name: []string{"acl", "blocked", "check", "registration"},
Help: "Increments whenever a registration fails for a check (blocked by an ACL)",
},
{
Name: []string{"acl", "blocked", "check", "deregistration"},
Help: "Increments whenever a deregistration fails for a check (blocked by an ACL)",
},
{
Name: []string{"acl", "blocked", "node", "registration"},
Help: "Increments whenever a registration fails for a node (blocked by an ACL)",
},
{
Name: []string{"acl", "blocked", "node", "deregistration"},
Help: "Increments whenever a deregistration fails for a node (blocked by an ACL)",
},
}
const fullSyncReadMaxStale = 2 * time.Second const fullSyncReadMaxStale = 2 * time.Second
// Config is the configuration for the State. // Config is the configuration for the State.

View File

@ -8,6 +8,12 @@ import (
"sync" "sync"
"time" "time"
"github.com/hashicorp/consul/agent/consul/fsm"
"github.com/armon/go-metrics/prometheus"
"github.com/hashicorp/consul/agent/consul/usagemetrics"
"github.com/hashicorp/consul/agent/local"
"github.com/hashicorp/go-hclog" "github.com/hashicorp/go-hclog"
"google.golang.org/grpc/grpclog" "google.golang.org/grpc/grpclog"
grpcresolver "google.golang.org/grpc/resolver" grpcresolver "google.golang.org/grpc/resolver"
@ -72,6 +78,10 @@ func NewBaseDeps(configLoader ConfigLoader, logOut io.Writer) (BaseDeps, error)
return d, fmt.Errorf("failed to setup node ID: %w", err) return d, fmt.Errorf("failed to setup node ID: %w", err)
} }
gauges, counters, summaries := getPrometheusDefs(cfg.Telemetry)
cfg.Telemetry.PrometheusOpts.GaugeDefinitions = gauges
cfg.Telemetry.PrometheusOpts.CounterDefinitions = counters
cfg.Telemetry.PrometheusOpts.SummaryDefinitions = summaries
d.MetricsHandler, err = lib.InitTelemetry(cfg.Telemetry) d.MetricsHandler, err = lib.InitTelemetry(cfg.Telemetry)
if err != nil { if err != nil {
return d, fmt.Errorf("failed to initialize telemetry: %w", err) return d, fmt.Errorf("failed to initialize telemetry: %w", err)
@ -177,3 +187,119 @@ func registerWithGRPC(b grpcresolver.Builder) {
defer registerLock.Unlock() defer registerLock.Unlock()
grpcresolver.Register(b) grpcresolver.Register(b)
} }
// getPrometheusDefs reaches into every slice of prometheus defs we've defined in each part of the agent, and
// flattens them into one slice of definitions per metric type for the Consul agent to pass to go-metrics.
//
// Every returned definition has cfg.MetricsPrefix prepended to its Name. The package-level definition
// slices are not modified: each definition is copied and given a freshly allocated Name.
//
// It returns the gauge, counter, and summary definitions, in that order.
//
// NOTE(kit): Do we actually want to create a set here so we can ensure definition names are unique?
func getPrometheusDefs(cfg lib.TelemetryConfig) ([]prometheus.GaugeDefinition, []prometheus.CounterDefinition, []prometheus.SummaryDefinition) {
	// Build slice of slices for all gauge definitions
	gauges := [][]prometheus.GaugeDefinition{
		cache.Gauges,
		consul.AutopilotGauges,
		consul.RPCGauges,
		consul.SessionGauges,
		grpc.StatsGauges,
		usagemetrics.Gauges,
	}

	// Flatten definitions, setting the configured prefix as each definition's namespace.
	var gaugeDefs []prometheus.GaugeDefinition
	for _, defs := range gauges {
		for _, def := range defs {
			// def is a copy of the slice element, so only the copy receives the prefixed name.
			def.Name = prefixMetricName(cfg.MetricsPrefix, def.Name)
			gaugeDefs = append(gaugeDefs, def)
		}
	}

	raftCounters := []prometheus.CounterDefinition{
		// TODO(kit): "raft..." metrics come from the raft lib and we should migrate these to a telemetry
		//  package within. In the mean time, we're going to define a few here because they're key to monitoring Consul.
		{
			Name: []string{"raft", "apply"},
			Help: "This counts the number of Raft transactions occurring over the interval.",
		},
		{
			Name: []string{"raft", "state", "candidate"},
			Help: "This increments whenever a Consul server starts an election.",
		},
		{
			Name: []string{"raft", "state", "leader"},
			Help: "This increments whenever a Consul server becomes a leader.",
		},
	}

	counters := [][]prometheus.CounterDefinition{
		CatalogCounters,
		cache.Counters,
		consul.ACLCounters,
		consul.CatalogCounters,
		consul.ClientCounters,
		consul.RPCCounters,
		grpc.StatsCounters,
		local.StateCounters,
		raftCounters,
	}

	// Flatten definitions, setting the configured prefix as each definition's namespace.
	var counterDefs []prometheus.CounterDefinition
	for _, defs := range counters {
		for _, def := range defs {
			def.Name = prefixMetricName(cfg.MetricsPrefix, def.Name)
			counterDefs = append(counterDefs, def)
		}
	}

	raftSummaries := []prometheus.SummaryDefinition{
		// TODO(kit): "raft..." metrics come from the raft lib and we should migrate these to a telemetry
		//  package within. In the mean time, we're going to define a few here because they're key to monitoring Consul.
		{
			Name: []string{"raft", "commitTime"},
			Help: "This measures the time it takes to commit a new entry to the Raft log on the leader.",
		},
		{
			Name: []string{"raft", "leader", "lastContact"},
			Help: "Measures the time since the leader was last able to contact the follower nodes when checking its leader lease.",
		},
	}

	summaries := [][]prometheus.SummaryDefinition{
		HTTPSummaries,
		consul.ACLSummaries,
		consul.ACLEndpointSummaries,
		consul.ACLEndpointLegacySummaries,
		consul.CatalogSummaries,
		consul.FederationStateSummaries,
		consul.IntentionSummaries,
		consul.KVSummaries,
		consul.LeaderSummaries,
		consul.PreparedQuerySummaries,
		consul.RPCSummaries,
		consul.SegmentOSSSummaries,
		consul.SessionSummaries,
		consul.SessionEndpointSummaries,
		consul.TxnSummaries,
		fsm.CommandsSummaries,
		fsm.SnapshotSummaries,
		raftSummaries,
	}

	// Flatten definitions, setting the configured prefix as each definition's namespace.
	var summaryDefs []prometheus.SummaryDefinition
	for _, defs := range summaries {
		for _, def := range defs {
			def.Name = prefixMetricName(cfg.MetricsPrefix, def.Name)
			summaryDefs = append(summaryDefs, def)
		}
	}

	return gaugeDefs, counterDefs, summaryDefs
}

// prefixMetricName returns a newly allocated metric name made of prefix followed by the elements of name.
// The input slice is never written to, so shared package-level names stay untouched.
// TODO(kit): Prepending the service to each definition should be handled by go-metrics
func prefixMetricName(prefix string, name []string) []string {
	prefixed := make([]string, 0, len(name)+1)
	prefixed = append(prefixed, prefix)
	return append(prefixed, name...)
}

View File

@ -54,6 +54,8 @@ func (p *Proxy) Serve() error {
// Initial setup // Initial setup
// Setup telemetry if configured // Setup telemetry if configured
// NOTE(kit): As far as I can tell, all of the metrics in the proxy are generated at runtime, so we
// don't have any static metrics we initialize at start.
_, err := lib.InitTelemetry(newCfg.Telemetry) _, err := lib.InitTelemetry(newCfg.Telemetry)
if err != nil { if err != nil {
p.logger.Error("proxy telemetry config error", "error", err) p.logger.Error("proxy telemetry config error", "error", err)

View File

@ -4,7 +4,7 @@ import (
"reflect" "reflect"
"time" "time"
metrics "github.com/armon/go-metrics" "github.com/armon/go-metrics"
"github.com/armon/go-metrics/circonus" "github.com/armon/go-metrics/circonus"
"github.com/armon/go-metrics/datadog" "github.com/armon/go-metrics/datadog"
"github.com/armon/go-metrics/prometheus" "github.com/armon/go-metrics/prometheus"
@ -154,14 +154,6 @@ type TelemetryConfig struct {
// hcl: telemetry { dogstatsd_tags = []string } // hcl: telemetry { dogstatsd_tags = []string }
DogstatsdTags []string `json:"dogstatsd_tags,omitempty" mapstructure:"dogstatsd_tags"` DogstatsdTags []string `json:"dogstatsd_tags,omitempty" mapstructure:"dogstatsd_tags"`
// PrometheusRetentionTime is the retention time for prometheus metrics if greater than 0.
// A value of 0 disable Prometheus support. Regarding Prometheus, it is considered a good
// practice to put large values here (such as a few days), and at least the interval between
// prometheus requests.
//
// hcl: telemetry { prometheus_retention_time = "duration" }
PrometheusRetentionTime time.Duration `json:"prometheus_retention_time,omitempty" mapstructure:"prometheus_retention_time"`
// FilterDefault is the default for whether to allow a metric that's not // FilterDefault is the default for whether to allow a metric that's not
// covered by the filter. // covered by the filter.
// //
@ -199,10 +191,18 @@ type TelemetryConfig struct {
// //
// hcl: telemetry { statsite_address = string } // hcl: telemetry { statsite_address = string }
StatsiteAddr string `json:"statsite_address,omitempty" mapstructure:"statsite_address"` StatsiteAddr string `json:"statsite_address,omitempty" mapstructure:"statsite_address"`
// PrometheusOpts provides configuration for the PrometheusSink. Currently the only configuration
// we acquire from hcl is the retention time. We also use definition slices that are set in agent setup
// before being passed to InitTelemetry.
//
// hcl: telemetry { prometheus_retention_time = "duration" }
PrometheusOpts prometheus.PrometheusOpts
} }
// MergeDefaults copies any non-zero field from defaults into the current // MergeDefaults copies any non-zero field from defaults into the current
// config. // config.
// TODO(kit): We no longer use this function and can probably delete it
func (c *TelemetryConfig) MergeDefaults(defaults *TelemetryConfig) { func (c *TelemetryConfig) MergeDefaults(defaults *TelemetryConfig) {
if defaults == nil { if defaults == nil {
return return
@ -221,6 +221,10 @@ func (c *TelemetryConfig) MergeDefaults(defaults *TelemetryConfig) {
// implementing this for the types we actually have for now. Test failure // implementing this for the types we actually have for now. Test failure
// should catch the case where we add new types later. // should catch the case where we add new types later.
switch f.Kind() { switch f.Kind() {
case reflect.Struct:
if f.Type() == reflect.TypeOf(prometheus.PrometheusOpts{}) {
continue
}
case reflect.Slice: case reflect.Slice:
if !f.IsNil() { if !f.IsNil() {
continue continue
@ -277,80 +281,12 @@ func dogstatdSink(cfg TelemetryConfig, hostname string) (metrics.MetricSink, err
} }
func prometheusSink(cfg TelemetryConfig, hostname string) (metrics.MetricSink, error) { func prometheusSink(cfg TelemetryConfig, hostname string) (metrics.MetricSink, error) {
if cfg.PrometheusRetentionTime.Nanoseconds() < 1 {
if cfg.PrometheusOpts.Expiration.Nanoseconds() < 1 {
return nil, nil return nil, nil
} }
// TODO(kit) define these in vars in the package/file they're used sink, err := prometheus.NewPrometheusSinkFrom(cfg.PrometheusOpts)
gaugeDefs := []prometheus.GaugeDefinition{
{
Name: []string{"consul", "autopilot", "healthy"},
Help: "This tracks the overall health of the local server cluster. 1 if all servers are healthy, 0 if one or more are unhealthy.",
},
}
// TODO(kit) define these in vars in the package/file they're used
counterDefs := []prometheus.CounterDefinition{
{
Name: []string{"consul", "raft", "apply"},
Help: "This counts the number of Raft transactions occurring over the interval.",
},
{
Name: []string{"consul", "raft", "state", "candidate"},
Help: "This increments whenever a Consul server starts an election.",
},
{
Name: []string{"consul", "raft", "state", "leader"},
Help: "This increments whenever a Consul server becomes a leader.",
},
{
Name: []string{"consul", "client", "api", "catalog_register"},
Help: "Increments whenever a Consul agent receives a catalog register request.",
},
{
Name: []string{"consul", "runtime", "total_gc_pause_ns"},
Help: "Number of nanoseconds consumed by stop-the-world garbage collection (GC) pauses since Consul started.",
},
{
Name: []string{"consul", "client", "rpc"},
Help: "Increments whenever a Consul agent in client mode makes an RPC request to a Consul server.",
},
{
Name: []string{"consul", "client", "rpc", "exceeded"},
Help: "Increments whenever a Consul agent in client mode makes an RPC request to a Consul server gets rate limited by that agent's limits configuration.",
},
{
Name: []string{"consul", "client", "rpc", "failed"},
Help: "Increments whenever a Consul agent in client mode makes an RPC request to a Consul server and fails.",
},
}
// TODO(kit) define these in vars in the package/file they're used
summaryDefs := []prometheus.SummaryDefinition{
{
Name: []string{"consul", "kvs", "apply"},
Help: "This measures the time it takes to complete an update to the KV store.",
},
{
Name: []string{"consul", "txn", "apply"},
Help: "This measures the time spent applying a transaction operation.",
},
{
Name: []string{"consul", "raft", "commitTime"},
Help: "This measures the time it takes to commit a new entry to the Raft log on the leader.",
},
{
Name: []string{"consul", "raft", "leader", "lastContact"},
Help: "Measures the time since the leader was last able to contact the follower nodes when checking its leader lease.",
},
}
prometheusOpts := prometheus.PrometheusOpts{
Expiration: cfg.PrometheusRetentionTime,
GaugeDefinitions: gaugeDefs,
CounterDefinitions: counterDefs,
SummaryDefinitions: summaryDefs,
}
sink, err := prometheus.NewPrometheusSinkFrom(prometheusOpts)
if err != nil { if err != nil {
return nil, err return nil, err
} }
@ -440,6 +376,9 @@ func InitTelemetry(cfg TelemetryConfig) (*metrics.InmemSink, error) {
if err := addSink(circonusSink); err != nil { if err := addSink(circonusSink); err != nil {
return nil, err return nil, err
} }
if err := addSink(circonusSink); err != nil {
return nil, err
}
if err := addSink(prometheusSink); err != nil { if err := addSink(prometheusSink); err != nil {
return nil, err return nil, err
} }

View File

@ -5,11 +5,14 @@ import (
"testing" "testing"
"time" "time"
"github.com/armon/go-metrics/prometheus"
"github.com/stretchr/testify/require" "github.com/stretchr/testify/require"
) )
func makeFullTelemetryConfig(t *testing.T) TelemetryConfig { func makeFullTelemetryConfig(t *testing.T) TelemetryConfig {
var ( var (
promOpts = prometheus.PrometheusOpts{}
strSliceVal = []string{"foo"} strSliceVal = []string{"foo"}
strVal = "foo" strVal = "foo"
intVal = int64(1 * time.Second) intVal = int64(1 * time.Second)
@ -27,6 +30,12 @@ func makeFullTelemetryConfig(t *testing.T) TelemetryConfig {
// now for brevity but will fail the test if a new field type is added since // now for brevity but will fail the test if a new field type is added since
// this is likely not implemented in MergeDefaults either. // this is likely not implemented in MergeDefaults either.
switch f.Kind() { switch f.Kind() {
case reflect.Struct:
if f.Type() != reflect.TypeOf(promOpts) {
t.Fatalf("unknown struct type in TelemetryConfig: actual %v, expected: %v", f.Type(), reflect.TypeOf(promOpts))
}
// TODO(kit): This should delve into the fields and set them individually rather than using an empty struct
f.Set(reflect.ValueOf(promOpts))
case reflect.Slice: case reflect.Slice:
if f.Type() != reflect.TypeOf(strSliceVal) { if f.Type() != reflect.TypeOf(strSliceVal) {
t.Fatalf("unknown slice type in TelemetryConfig." + t.Fatalf("unknown slice type in TelemetryConfig." +