usagemetrics: add cluster members to metrics API (#10340)

This PR adds cluster members to the metrics API. The number of members per
segment are reported as well as the total number of members.

Tested by running a multi-node cluster locally and ensuring the numbers were
correct. Also added unit test coverage to add the new expected gauges to
existing test cases.
This commit is contained in:
Paul Ewing 2021-06-03 08:25:53 -07:00 committed by GitHub
parent 7f47603c1a
commit 42a51b1a2c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 218 additions and 13 deletions

3
.changelog/10340.txt Normal file
View File

@ -0,0 +1,3 @@
```release-note:improvement
telemetry: The usage data in the `metrics` API now includes cluster member counts, reporting clients on a per segment basis.
```

View File

@ -569,7 +569,15 @@ func NewServer(config *Config, flat Deps) (*Server, error) {
WithStateProvider(s.fsm).
WithLogger(s.logger).
WithDatacenter(s.config.Datacenter).
WithReportingInterval(s.config.MetricsReportingInterval),
WithReportingInterval(s.config.MetricsReportingInterval).
WithGetMembersFunc(func() []serf.Member {
members, err := s.LANMembersAllSegments()
if err != nil {
return []serf.Member{}
}
return members
}),
)
if err != nil {
s.Shutdown()
@ -1138,7 +1146,7 @@ func (s *Server) LANMembers() []serf.Member {
return s.serfLAN.Members()
}
// WANMembers is used to return the members of the LAN cluster
// WANMembers is used to return the members of the WAN cluster
func (s *Server) WANMembers() []serf.Member {
if s.serfWAN == nil {
return nil

View File

@ -11,6 +11,7 @@ import (
"github.com/hashicorp/consul/agent/consul/state"
"github.com/hashicorp/consul/logging"
"github.com/hashicorp/go-hclog"
"github.com/hashicorp/serf/serf"
)
var Gauges = []prometheus.GaugeDefinition{
@ -26,8 +27,18 @@ var Gauges = []prometheus.GaugeDefinition{
Name: []string{"consul", "state", "service_instances"},
Help: "Measures the current number of unique services registered with Consul, based on service name. It is only emitted by Consul servers. Added in v1.9.0.",
},
{
Name: []string{"consul", "members", "clients"},
Help: "Measures the current number of client agents registered with Consul. It is only emitted by Consul servers. Added in v1.9.6.",
},
{
Name: []string{"consul", "members", "servers"},
Help: "Measures the current number of server agents registered with Consul. It is only emitted by Consul servers. Added in v1.9.6.",
},
}
type getMembersFunc func() []serf.Member
// Config holds the settings for various parameters for the
// UsageMetricsReporter
type Config struct {
@ -35,6 +46,7 @@ type Config struct {
metricLabels []metrics.Label
stateProvider StateProvider
tickerInterval time.Duration
getMembersFunc getMembersFunc
}
// WithDatacenter adds the datacenter as a label to all metrics emitted by the
@ -63,6 +75,12 @@ func (c *Config) WithStateProvider(sp StateProvider) *Config {
return c
}
// WithGetMembersFunc specifies the function used to identify cluster members
func (c *Config) WithGetMembersFunc(fn getMembersFunc) *Config {
c.getMembersFunc = fn
return c
}
// StateProvider defines an inteface for retrieving a state.Store handle. In
// non-test code, this is satisfied by the fsm.FSM struct.
type StateProvider interface {
@ -77,6 +95,7 @@ type UsageMetricsReporter struct {
metricLabels []metrics.Label
stateProvider StateProvider
tickerInterval time.Duration
getMembersFunc getMembersFunc
}
func NewUsageMetricsReporter(cfg *Config) (*UsageMetricsReporter, error) {
@ -84,6 +103,10 @@ func NewUsageMetricsReporter(cfg *Config) (*UsageMetricsReporter, error) {
return nil, errors.New("must provide a StateProvider to usage reporter")
}
if cfg.getMembersFunc == nil {
return nil, errors.New("must provide a getMembersFunc to usage reporter")
}
if cfg.logger == nil {
cfg.logger = hclog.NewNullLogger()
}
@ -98,6 +121,7 @@ func NewUsageMetricsReporter(cfg *Config) (*UsageMetricsReporter, error) {
stateProvider: cfg.stateProvider,
metricLabels: cfg.metricLabels,
tickerInterval: cfg.tickerInterval,
getMembersFunc: cfg.getMembersFunc,
}
return u, nil
@ -137,4 +161,66 @@ func (u *UsageMetricsReporter) runOnce() {
}
u.emitServiceUsage(serviceUsage)
servers, clients := u.memberUsage()
u.emitMemberUsage(servers, clients)
}
func (u *UsageMetricsReporter) memberUsage() (int, map[string]int) {
if u.getMembersFunc == nil {
return 0, nil
}
mems := u.getMembersFunc()
if len(mems) <= 0 {
u.logger.Warn("cluster reported zero members")
return 0, nil
}
servers := 0
clients := make(map[string]int)
for _, m := range mems {
if m.Status != serf.StatusAlive {
continue
}
switch m.Tags["role"] {
case "node":
clients[m.Tags["segment"]]++
case "consul":
servers++
}
}
return servers, clients
}
func (u *UsageMetricsReporter) emitMemberUsage(servers int, clients map[string]int) {
totalClients := 0
for seg, c := range clients {
segmentLabel := metrics.Label{Name: "segment", Value: seg}
labels := append([]metrics.Label{segmentLabel}, u.metricLabels...)
metrics.SetGaugeWithLabels(
[]string{"consul", "members", "clients"},
float32(c),
labels,
)
totalClients += c
}
metrics.SetGaugeWithLabels(
[]string{"consul", "members", "clients"},
float32(totalClients),
u.metricLabels,
)
metrics.SetGaugeWithLabels(
[]string{"consul", "members", "servers"},
float32(servers),
u.metricLabels,
)
}

View File

@ -12,6 +12,7 @@ import (
"github.com/hashicorp/consul/agent/consul/state"
"github.com/hashicorp/consul/agent/structs"
"github.com/hashicorp/consul/sdk/testutil"
"github.com/hashicorp/serf/serf"
)
func newStateStore() (*state.Store, error) {
@ -21,6 +22,7 @@ func newStateStore() (*state.Store, error) {
func TestUsageReporter_emitServiceUsage_OSS(t *testing.T) {
type testCase struct {
modfiyStateStore func(t *testing.T, s *state.Store)
getMembersFunc getMembersFunc
expectedGauges map[string]metrics.GaugeValue
}
cases := map[string]testCase{
@ -45,24 +47,64 @@ func TestUsageReporter_emitServiceUsage_OSS(t *testing.T) {
{Name: "datacenter", Value: "dc1"},
},
},
"consul.usage.test.consul.members.clients;datacenter=dc1": {
Name: "consul.usage.test.consul.members.clients",
Value: 0,
Labels: []metrics.Label{
{Name: "datacenter", Value: "dc1"},
},
},
"consul.usage.test.consul.members.servers;datacenter=dc1": {
Name: "consul.usage.test.consul.members.servers",
Value: 0,
Labels: []metrics.Label{
{Name: "datacenter", Value: "dc1"},
},
},
},
getMembersFunc: func() []serf.Member { return []serf.Member{} },
},
"nodes-and-services": {
modfiyStateStore: func(t *testing.T, s *state.Store) {
require.Nil(t, s.EnsureNode(1, &structs.Node{Node: "foo", Address: "127.0.0.1"}))
require.Nil(t, s.EnsureNode(2, &structs.Node{Node: "bar", Address: "127.0.0.2"}))
require.Nil(t, s.EnsureNode(3, &structs.Node{Node: "baz", Address: "127.0.0.2"}))
require.Nil(t, s.EnsureNode(4, &structs.Node{Node: "qux", Address: "127.0.0.3"}))
// Typical services and some consul services spread across two nodes
require.Nil(t, s.EnsureService(4, "foo", &structs.NodeService{ID: "db", Service: "db", Tags: nil, Address: "", Port: 5000}))
require.Nil(t, s.EnsureService(5, "bar", &structs.NodeService{ID: "api", Service: "api", Tags: nil, Address: "", Port: 5000}))
require.Nil(t, s.EnsureService(6, "foo", &structs.NodeService{ID: "consul", Service: "consul", Tags: nil}))
require.Nil(t, s.EnsureService(7, "bar", &structs.NodeService{ID: "consul", Service: "consul", Tags: nil}))
require.Nil(t, s.EnsureService(5, "foo", &structs.NodeService{ID: "db", Service: "db", Tags: nil, Address: "", Port: 5000}))
require.Nil(t, s.EnsureService(6, "bar", &structs.NodeService{ID: "api", Service: "api", Tags: nil, Address: "", Port: 5000}))
require.Nil(t, s.EnsureService(7, "foo", &structs.NodeService{ID: "consul", Service: "consul", Tags: nil}))
require.Nil(t, s.EnsureService(8, "bar", &structs.NodeService{ID: "consul", Service: "consul", Tags: nil}))
},
getMembersFunc: func() []serf.Member {
return []serf.Member{
{
Name: "foo",
Tags: map[string]string{"role": "consul"},
Status: serf.StatusAlive,
},
{
Name: "bar",
Tags: map[string]string{"role": "consul"},
Status: serf.StatusAlive,
},
{
Name: "baz",
Tags: map[string]string{"role": "node", "segment": "a"},
Status: serf.StatusAlive,
},
{
Name: "qux",
Tags: map[string]string{"role": "node", "segment": "b"},
Status: serf.StatusAlive,
},
}
},
expectedGauges: map[string]metrics.GaugeValue{
"consul.usage.test.consul.state.nodes;datacenter=dc1": {
Name: "consul.usage.test.consul.state.nodes",
Value: 3,
Value: 4,
Labels: []metrics.Label{{Name: "datacenter", Value: "dc1"}},
},
"consul.usage.test.consul.state.services;datacenter=dc1": {
@ -79,6 +121,36 @@ func TestUsageReporter_emitServiceUsage_OSS(t *testing.T) {
{Name: "datacenter", Value: "dc1"},
},
},
"consul.usage.test.consul.members.clients;datacenter=dc1": {
Name: "consul.usage.test.consul.members.clients",
Value: 2,
Labels: []metrics.Label{
{Name: "datacenter", Value: "dc1"},
},
},
"consul.usage.test.consul.members.servers;datacenter=dc1": {
Name: "consul.usage.test.consul.members.servers",
Value: 2,
Labels: []metrics.Label{
{Name: "datacenter", Value: "dc1"},
},
},
"consul.usage.test.consul.members.clients;segment=a;datacenter=dc1": {
Name: "consul.usage.test.consul.members.clients",
Value: 1,
Labels: []metrics.Label{
{Name: "segment", Value: "a"},
{Name: "datacenter", Value: "dc1"},
},
},
"consul.usage.test.consul.members.clients;segment=b;datacenter=dc1": {
Name: "consul.usage.test.consul.members.clients",
Value: 1,
Labels: []metrics.Label{
{Name: "segment", Value: "b"},
{Name: "datacenter", Value: "dc1"},
},
},
},
},
}
@ -102,7 +174,8 @@ func TestUsageReporter_emitServiceUsage_OSS(t *testing.T) {
new(Config).
WithStateProvider(mockStateProvider).
WithLogger(testutil.Logger(t)).
WithDatacenter("dc1"),
WithDatacenter("dc1").
WithGetMembersFunc(tcase.getMembersFunc),
)
require.NoError(t, err)

View File

@ -11,6 +11,7 @@ import (
"github.com/hashicorp/consul/agent/consul/state"
"github.com/hashicorp/consul/agent/structs"
"github.com/hashicorp/consul/sdk/testutil"
"github.com/hashicorp/serf/serf"
)
type mockStateProvider struct {
@ -25,6 +26,7 @@ func (m *mockStateProvider) State() *state.Store {
func TestUsageReporter_Run_Nodes(t *testing.T) {
type testCase struct {
modfiyStateStore func(t *testing.T, s *state.Store)
getMembersFunc getMembersFunc
expectedGauges map[string]metrics.GaugeValue
}
cases := map[string]testCase{
@ -36,6 +38,7 @@ func TestUsageReporter_Run_Nodes(t *testing.T) {
Labels: []metrics.Label{{Name: "datacenter", Value: "dc1"}},
},
},
getMembersFunc: func() []serf.Member { return []serf.Member{} },
},
"nodes": {
modfiyStateStore: func(t *testing.T, s *state.Store) {
@ -43,12 +46,41 @@ func TestUsageReporter_Run_Nodes(t *testing.T) {
require.Nil(t, s.EnsureNode(2, &structs.Node{Node: "bar", Address: "127.0.0.2"}))
require.Nil(t, s.EnsureNode(3, &structs.Node{Node: "baz", Address: "127.0.0.2"}))
},
getMembersFunc: func() []serf.Member {
return []serf.Member{
{
Name: "foo",
Tags: map[string]string{"role": "consul"},
Status: serf.StatusAlive,
},
{
Name: "bar",
Tags: map[string]string{"role": "consul"},
Status: serf.StatusAlive,
},
{
Name: "baz",
Tags: map[string]string{"role": "node"},
Status: serf.StatusAlive,
},
}
},
expectedGauges: map[string]metrics.GaugeValue{
"consul.usage.test.consul.state.nodes;datacenter=dc1": {
Name: "consul.usage.test.consul.state.nodes",
Value: 3,
Labels: []metrics.Label{{Name: "datacenter", Value: "dc1"}},
},
"consul.usage.test.consul.members.clients;datacenter=dc1": {
Name: "consul.usage.test.consul.members.clients",
Value: 1,
Labels: []metrics.Label{{Name: "datacenter", Value: "dc1"}},
},
"consul.usage.test.consul.members.servers;datacenter=dc1": {
Name: "consul.usage.test.consul.members.servers",
Value: 2,
Labels: []metrics.Label{{Name: "datacenter", Value: "dc1"}},
},
},
},
}
@ -73,7 +105,8 @@ func TestUsageReporter_Run_Nodes(t *testing.T) {
new(Config).
WithStateProvider(mockStateProvider).
WithLogger(testutil.Logger(t)).
WithDatacenter("dc1"),
WithDatacenter("dc1").
WithGetMembersFunc(tcase.getMembersFunc),
)
require.NoError(t, err)

View File

@ -255,14 +255,14 @@ reflect what would happen if an agent restarts now.
| :-------------------------------- | :--------------------------------------------------------------- | :---- | :---- |
| `consul.system.licenseExpiration` | Number of hours until the Consul Enterprise license will expire. | hours | gauge |
**Why they're important:**
**Why they're important:**
This measurement indicates how many hours are left before the Consul Enterprise license expires. When the license expires some
Consul Enterprise features will cease to work. An example of this is that after expiration, it is no longer possible to create
or modify resources in non-default namespaces or to manage namespace definitions themselves even though reads of namespaced
or modify resources in non-default namespaces or to manage namespace definitions themselves even though reads of namespaced
resources will still work.
**What to look for:**
**What to look for:**
This metric should be monitored to ensure that the license doesn't expire to prevent degradation of functionality.
@ -313,11 +313,13 @@ This is a full list of metrics emitted by Consul.
| `consul.state.nodes` | Measures the current number of nodes registered with Consul. It is only emitted by Consul servers. Added in v1.9.0. | number of objects | gauge |
| `consul.state.services` | Measures the current number of unique services registered with Consul, based on service name. It is only emitted by Consul servers. Added in v1.9.0. | number of objects | gauge |
| `consul.state.service_instances` | Measures the current number of unique service instances registered with Consul. It is only emitted by Consul servers. Added in v1.9.0. | number of objects | gauge |
| `consul.members.clients` | Measures the current number of client agents registered with Consul. It is only emitted by Consul servers. Added in v1.9.6. | number of clients | gauge |
| `consul.members.servers` | Measures the current number of server agents registered with Consul. It is only emitted by Consul servers. Added in v1.9.6. | number of servers | gauge |
| `consul.dns.stale_queries` | Increments when an agent serves a query within the allowed stale threshold. | queries | counter |
| `consul.dns.ptr_query.` | Measures the time spent handling a reverse DNS query for the given node. | ms | timer |
| `consul.dns.domain_query.` | Measures the time spent handling a domain query for the given node. | ms | timer |
| `consul.http...` | DEPRECATED IN 1.9: Tracks how long it takes to service the given HTTP request for the given verb and path. Paths do not include details like service or key names, for these an underscore will be present as a placeholder (eg. `consul.http.GET.v1.kv._`) | ms | timer |
| `consul.system.licenseExpiration` | <EnterpriseAlert inline /> This measures the number of hours remaining on the agents license. | hours | gauge |
| `consul.system.licenseExpiration` | <EnterpriseAlert inline /> This measures the number of hours remaining on the agents license. | hours | gauge |
| `consul.version` | Measures the count of running agents. | agents | guage |
## Server Health