usagemetrics: add cluster members to metrics API (#10340)

This PR adds cluster members to the metrics API. The number of members per
segment are reported as well as the total number of members.

Tested by running a multi-node cluster locally and ensuring the numbers were
correct. Also added unit test coverage to add the new expected gauges to
existing test cases.
This commit is contained in:
Paul Ewing 2021-06-03 08:25:53 -07:00 committed by GitHub
parent 7f47603c1a
commit 42a51b1a2c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 218 additions and 13 deletions

3
.changelog/10340.txt Normal file
View File

@ -0,0 +1,3 @@
```release-note:improvement
telemetry: The usage data in the `metrics` API now includes cluster member counts, reporting clients on a per segment basis.
```

View File

@ -569,7 +569,15 @@ func NewServer(config *Config, flat Deps) (*Server, error) {
WithStateProvider(s.fsm).
WithLogger(s.logger).
WithDatacenter(s.config.Datacenter).
WithReportingInterval(s.config.MetricsReportingInterval),
WithReportingInterval(s.config.MetricsReportingInterval).
WithGetMembersFunc(func() []serf.Member {
members, err := s.LANMembersAllSegments()
if err != nil {
return []serf.Member{}
}
return members
}),
)
if err != nil {
s.Shutdown()
@ -1138,7 +1146,7 @@ func (s *Server) LANMembers() []serf.Member {
return s.serfLAN.Members()
}
// WANMembers is used to return the members of the LAN cluster
// WANMembers is used to return the members of the WAN cluster
func (s *Server) WANMembers() []serf.Member {
if s.serfWAN == nil {
return nil

View File

@ -11,6 +11,7 @@ import (
"github.com/hashicorp/consul/agent/consul/state"
"github.com/hashicorp/consul/logging"
"github.com/hashicorp/go-hclog"
"github.com/hashicorp/serf/serf"
)
var Gauges = []prometheus.GaugeDefinition{
@ -26,8 +27,18 @@ var Gauges = []prometheus.GaugeDefinition{
Name: []string{"consul", "state", "service_instances"},
Help: "Measures the current number of unique services registered with Consul, based on service name. It is only emitted by Consul servers. Added in v1.9.0.",
},
{
Name: []string{"consul", "members", "clients"},
Help: "Measures the current number of client agents registered with Consul. It is only emitted by Consul servers. Added in v1.9.6.",
},
{
Name: []string{"consul", "members", "servers"},
Help: "Measures the current number of server agents registered with Consul. It is only emitted by Consul servers. Added in v1.9.6.",
},
}
type getMembersFunc func() []serf.Member
// Config holds the settings for various parameters for the
// UsageMetricsReporter
type Config struct {
@ -35,6 +46,7 @@ type Config struct {
metricLabels []metrics.Label
stateProvider StateProvider
tickerInterval time.Duration
getMembersFunc getMembersFunc
}
// WithDatacenter adds the datacenter as a label to all metrics emitted by the
@ -63,6 +75,12 @@ func (c *Config) WithStateProvider(sp StateProvider) *Config {
return c
}
// WithGetMembersFunc specifies the function used to identify cluster members
func (c *Config) WithGetMembersFunc(fn getMembersFunc) *Config {
c.getMembersFunc = fn
return c
}
// StateProvider defines an inteface for retrieving a state.Store handle. In
// non-test code, this is satisfied by the fsm.FSM struct.
type StateProvider interface {
@ -77,6 +95,7 @@ type UsageMetricsReporter struct {
metricLabels []metrics.Label
stateProvider StateProvider
tickerInterval time.Duration
getMembersFunc getMembersFunc
}
func NewUsageMetricsReporter(cfg *Config) (*UsageMetricsReporter, error) {
@ -84,6 +103,10 @@ func NewUsageMetricsReporter(cfg *Config) (*UsageMetricsReporter, error) {
return nil, errors.New("must provide a StateProvider to usage reporter")
}
if cfg.getMembersFunc == nil {
return nil, errors.New("must provide a getMembersFunc to usage reporter")
}
if cfg.logger == nil {
cfg.logger = hclog.NewNullLogger()
}
@ -98,6 +121,7 @@ func NewUsageMetricsReporter(cfg *Config) (*UsageMetricsReporter, error) {
stateProvider: cfg.stateProvider,
metricLabels: cfg.metricLabels,
tickerInterval: cfg.tickerInterval,
getMembersFunc: cfg.getMembersFunc,
}
return u, nil
@ -137,4 +161,66 @@ func (u *UsageMetricsReporter) runOnce() {
}
u.emitServiceUsage(serviceUsage)
servers, clients := u.memberUsage()
u.emitMemberUsage(servers, clients)
}
func (u *UsageMetricsReporter) memberUsage() (int, map[string]int) {
if u.getMembersFunc == nil {
return 0, nil
}
mems := u.getMembersFunc()
if len(mems) <= 0 {
u.logger.Warn("cluster reported zero members")
return 0, nil
}
servers := 0
clients := make(map[string]int)
for _, m := range mems {
if m.Status != serf.StatusAlive {
continue
}
switch m.Tags["role"] {
case "node":
clients[m.Tags["segment"]]++
case "consul":
servers++
}
}
return servers, clients
}
func (u *UsageMetricsReporter) emitMemberUsage(servers int, clients map[string]int) {
totalClients := 0
for seg, c := range clients {
segmentLabel := metrics.Label{Name: "segment", Value: seg}
labels := append([]metrics.Label{segmentLabel}, u.metricLabels...)
metrics.SetGaugeWithLabels(
[]string{"consul", "members", "clients"},
float32(c),
labels,
)
totalClients += c
}
metrics.SetGaugeWithLabels(
[]string{"consul", "members", "clients"},
float32(totalClients),
u.metricLabels,
)
metrics.SetGaugeWithLabels(
[]string{"consul", "members", "servers"},
float32(servers),
u.metricLabels,
)
}

View File

@ -12,6 +12,7 @@ import (
"github.com/hashicorp/consul/agent/consul/state"
"github.com/hashicorp/consul/agent/structs"
"github.com/hashicorp/consul/sdk/testutil"
"github.com/hashicorp/serf/serf"
)
func newStateStore() (*state.Store, error) {
@ -21,6 +22,7 @@ func newStateStore() (*state.Store, error) {
func TestUsageReporter_emitServiceUsage_OSS(t *testing.T) {
type testCase struct {
modfiyStateStore func(t *testing.T, s *state.Store)
getMembersFunc getMembersFunc
expectedGauges map[string]metrics.GaugeValue
}
cases := map[string]testCase{
@ -45,24 +47,64 @@ func TestUsageReporter_emitServiceUsage_OSS(t *testing.T) {
{Name: "datacenter", Value: "dc1"},
},
},
"consul.usage.test.consul.members.clients;datacenter=dc1": {
Name: "consul.usage.test.consul.members.clients",
Value: 0,
Labels: []metrics.Label{
{Name: "datacenter", Value: "dc1"},
},
},
"consul.usage.test.consul.members.servers;datacenter=dc1": {
Name: "consul.usage.test.consul.members.servers",
Value: 0,
Labels: []metrics.Label{
{Name: "datacenter", Value: "dc1"},
},
},
},
getMembersFunc: func() []serf.Member { return []serf.Member{} },
},
"nodes-and-services": {
modfiyStateStore: func(t *testing.T, s *state.Store) {
require.Nil(t, s.EnsureNode(1, &structs.Node{Node: "foo", Address: "127.0.0.1"}))
require.Nil(t, s.EnsureNode(2, &structs.Node{Node: "bar", Address: "127.0.0.2"}))
require.Nil(t, s.EnsureNode(3, &structs.Node{Node: "baz", Address: "127.0.0.2"}))
require.Nil(t, s.EnsureNode(4, &structs.Node{Node: "qux", Address: "127.0.0.3"}))
// Typical services and some consul services spread across two nodes
require.Nil(t, s.EnsureService(4, "foo", &structs.NodeService{ID: "db", Service: "db", Tags: nil, Address: "", Port: 5000}))
require.Nil(t, s.EnsureService(5, "bar", &structs.NodeService{ID: "api", Service: "api", Tags: nil, Address: "", Port: 5000}))
require.Nil(t, s.EnsureService(6, "foo", &structs.NodeService{ID: "consul", Service: "consul", Tags: nil}))
require.Nil(t, s.EnsureService(7, "bar", &structs.NodeService{ID: "consul", Service: "consul", Tags: nil}))
require.Nil(t, s.EnsureService(5, "foo", &structs.NodeService{ID: "db", Service: "db", Tags: nil, Address: "", Port: 5000}))
require.Nil(t, s.EnsureService(6, "bar", &structs.NodeService{ID: "api", Service: "api", Tags: nil, Address: "", Port: 5000}))
require.Nil(t, s.EnsureService(7, "foo", &structs.NodeService{ID: "consul", Service: "consul", Tags: nil}))
require.Nil(t, s.EnsureService(8, "bar", &structs.NodeService{ID: "consul", Service: "consul", Tags: nil}))
},
getMembersFunc: func() []serf.Member {
return []serf.Member{
{
Name: "foo",
Tags: map[string]string{"role": "consul"},
Status: serf.StatusAlive,
},
{
Name: "bar",
Tags: map[string]string{"role": "consul"},
Status: serf.StatusAlive,
},
{
Name: "baz",
Tags: map[string]string{"role": "node", "segment": "a"},
Status: serf.StatusAlive,
},
{
Name: "qux",
Tags: map[string]string{"role": "node", "segment": "b"},
Status: serf.StatusAlive,
},
}
},
expectedGauges: map[string]metrics.GaugeValue{
"consul.usage.test.consul.state.nodes;datacenter=dc1": {
Name: "consul.usage.test.consul.state.nodes",
Value: 3,
Value: 4,
Labels: []metrics.Label{{Name: "datacenter", Value: "dc1"}},
},
"consul.usage.test.consul.state.services;datacenter=dc1": {
@ -79,6 +121,36 @@ func TestUsageReporter_emitServiceUsage_OSS(t *testing.T) {
{Name: "datacenter", Value: "dc1"},
},
},
"consul.usage.test.consul.members.clients;datacenter=dc1": {
Name: "consul.usage.test.consul.members.clients",
Value: 2,
Labels: []metrics.Label{
{Name: "datacenter", Value: "dc1"},
},
},
"consul.usage.test.consul.members.servers;datacenter=dc1": {
Name: "consul.usage.test.consul.members.servers",
Value: 2,
Labels: []metrics.Label{
{Name: "datacenter", Value: "dc1"},
},
},
"consul.usage.test.consul.members.clients;segment=a;datacenter=dc1": {
Name: "consul.usage.test.consul.members.clients",
Value: 1,
Labels: []metrics.Label{
{Name: "segment", Value: "a"},
{Name: "datacenter", Value: "dc1"},
},
},
"consul.usage.test.consul.members.clients;segment=b;datacenter=dc1": {
Name: "consul.usage.test.consul.members.clients",
Value: 1,
Labels: []metrics.Label{
{Name: "segment", Value: "b"},
{Name: "datacenter", Value: "dc1"},
},
},
},
},
}
@ -102,7 +174,8 @@ func TestUsageReporter_emitServiceUsage_OSS(t *testing.T) {
new(Config).
WithStateProvider(mockStateProvider).
WithLogger(testutil.Logger(t)).
WithDatacenter("dc1"),
WithDatacenter("dc1").
WithGetMembersFunc(tcase.getMembersFunc),
)
require.NoError(t, err)

View File

@ -11,6 +11,7 @@ import (
"github.com/hashicorp/consul/agent/consul/state"
"github.com/hashicorp/consul/agent/structs"
"github.com/hashicorp/consul/sdk/testutil"
"github.com/hashicorp/serf/serf"
)
type mockStateProvider struct {
@ -25,6 +26,7 @@ func (m *mockStateProvider) State() *state.Store {
func TestUsageReporter_Run_Nodes(t *testing.T) {
type testCase struct {
modfiyStateStore func(t *testing.T, s *state.Store)
getMembersFunc getMembersFunc
expectedGauges map[string]metrics.GaugeValue
}
cases := map[string]testCase{
@ -36,6 +38,7 @@ func TestUsageReporter_Run_Nodes(t *testing.T) {
Labels: []metrics.Label{{Name: "datacenter", Value: "dc1"}},
},
},
getMembersFunc: func() []serf.Member { return []serf.Member{} },
},
"nodes": {
modfiyStateStore: func(t *testing.T, s *state.Store) {
@ -43,12 +46,41 @@ func TestUsageReporter_Run_Nodes(t *testing.T) {
require.Nil(t, s.EnsureNode(2, &structs.Node{Node: "bar", Address: "127.0.0.2"}))
require.Nil(t, s.EnsureNode(3, &structs.Node{Node: "baz", Address: "127.0.0.2"}))
},
getMembersFunc: func() []serf.Member {
return []serf.Member{
{
Name: "foo",
Tags: map[string]string{"role": "consul"},
Status: serf.StatusAlive,
},
{
Name: "bar",
Tags: map[string]string{"role": "consul"},
Status: serf.StatusAlive,
},
{
Name: "baz",
Tags: map[string]string{"role": "node"},
Status: serf.StatusAlive,
},
}
},
expectedGauges: map[string]metrics.GaugeValue{
"consul.usage.test.consul.state.nodes;datacenter=dc1": {
Name: "consul.usage.test.consul.state.nodes",
Value: 3,
Labels: []metrics.Label{{Name: "datacenter", Value: "dc1"}},
},
"consul.usage.test.consul.members.clients;datacenter=dc1": {
Name: "consul.usage.test.consul.members.clients",
Value: 1,
Labels: []metrics.Label{{Name: "datacenter", Value: "dc1"}},
},
"consul.usage.test.consul.members.servers;datacenter=dc1": {
Name: "consul.usage.test.consul.members.servers",
Value: 2,
Labels: []metrics.Label{{Name: "datacenter", Value: "dc1"}},
},
},
},
}
@ -73,7 +105,8 @@ func TestUsageReporter_Run_Nodes(t *testing.T) {
new(Config).
WithStateProvider(mockStateProvider).
WithLogger(testutil.Logger(t)).
WithDatacenter("dc1"),
WithDatacenter("dc1").
WithGetMembersFunc(tcase.getMembersFunc),
)
require.NoError(t, err)

View File

@ -313,6 +313,8 @@ This is a full list of metrics emitted by Consul.
| `consul.state.nodes` | Measures the current number of nodes registered with Consul. It is only emitted by Consul servers. Added in v1.9.0. | number of objects | gauge |
| `consul.state.services` | Measures the current number of unique services registered with Consul, based on service name. It is only emitted by Consul servers. Added in v1.9.0. | number of objects | gauge |
| `consul.state.service_instances` | Measures the current number of unique service instances registered with Consul. It is only emitted by Consul servers. Added in v1.9.0. | number of objects | gauge |
| `consul.members.clients` | Measures the current number of client agents registered with Consul. It is only emitted by Consul servers. Added in v1.9.6. | number of clients | gauge |
| `consul.members.servers` | Measures the current number of server agents registered with Consul. It is only emitted by Consul servers. Added in v1.9.6. | number of servers | gauge |
| `consul.dns.stale_queries` | Increments when an agent serves a query within the allowed stale threshold. | queries | counter |
| `consul.dns.ptr_query.` | Measures the time spent handling a reverse DNS query for the given node. | ms | timer |
| `consul.dns.domain_query.` | Measures the time spent handling a domain query for the given node. | ms | timer |