diff --git a/.changelog/10340.txt b/.changelog/10340.txt
new file mode 100644
index 0000000000..ff2a882a25
--- /dev/null
+++ b/.changelog/10340.txt
@@ -0,0 +1,3 @@
+```release-note:improvement
+telemetry: The usage data in the `metrics` API now includes cluster member counts, reporting clients on a per-segment basis.
+```
diff --git a/agent/consul/server.go b/agent/consul/server.go
index a53916aa48..08677a6b17 100644
--- a/agent/consul/server.go
+++ b/agent/consul/server.go
@@ -569,7 +569,15 @@ func NewServer(config *Config, flat Deps) (*Server, error) {
WithStateProvider(s.fsm).
WithLogger(s.logger).
WithDatacenter(s.config.Datacenter).
- WithReportingInterval(s.config.MetricsReportingInterval),
+ WithReportingInterval(s.config.MetricsReportingInterval).
+ WithGetMembersFunc(func() []serf.Member {
+ 	members, err := s.LANMembersAllSegments()
+ 	if err != nil {
+ 		s.logger.Warn("failed to list LAN members for usage metrics", "error", err)
+ 		return []serf.Member{} // empty result: the reporter then emits zero member counts
+ 	}
+ 	return members
+ }),
)
if err != nil {
s.Shutdown()
@@ -1138,7 +1146,7 @@ func (s *Server) LANMembers() []serf.Member {
return s.serfLAN.Members()
}
-// WANMembers is used to return the members of the LAN cluster
+// WANMembers is used to return the members of the WAN cluster
func (s *Server) WANMembers() []serf.Member {
if s.serfWAN == nil {
return nil
diff --git a/agent/consul/usagemetrics/usagemetrics.go b/agent/consul/usagemetrics/usagemetrics.go
index da09890e5f..353e9a45df 100644
--- a/agent/consul/usagemetrics/usagemetrics.go
+++ b/agent/consul/usagemetrics/usagemetrics.go
@@ -11,6 +11,7 @@ import (
"github.com/hashicorp/consul/agent/consul/state"
"github.com/hashicorp/consul/logging"
"github.com/hashicorp/go-hclog"
+ "github.com/hashicorp/serf/serf"
)
var Gauges = []prometheus.GaugeDefinition{
@@ -26,8 +27,18 @@ var Gauges = []prometheus.GaugeDefinition{
Name: []string{"consul", "state", "service_instances"},
Help: "Measures the current number of unique services registered with Consul, based on service name. It is only emitted by Consul servers. Added in v1.9.0.",
},
+ {
+ Name: []string{"consul", "members", "clients"},
+ Help: "Measures the current number of client agents registered with Consul. It is only emitted by Consul servers. Added in v1.9.6.",
+ },
+ {
+ Name: []string{"consul", "members", "servers"},
+ Help: "Measures the current number of server agents registered with Consul. It is only emitted by Consul servers. Added in v1.9.6.",
+ },
}
+type getMembersFunc func() []serf.Member // supplies the cluster members that memberUsage counts
+
// Config holds the settings for various parameters for the
// UsageMetricsReporter
type Config struct {
@@ -35,6 +46,7 @@ type Config struct {
metricLabels []metrics.Label
stateProvider StateProvider
tickerInterval time.Duration
+ getMembersFunc getMembersFunc
}
// WithDatacenter adds the datacenter as a label to all metrics emitted by the
@@ -63,6 +75,12 @@ func (c *Config) WithStateProvider(sp StateProvider) *Config {
return c
}
+// WithGetMembersFunc sets the callback that supplies the cluster members to count.
+func (c *Config) WithGetMembersFunc(fn getMembersFunc) *Config {
+ c.getMembersFunc = fn
+ return c
+}
+
// StateProvider defines an inteface for retrieving a state.Store handle. In
// non-test code, this is satisfied by the fsm.FSM struct.
type StateProvider interface {
@@ -77,6 +95,7 @@ type UsageMetricsReporter struct {
metricLabels []metrics.Label
stateProvider StateProvider
tickerInterval time.Duration
+ getMembersFunc getMembersFunc
}
func NewUsageMetricsReporter(cfg *Config) (*UsageMetricsReporter, error) {
@@ -84,6 +103,10 @@ func NewUsageMetricsReporter(cfg *Config) (*UsageMetricsReporter, error) {
return nil, errors.New("must provide a StateProvider to usage reporter")
}
+ if cfg.getMembersFunc == nil {
+ return nil, errors.New("must provide a getMembersFunc to usage reporter")
+ }
+
if cfg.logger == nil {
cfg.logger = hclog.NewNullLogger()
}
@@ -98,6 +121,7 @@ func NewUsageMetricsReporter(cfg *Config) (*UsageMetricsReporter, error) {
stateProvider: cfg.stateProvider,
metricLabels: cfg.metricLabels,
tickerInterval: cfg.tickerInterval,
+ getMembersFunc: cfg.getMembersFunc,
}
return u, nil
@@ -137,4 +161,66 @@ func (u *UsageMetricsReporter) runOnce() {
}
u.emitServiceUsage(serviceUsage)
+
+ servers, clients := u.memberUsage()
+ u.emitMemberUsage(servers, clients)
+}
+
+// memberUsage counts alive members from getMembersFunc: servers (role tag
+// "consul") in total and clients (role tag "node") per "segment" tag; all
+// other members are ignored. It returns (0, nil) if no members are available.
+func (u *UsageMetricsReporter) memberUsage() (int, map[string]int) {
+	if u.getMembersFunc == nil {
+		return 0, nil
+	}
+
+	mems := u.getMembersFunc()
+	if len(mems) == 0 {
+		u.logger.Warn("cluster reported zero members")
+		return 0, nil
+	}
+
+	servers := 0
+	clients := make(map[string]int)
+	for _, m := range mems {
+		if m.Status != serf.StatusAlive {
+			continue
+		}
+		switch m.Tags["role"] {
+		case "node":
+			clients[m.Tags["segment"]]++
+		case "consul":
+			servers++
+		}
+	}
+	return servers, clients
+}
+
+func (u *UsageMetricsReporter) emitMemberUsage(servers int, clients map[string]int) {
+ totalClients := 0
+
+ for seg, c := range clients {
+ segmentLabel := metrics.Label{Name: "segment", Value: seg}
+ labels := append([]metrics.Label{segmentLabel}, u.metricLabels...)
+
+ metrics.SetGaugeWithLabels(
+ []string{"consul", "members", "clients"},
+ float32(c),
+ labels,
+ )
+
+ totalClients += c
+ }
+
+ metrics.SetGaugeWithLabels(
+ []string{"consul", "members", "clients"},
+ float32(totalClients),
+ u.metricLabels,
+ )
+
+ metrics.SetGaugeWithLabels(
+ []string{"consul", "members", "servers"},
+ float32(servers),
+ u.metricLabels,
+ )
}
diff --git a/agent/consul/usagemetrics/usagemetrics_oss_test.go b/agent/consul/usagemetrics/usagemetrics_oss_test.go
index d4919914ff..e232014358 100644
--- a/agent/consul/usagemetrics/usagemetrics_oss_test.go
+++ b/agent/consul/usagemetrics/usagemetrics_oss_test.go
@@ -12,6 +12,7 @@ import (
"github.com/hashicorp/consul/agent/consul/state"
"github.com/hashicorp/consul/agent/structs"
"github.com/hashicorp/consul/sdk/testutil"
+ "github.com/hashicorp/serf/serf"
)
func newStateStore() (*state.Store, error) {
@@ -21,6 +22,7 @@ func newStateStore() (*state.Store, error) {
func TestUsageReporter_emitServiceUsage_OSS(t *testing.T) {
type testCase struct {
modfiyStateStore func(t *testing.T, s *state.Store)
+ getMembersFunc getMembersFunc
expectedGauges map[string]metrics.GaugeValue
}
cases := map[string]testCase{
@@ -45,24 +47,64 @@ func TestUsageReporter_emitServiceUsage_OSS(t *testing.T) {
{Name: "datacenter", Value: "dc1"},
},
},
+ "consul.usage.test.consul.members.clients;datacenter=dc1": {
+ Name: "consul.usage.test.consul.members.clients",
+ Value: 0,
+ Labels: []metrics.Label{
+ {Name: "datacenter", Value: "dc1"},
+ },
+ },
+ "consul.usage.test.consul.members.servers;datacenter=dc1": {
+ Name: "consul.usage.test.consul.members.servers",
+ Value: 0,
+ Labels: []metrics.Label{
+ {Name: "datacenter", Value: "dc1"},
+ },
+ },
},
+ getMembersFunc: func() []serf.Member { return []serf.Member{} },
},
"nodes-and-services": {
modfiyStateStore: func(t *testing.T, s *state.Store) {
require.Nil(t, s.EnsureNode(1, &structs.Node{Node: "foo", Address: "127.0.0.1"}))
require.Nil(t, s.EnsureNode(2, &structs.Node{Node: "bar", Address: "127.0.0.2"}))
require.Nil(t, s.EnsureNode(3, &structs.Node{Node: "baz", Address: "127.0.0.2"}))
+ require.Nil(t, s.EnsureNode(4, &structs.Node{Node: "qux", Address: "127.0.0.3"}))
// Typical services and some consul services spread across two nodes
- require.Nil(t, s.EnsureService(4, "foo", &structs.NodeService{ID: "db", Service: "db", Tags: nil, Address: "", Port: 5000}))
- require.Nil(t, s.EnsureService(5, "bar", &structs.NodeService{ID: "api", Service: "api", Tags: nil, Address: "", Port: 5000}))
- require.Nil(t, s.EnsureService(6, "foo", &structs.NodeService{ID: "consul", Service: "consul", Tags: nil}))
- require.Nil(t, s.EnsureService(7, "bar", &structs.NodeService{ID: "consul", Service: "consul", Tags: nil}))
+ require.Nil(t, s.EnsureService(5, "foo", &structs.NodeService{ID: "db", Service: "db", Tags: nil, Address: "", Port: 5000}))
+ require.Nil(t, s.EnsureService(6, "bar", &structs.NodeService{ID: "api", Service: "api", Tags: nil, Address: "", Port: 5000}))
+ require.Nil(t, s.EnsureService(7, "foo", &structs.NodeService{ID: "consul", Service: "consul", Tags: nil}))
+ require.Nil(t, s.EnsureService(8, "bar", &structs.NodeService{ID: "consul", Service: "consul", Tags: nil}))
+ },
+ getMembersFunc: func() []serf.Member {
+ return []serf.Member{
+ {
+ Name: "foo",
+ Tags: map[string]string{"role": "consul"},
+ Status: serf.StatusAlive,
+ },
+ {
+ Name: "bar",
+ Tags: map[string]string{"role": "consul"},
+ Status: serf.StatusAlive,
+ },
+ {
+ Name: "baz",
+ Tags: map[string]string{"role": "node", "segment": "a"},
+ Status: serf.StatusAlive,
+ },
+ {
+ Name: "qux",
+ Tags: map[string]string{"role": "node", "segment": "b"},
+ Status: serf.StatusAlive,
+ },
+ }
},
expectedGauges: map[string]metrics.GaugeValue{
"consul.usage.test.consul.state.nodes;datacenter=dc1": {
Name: "consul.usage.test.consul.state.nodes",
- Value: 3,
+ Value: 4,
Labels: []metrics.Label{{Name: "datacenter", Value: "dc1"}},
},
"consul.usage.test.consul.state.services;datacenter=dc1": {
@@ -79,6 +121,36 @@ func TestUsageReporter_emitServiceUsage_OSS(t *testing.T) {
{Name: "datacenter", Value: "dc1"},
},
},
+ "consul.usage.test.consul.members.clients;datacenter=dc1": {
+ Name: "consul.usage.test.consul.members.clients",
+ Value: 2,
+ Labels: []metrics.Label{
+ {Name: "datacenter", Value: "dc1"},
+ },
+ },
+ "consul.usage.test.consul.members.servers;datacenter=dc1": {
+ Name: "consul.usage.test.consul.members.servers",
+ Value: 2,
+ Labels: []metrics.Label{
+ {Name: "datacenter", Value: "dc1"},
+ },
+ },
+ "consul.usage.test.consul.members.clients;segment=a;datacenter=dc1": {
+ Name: "consul.usage.test.consul.members.clients",
+ Value: 1,
+ Labels: []metrics.Label{
+ {Name: "segment", Value: "a"},
+ {Name: "datacenter", Value: "dc1"},
+ },
+ },
+ "consul.usage.test.consul.members.clients;segment=b;datacenter=dc1": {
+ Name: "consul.usage.test.consul.members.clients",
+ Value: 1,
+ Labels: []metrics.Label{
+ {Name: "segment", Value: "b"},
+ {Name: "datacenter", Value: "dc1"},
+ },
+ },
},
},
}
@@ -102,7 +174,8 @@ func TestUsageReporter_emitServiceUsage_OSS(t *testing.T) {
new(Config).
WithStateProvider(mockStateProvider).
WithLogger(testutil.Logger(t)).
- WithDatacenter("dc1"),
+ WithDatacenter("dc1").
+ WithGetMembersFunc(tcase.getMembersFunc),
)
require.NoError(t, err)
diff --git a/agent/consul/usagemetrics/usagemetrics_test.go b/agent/consul/usagemetrics/usagemetrics_test.go
index cd34581c61..1c4be1d5b1 100644
--- a/agent/consul/usagemetrics/usagemetrics_test.go
+++ b/agent/consul/usagemetrics/usagemetrics_test.go
@@ -11,6 +11,7 @@ import (
"github.com/hashicorp/consul/agent/consul/state"
"github.com/hashicorp/consul/agent/structs"
"github.com/hashicorp/consul/sdk/testutil"
+ "github.com/hashicorp/serf/serf"
)
type mockStateProvider struct {
@@ -25,6 +26,7 @@ func (m *mockStateProvider) State() *state.Store {
func TestUsageReporter_Run_Nodes(t *testing.T) {
type testCase struct {
modfiyStateStore func(t *testing.T, s *state.Store)
+ getMembersFunc getMembersFunc
expectedGauges map[string]metrics.GaugeValue
}
cases := map[string]testCase{
@@ -36,6 +38,7 @@ func TestUsageReporter_Run_Nodes(t *testing.T) {
Labels: []metrics.Label{{Name: "datacenter", Value: "dc1"}},
},
},
+ getMembersFunc: func() []serf.Member { return []serf.Member{} },
},
"nodes": {
modfiyStateStore: func(t *testing.T, s *state.Store) {
@@ -43,12 +46,41 @@ func TestUsageReporter_Run_Nodes(t *testing.T) {
require.Nil(t, s.EnsureNode(2, &structs.Node{Node: "bar", Address: "127.0.0.2"}))
require.Nil(t, s.EnsureNode(3, &structs.Node{Node: "baz", Address: "127.0.0.2"}))
},
+ getMembersFunc: func() []serf.Member {
+ return []serf.Member{
+ {
+ Name: "foo",
+ Tags: map[string]string{"role": "consul"},
+ Status: serf.StatusAlive,
+ },
+ {
+ Name: "bar",
+ Tags: map[string]string{"role": "consul"},
+ Status: serf.StatusAlive,
+ },
+ {
+ Name: "baz",
+ Tags: map[string]string{"role": "node"},
+ Status: serf.StatusAlive,
+ },
+ }
+ },
expectedGauges: map[string]metrics.GaugeValue{
"consul.usage.test.consul.state.nodes;datacenter=dc1": {
Name: "consul.usage.test.consul.state.nodes",
Value: 3,
Labels: []metrics.Label{{Name: "datacenter", Value: "dc1"}},
},
+ "consul.usage.test.consul.members.clients;datacenter=dc1": {
+ Name: "consul.usage.test.consul.members.clients",
+ Value: 1,
+ Labels: []metrics.Label{{Name: "datacenter", Value: "dc1"}},
+ },
+ "consul.usage.test.consul.members.servers;datacenter=dc1": {
+ Name: "consul.usage.test.consul.members.servers",
+ Value: 2,
+ Labels: []metrics.Label{{Name: "datacenter", Value: "dc1"}},
+ },
},
},
}
@@ -73,7 +105,8 @@ func TestUsageReporter_Run_Nodes(t *testing.T) {
new(Config).
WithStateProvider(mockStateProvider).
WithLogger(testutil.Logger(t)).
- WithDatacenter("dc1"),
+ WithDatacenter("dc1").
+ WithGetMembersFunc(tcase.getMembersFunc),
)
require.NoError(t, err)
diff --git a/website/content/docs/agent/telemetry.mdx b/website/content/docs/agent/telemetry.mdx
index f94dca0ede..50dfcbd9c2 100644
--- a/website/content/docs/agent/telemetry.mdx
+++ b/website/content/docs/agent/telemetry.mdx
@@ -255,14 +255,14 @@ reflect what would happen if an agent restarts now.
| :-------------------------------- | :--------------------------------------------------------------- | :---- | :---- |
| `consul.system.licenseExpiration` | Number of hours until the Consul Enterprise license will expire. | hours | gauge |
-**Why they're important:**
+**Why they're important:**
This measurement indicates how many hours are left before the Consul Enterprise license expires. When the license expires some
Consul Enterprise features will cease to work. An example of this is that after expiration, it is no longer possible to create
-or modify resources in non-default namespaces or to manage namespace definitions themselves even though reads of namespaced
+or modify resources in non-default namespaces or to manage namespace definitions themselves even though reads of namespaced
resources will still work.
-**What to look for:**
+**What to look for:**
This metric should be monitored to ensure that the license doesn't expire to prevent degradation of functionality.
@@ -313,11 +313,13 @@ This is a full list of metrics emitted by Consul.
| `consul.state.nodes` | Measures the current number of nodes registered with Consul. It is only emitted by Consul servers. Added in v1.9.0. | number of objects | gauge |
| `consul.state.services` | Measures the current number of unique services registered with Consul, based on service name. It is only emitted by Consul servers. Added in v1.9.0. | number of objects | gauge |
| `consul.state.service_instances` | Measures the current number of unique service instances registered with Consul. It is only emitted by Consul servers. Added in v1.9.0. | number of objects | gauge |
+| `consul.members.clients` | Measures the current number of client agents registered with Consul. It is only emitted by Consul servers. Added in v1.9.6. | number of clients | gauge |
+| `consul.members.servers` | Measures the current number of server agents registered with Consul. It is only emitted by Consul servers. Added in v1.9.6. | number of servers | gauge |
| `consul.dns.stale_queries` | Increments when an agent serves a query within the allowed stale threshold. | queries | counter |
| `consul.dns.ptr_query.` | Measures the time spent handling a reverse DNS query for the given node. | ms | timer |
| `consul.dns.domain_query.` | Measures the time spent handling a domain query for the given node. | ms | timer |
| `consul.http...` | DEPRECATED IN 1.9: Tracks how long it takes to service the given HTTP request for the given verb and path. Paths do not include details like service or key names, for these an underscore will be present as a placeholder (eg. `consul.http.GET.v1.kv._`) | ms | timer |
-| `consul.system.licenseExpiration` | This measures the number of hours remaining on the agents license. | hours | gauge |
+| `consul.system.licenseExpiration` | This measures the number of hours remaining on the agent's license. | hours | gauge |
| `consul.version` | Measures the count of running agents. | agents | guage |
## Server Health