From bcda205f88c57ca35e8d8dc59f7d9b037b28e713 Mon Sep 17 00:00:00 2001 From: Paul Glass Date: Wed, 12 Oct 2022 14:17:58 -0500 Subject: [PATCH] Add consul.xds.server.streamStart metric (#14957) This adds a new consul.xds.server.streamStart metric to measure the time taken to first generate xDS resources after an xDS stream is opened. --- .changelog/14957.txt | 3 +++ agent/setup.go | 4 ++-- agent/xds/delta.go | 8 ++++++++ agent/xds/delta_test.go | 15 ++++++++++++++- agent/xds/server.go | 7 +++++++ website/content/docs/agent/telemetry.mdx | 1 + 6 files changed, 35 insertions(+), 3 deletions(-) create mode 100644 .changelog/14957.txt diff --git a/.changelog/14957.txt b/.changelog/14957.txt new file mode 100644 index 0000000000..2a0facf8fa --- /dev/null +++ b/.changelog/14957.txt @@ -0,0 +1,3 @@ +```release-note:improvement +telemetry: Added a `consul.xds.server.streamStart` metric to measure time taken to first generate xDS resources for an xDS stream. +``` diff --git a/agent/setup.go b/agent/setup.go index 02dc034262..7d4a648cf5 100644 --- a/agent/setup.go +++ b/agent/setup.go @@ -202,8 +202,7 @@ func newConnPool(config *config.RuntimeConfig, logger hclog.Logger, tls *tlsutil } // getPrometheusDefs reaches into every slice of prometheus defs we've defined in each part of the agent, and appends -// -// all of our slices into one nice slice of definitions per metric type for the Consul agent to pass to go-metrics. +// all of our slices into one nice slice of definitions per metric type for the Consul agent to pass to go-metrics. func getPrometheusDefs(cfg lib.TelemetryConfig, isServer bool) ([]prometheus.GaugeDefinition, []prometheus.CounterDefinition, []prometheus.SummaryDefinition) { // TODO: "raft..." metrics come from the raft lib and we should migrate these to a telemetry // package within. In the mean time, we're going to define a few here because they're key to monitoring Consul. @@ -345,6 +344,7 @@ func getPrometheusDefs(cfg lib.TelemetryConfig, isServer bool) ([]prometheus.Gau fsm.CommandsSummaries, fsm.SnapshotSummaries, raftSummaries, + xds.StatsSummaries, } // Flatten definitions // NOTE(kit): Do we actually want to create a set here so we can ensure definition names are unique? diff --git a/agent/xds/delta.go b/agent/xds/delta.go index e87a3fd99c..dd3a131610 100644 --- a/agent/xds/delta.go +++ b/agent/xds/delta.go @@ -5,6 +5,7 @@ import ( "encoding/hex" "fmt" "sort" + "sync" "sync/atomic" "time" @@ -104,6 +105,9 @@ func (s *Server) processDelta(stream ADSDeltaStream, reqCh <-chan *envoy_discove proxyID structs.ServiceID nonce uint64 // xDS requires a unique nonce to correlate response/request pairs ready bool // set to true after the first snapshot arrives + + streamStartTime = time.Now() + streamStartOnce sync.Once ) var ( @@ -360,6 +364,10 @@ func (s *Server) processDelta(stream ADSDeltaStream, reqCh <-chan *envoy_discove generator.Logger.Trace("Invoking all xDS resource handlers and sending changed data if there are any") + streamStartOnce.Do(func() { + metrics.MeasureSince([]string{"xds", "server", "streamStart"}, streamStartTime) + }) + for _, op := range xDSUpdateOrder { err, sent := handlers[op.TypeUrl].SendIfNew( cfgSnap.Kind, diff --git a/agent/xds/delta_test.go b/agent/xds/delta_test.go index c2706c768d..15da3beadb 100644 --- a/agent/xds/delta_test.go +++ b/agent/xds/delta_test.go @@ -1448,7 +1448,7 @@ func TestServer_DeltaAggregatedResources_v3_StreamDrained(t *testing.T) { } }) - testutil.RunStep(t, "check drain counter incremeted", func(t *testing.T) { + testutil.RunStep(t, "check drain counter incremented", func(t *testing.T) { data := scenario.sink.Data() require.Len(t, data, 1) @@ -1459,6 +1459,19 @@ func TestServer_DeltaAggregatedResources_v3_StreamDrained(t *testing.T) { require.True(t, ok) require.Equal(t, 1, val.Count) }) + + testutil.RunStep(t, "check streamStart metric recorded", func(t *testing.T) { + data := scenario.sink.Data() + require.Len(t, data, 1) + + item := data[0] + require.Len(t, item.Counters, 1) + + val, ok := item.Samples["consul.xds.test.xds.server.streamStart"] + require.True(t, ok) + require.Equal(t, 1, val.Count) + }) + } type testLimiter struct { diff --git a/agent/xds/server.go b/agent/xds/server.go index 7252a6b877..828a4e2021 100644 --- a/agent/xds/server.go +++ b/agent/xds/server.go @@ -37,6 +37,13 @@ var StatsCounters = []prometheus.CounterDefinition{ }, } +var StatsSummaries = []prometheus.SummaryDefinition{ + { + Name: []string{"xds", "server", "streamStart"}, + Help: "Measures the time in milliseconds after an xDS stream is opened until xDS resources are first generated for the stream.", + }, +} + // ADSStream is a shorter way of referring to this thing... type ADSStream = envoy_discovery_v3.AggregatedDiscoveryService_StreamAggregatedResourcesServer diff --git a/website/content/docs/agent/telemetry.mdx b/website/content/docs/agent/telemetry.mdx index 0e5e8e105a..79ae6d9c14 100644 --- a/website/content/docs/agent/telemetry.mdx +++ b/website/content/docs/agent/telemetry.mdx @@ -542,6 +542,7 @@ These metrics are used to monitor the health of the Consul servers. | `consul.xds.server.streams` | Measures the number of active xDS streams handled by the server split by protocol version. | streams | gauge | | `consul.xds.server.idealStreamsMax` | The maximum number of xDS streams per server, chosen to achieve a roughly even spread of load across servers. | streams | gauge | | `consul.xds.server.streamDrained` | Counts the number of xDS streams that are drained when rebalancing the load between servers. | streams | counter | +| `consul.xds.server.streamStart` | Measures the time taken to first generate xDS resources after an xDS stream is opened. | ms | timer | ## Server Workload