modify metrics collector so it uses standard gauges

gmega 2023-11-08 12:25:19 -03:00
parent b50469f6cc
commit 29d36b51f2
3 changed files with 106 additions and 94 deletions
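Here "standard gauges" means the stock nim-metrics pattern: declare each series once with declarePublicGauge and write to it with set, instead of deriving a custom collector from Gauge and emitting Metric objects from an overridden collect. A minimal sketch of that pattern, using an illustrative gauge name and dummy values rather than code from this commit:

import metrics

declarePublicGauge(
  example_exec_time_total,   # illustrative name, not one of the commit's gauges
  "total time attributed to one async proc",
  labels = ["proc", "file", "line"],
)

when defined(metrics):
  # each distinct (proc, file, line) label combination is its own series;
  # 42 is a dummy sample value
  example_exec_time_total.set(
    42.int64, labelValues = @["start", "discovery.nim", "174"])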

View File

@@ -31,8 +31,8 @@ type
   MetricsSummary* = Table[ptr SrcLoc, OverallMetrics]

 var
-  perFutureMetrics: Table[uint, FutureMetrics]
-  futureSummaryMetrics: MetricsSummary
+  perFutureMetrics {.threadvar.}: Table[uint, FutureMetrics]
+  futureSummaryMetrics {.threadvar.}: MetricsSummary

 proc getFutureSummaryMetrics*(): MetricsSummary {.gcsafe.} =
   ## get a copy of the table of summary metrics for all futures.
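The only change to the profiler module itself is the {.threadvar.} pragma, which makes both tables thread-local: each thread running an event loop accumulates its own per-future bookkeeping instead of sharing (and racing on) global state. A small illustration of the pragma with hypothetical names, not taken from the commit:

import std/tables

var callCounts {.threadvar.}: Table[string, int]   # one independent table per thread

proc record(name: string) =
  # threadvars start zero-initialized, so the table is usable immediately
  inc callCounts.mgetOrPut(name, 0)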

View File

@@ -8,83 +8,104 @@ import metrics

 when defined(metrics):
   type
-    AsyncProfilerInfo* = ref object of Gauge
+    AsyncProfilerInfo* = ref object of RootObj
       perfSampler: PerfSampler
       sampleInterval: int
       k: int

     PerfSampler = proc (): MetricsSummary {.raises: [].}

     ProfilerMetric = (SrcLoc, OverallMetrics)

+  const locationLabels = ["proc", "file", "line"]
+
+  # Per-proc Metrics
+  declarePublicGauge(
+    chronos_exec_time_total,
+    "total time in which this proc actively occupied the event loop thread",
+    labels = locationLabels,
+  )
+  declarePublicGauge(
+    chronos_run_time_total,
+    "chronos_exec_time_total of this proc plus of all its children (procs " &
+    "that this proc called and awaited for)",
+    labels = locationLabels,
+  )
+
+  declarePublicGauge(
+    chronos_wall_time_total,
+    "the amount of time elapsed from when the async proc was started to when " &
+    "it completed",
+    labels = locationLabels,
+  )
+  declarePublicGauge(
+    chronos_call_count_total,
+    "the total number of times this async proc was called and completed",
+    labels = locationLabels,
+  )
+
+  # Per-proc Statistics
+  declarePublicGauge(
+    chronos_single_exec_time_max,
+    "the maximum execution time for a single call of this proc",
+    labels = locationLabels,
+  )
+
+  # Global Statistics
+  declarePublicGauge(
+    chronos_largest_exec_time_total,
+    "the largest chronos_exec_time_total of all procs",
+  )
+
+  declarePublicGauge(
+    chronos_largest_exec_time_max,
+    "the largest chronos_single_exec_time_max of all procs",
+  )
   proc newCollector*(
     AsyncProfilerInfo: typedesc,
-    name: string,
-    help: string,
     perfSampler: PerfSampler,
     k: int = 10,
-    registry: Registry = defaultRegistry,
-  ): AsyncProfilerInfo =
-    result = AsyncProfilerInfo.newCollector(
-      name = name, help = help, registry = registry)
-    result.perfSampler = perfSampler
-    result.k = k
-
-  proc metricValue(duration: timer.Duration):
-    float64 = duration.nanoseconds.float64
+  ): AsyncProfilerInfo = AsyncProfilerInfo(perfSampler: perfSampler, k: k)
   proc collectSlowestProcs(
     self: AsyncProfilerInfo,
     profilerMetrics: seq[ProfilerMetric],
-    prometheusMetrics: var Metrics,
     timestampMillis: int64,
     k: int,
   ): void =

-    const locationLabelsKeys = @["proc", "file", "line"]

     for (i, pair) in enumerate(profilerMetrics):
       if i == k:
         break

       let (location, metrics) = pair

-      proc addLabeledMetric(name: string,
-        value: timer.Duration,
-        prometheusMetrics: var Metrics): void =
-        let labelValues = @[
+      let locationLabels = @[
         $(location.procedure),
         $(location.file),
         $(location.line),
       ]

-        var procMetrics = prometheusMetrics.mGetOrPut(labelValues, @[])
+      chronos_exec_time_total.set(metrics.totalExecTime.nanoseconds,
+        labelValues = locationLabels)

-        procMetrics.add(
-          Metric(
-            name: name,
-            value: value.metricValue(),
-            labels: locationLabelsKeys,
-            labelValues: labelValues,
-          )
-        )
+      chronos_run_time_total.set(metrics.totalRunTime.nanoseconds,
+        labelValues = locationLabels)

-        # If you don't reassign, your modifications are simply lost due to nim's
-        # weird var semantics.
-        prometheusMetrics[labelValues] = procMetrics
+      chronos_wall_time_total.set(metrics.totalWallTime.nanoseconds,
+        labelValues = locationLabels)

-      addLabeledMetric(
-        "chronos_exec_time_total", metrics.totalExecTime, prometheusMetrics)
-      addLabeledMetric(
-        "chronos_run_time_total", metrics.totalRunTime, prometheusMetrics)
-      addLabeledMetric(
-        "chronos_wall_time_total", metrics.totalWallTime, prometheusMetrics)
-      addLabeledMetric(
-        "chronos_single_exec_time_max", metrics.maxSingleTime, prometheusMetrics)
+      chronos_single_exec_time_max.set(metrics.maxSingleTime.nanoseconds,
+        labelValues = locationLabels)
+
+      chronos_call_count_total.set(metrics.count, labelValues = locationLabels)
   proc collectOutlierMetrics(
     self: AsyncProfilerInfo,
     profilerMetrics: seq[ProfilerMetric],
-    prometheusMetrics: var Metrics,
     timestampMillis: int64,
   ): void =
     ## Adds summary metrics for the procs that have the highest exec time
@@ -100,24 +121,12 @@ when defined(metrics):
       if metric.totalExecTime > largestExecTime:
         largestExecTime = metric.totalExecTime

-    prometheusMetrics[@[]].add(Metric(
-      name: "chronos_largest_exec_time_total",
-      value: largestExecTime.metricValue(),
-      timestamp: timestampMillis,
-    ))
+    chronos_largest_exec_time_total.set(largestExecTime.nanoseconds)
+    chronos_largest_exec_time_max.set(largestMaxExecTime.nanoseconds)

-    prometheusMetrics[@[]].add(Metric(
-      name: "chronos_largest_exec_time_max",
-      value: largestMaxExecTime.metricValue(),
-      timestamp: timestampMillis,
-    ))
-  method collect*(self: AsyncProfilerInfo): Metrics =
+  proc collect*(self: AsyncProfilerInfo): void =
     let now = times.getTime().toMilliseconds()

-    var prometheusMetrics = Metrics()
-    prometheusMetrics[@[]] = newSeq[Metric]()
-
     var currentMetrics = self.
       perfSampler().
       pairs.
@@ -132,14 +141,25 @@ when defined(metrics):
         order = SortOrder.Descending
       )

     # otherwise the compiler keeps complaining of a phantom KeyError
     {.cast(raises:[]).}:
-      self.collectOutlierMetrics(currentMetrics, prometheusMetrics, now)
-      self.collectSlowestProcs(currentMetrics, prometheusMetrics, now, self.k)
+      self.collectOutlierMetrics(currentMetrics, now)
+      self.collectSlowestProcs(currentMetrics, now, self.k)

-    prometheusMetrics
+  proc resetMetric(gauge: Gauge): void =
+    # We try to be as conservative as possible and not write directly to
+    # internal state. We do need to read from it, though.
+    for labelValues in gauge.metrics.keys:
+      gauge.set(0.int64, labelValues = labelValues)
-  var asyncProfilerInfo* {.global.} = AsyncProfilerInfo.newCollector(
-    "async_profiler_info", "Async profiler info",
-    perfSampler = getFutureSummaryMetrics
-  )
+  proc reset*(self: AsyncProfilerInfo): void =
+    resetMetric(chronos_exec_time_total)
+    resetMetric(chronos_run_time_total)
+    resetMetric(chronos_wall_time_total)
+    resetMetric(chronos_call_count_total)
+    resetMetric(chronos_single_exec_time_max)
+    resetMetric(chronos_largest_exec_time_total)
+    resetMetric(chronos_largest_exec_time_max)
+
+  proc run*(self: AsyncProfilerInfo) {.async.} =
+    while true:
+      self.collect()
+      await sleepAsync(timer.milliseconds(self.sampleInterval))
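With the state now held by the gauges, the collector object reduces to a sampler: collect() copies the current per-future summary into the chronos_* series, reset() zeroes them, and run() repeats collect() every sampleInterval milliseconds. A usage sketch, assuming compilation with -d:metrics and that the import below (the path used by the test file) exposes these symbols; the periodic variant is left commented out since newCollector as shown does not set sampleInterval:

import chronos
import metrics
import codex/utils/asyncprofiler   # assumed import path, mirroring the tests

when defined(metrics):
  let profiler = AsyncProfilerInfo.newCollector(
    perfSampler = getFutureSummaryMetrics, k = 10)

  # one-shot sample: copies the current summary into the gauges
  profiler.collect()
  echo chronos_largest_exec_time_total.value

  # periodic sampling on the event loop (requires a non-zero sampleInterval):
  # asyncSpawn profiler.run()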

View File

@@ -6,7 +6,7 @@ import codex/utils/asyncprofiler

 import ../../helpers

-checksuite "asyncprofiler metrics collector":
+suite "asyncprofiler metrics collector":

   var locations = @[
     SrcLoc(procedure: "start", file: "discovery.nim", line: 174),
@@ -50,36 +50,28 @@ checksuite "asyncprofiler metrics collector":
     ),
   }.toTable

-  test "should keep track of basic worst-case exec time stats":
-    var registry = newRegistry()
   var collector = AsyncProfilerInfo.newCollector(
-    name = "profiling_metrics",
-    help = "Metrics from the profiler",
-    registry = registry,
-    perfSampler = proc (): MetricsSummary = sample
+    perfSampler = proc (): MetricsSummary = sample,
+    k = 3,
   )

-  check collector.valueByName("chronos_largest_exec_time_total") == 91660
-  check collector.valueByName("chronos_largest_exec_time_max") == 81660
+  setup:
+    collector.reset()
+    collector.collect()
+
+  test "should keep track of basic worst-case exec time stats":
+    check chronos_largest_exec_time_total.value == 91660
+    check chronos_largest_exec_time_max.value == 81660
test "should create labeled series for the k slowest procs in terms of totalExecTime":
var registry = newRegistry()
var collector = AsyncProfilerInfo.newCollector(
name = "profiling_metrics",
help = "Metrics from the profiler",
registry = registry,
k = 3,
perfSampler = proc (): MetricsSummary = sample
)
check collector.valueByName("chronos_exec_time_total",
check chronos_exec_time_total.value(
labelValues = @["start", "discovery.nim", "192"]) == 91660
check collector.valueByName("chronos_exec_time_total",
check chronos_exec_time_total.value(
labelValues = @["start", "discovery.nim", "174"]) == 90062
check collector.valueByName("chronos_exec_time_total",
check chronos_exec_time_total.value(
labelValues = @["update", "sqliteds.nim", "107"]) == 60645
# This is out of the top-k slowest, so should not have been recorded.
expect system.KeyError:
discard collector.valueByName("chronos_exec_time_total",
discard chronos_exec_time_total.value(
labelValues = @["query", "manager.nim", "323"])