modify metrics collector so it uses standard gauges
This commit is contained in:
parent
b50469f6cc
commit
29d36b51f2
|
@ -31,8 +31,8 @@ type
|
||||||
MetricsSummary* = Table[ptr SrcLoc, OverallMetrics]
|
MetricsSummary* = Table[ptr SrcLoc, OverallMetrics]
|
||||||
|
|
||||||
var
|
var
|
||||||
perFutureMetrics: Table[uint, FutureMetrics]
|
perFutureMetrics {.threadvar.}: Table[uint, FutureMetrics]
|
||||||
futureSummaryMetrics: MetricsSummary
|
futureSummaryMetrics {.threadvar.}: MetricsSummary
|
||||||
|
|
||||||
proc getFutureSummaryMetrics*(): MetricsSummary {.gcsafe.} =
|
proc getFutureSummaryMetrics*(): MetricsSummary {.gcsafe.} =
|
||||||
## get a copy of the table of summary metrics for all futures.
|
## get a copy of the table of summary metrics for all futures.
|
||||||
|
|
|
@ -8,83 +8,104 @@ import metrics
|
||||||
|
|
||||||
when defined(metrics):
|
when defined(metrics):
|
||||||
type
|
type
|
||||||
AsyncProfilerInfo* = ref object of Gauge
|
AsyncProfilerInfo* = ref object of RootObj
|
||||||
perfSampler: PerfSampler
|
perfSampler: PerfSampler
|
||||||
|
sampleInterval: int
|
||||||
k: int
|
k: int
|
||||||
|
|
||||||
PerfSampler = proc (): MetricsSummary {.raises: [].}
|
PerfSampler = proc (): MetricsSummary {.raises: [].}
|
||||||
|
|
||||||
ProfilerMetric = (SrcLoc, OverallMetrics)
|
ProfilerMetric = (SrcLoc, OverallMetrics)
|
||||||
|
|
||||||
|
const locationLabels = ["proc", "file", "line"]
|
||||||
|
|
||||||
|
# Per-proc Metrics
|
||||||
|
declarePublicGauge(
|
||||||
|
chronos_exec_time_total,
|
||||||
|
"total time in which this proc actively occupied the event loop thread",
|
||||||
|
labels = locationLabels,
|
||||||
|
)
|
||||||
|
|
||||||
|
declarePublicGauge(
|
||||||
|
chronos_run_time_total,
|
||||||
|
"chronos_exec_time_total of this proc plus of all its children (procs " &
|
||||||
|
"that this proc called and awaited for)",
|
||||||
|
labels = locationLabels,
|
||||||
|
)
|
||||||
|
|
||||||
|
declarePublicGauge(
|
||||||
|
chronos_wall_time_total,
|
||||||
|
"the amount of time elapsed from when the async proc was started to when " &
|
||||||
|
"it completed",
|
||||||
|
labels = locationLabels,
|
||||||
|
)
|
||||||
|
|
||||||
|
declarePublicGauge(
|
||||||
|
chronos_call_count_total,
|
||||||
|
"the total number of times this async proc was called and completed",
|
||||||
|
labels = locationLabels,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Per-proc Statistics
|
||||||
|
declarePublicGauge(
|
||||||
|
chronos_single_exec_time_max,
|
||||||
|
"the maximum execution time for a single call of this proc",
|
||||||
|
labels = locationLabels,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Global Statistics
|
||||||
|
declarePublicGauge(
|
||||||
|
chronos_largest_exec_time_total,
|
||||||
|
"the largest chronos_exec_time_total of all procs",
|
||||||
|
)
|
||||||
|
|
||||||
|
declarePublicGauge(
|
||||||
|
chronos_largest_exec_time_max,
|
||||||
|
"the largest chronos_single_exec_time_max of all procs",
|
||||||
|
)
|
||||||
|
|
||||||
proc newCollector*(
|
proc newCollector*(
|
||||||
AsyncProfilerInfo: typedesc,
|
AsyncProfilerInfo: typedesc,
|
||||||
name: string,
|
|
||||||
help: string,
|
|
||||||
perfSampler: PerfSampler,
|
perfSampler: PerfSampler,
|
||||||
k: int = 10,
|
k: int = 10,
|
||||||
registry: Registry = defaultRegistry,
|
): AsyncProfilerInfo = AsyncProfilerInfo(perfSampler: perfSampler, k: k)
|
||||||
): AsyncProfilerInfo =
|
|
||||||
result = AsyncProfilerInfo.newCollector(
|
|
||||||
name = name, help = help, registry = registry)
|
|
||||||
result.perfSampler = perfSampler
|
|
||||||
result.k = k
|
|
||||||
|
|
||||||
proc metricValue(duration: timer.Duration):
|
|
||||||
float64 = duration.nanoseconds.float64
|
|
||||||
|
|
||||||
proc collectSlowestProcs(
|
proc collectSlowestProcs(
|
||||||
self: AsyncProfilerInfo,
|
self: AsyncProfilerInfo,
|
||||||
profilerMetrics: seq[ProfilerMetric],
|
profilerMetrics: seq[ProfilerMetric],
|
||||||
prometheusMetrics: var Metrics,
|
|
||||||
timestampMillis: int64,
|
timestampMillis: int64,
|
||||||
k: int,
|
k: int,
|
||||||
): void =
|
): void =
|
||||||
|
|
||||||
const locationLabelsKeys = @["proc", "file", "line"]
|
|
||||||
|
|
||||||
for (i, pair) in enumerate(profilerMetrics):
|
for (i, pair) in enumerate(profilerMetrics):
|
||||||
if i == k:
|
if i == k:
|
||||||
break
|
break
|
||||||
|
|
||||||
let (location, metrics) = pair
|
let (location, metrics) = pair
|
||||||
|
|
||||||
proc addLabeledMetric(name: string,
|
let locationLabels = @[
|
||||||
value: timer.Duration,
|
$(location.procedure),
|
||||||
prometheusMetrics: var Metrics): void =
|
$(location.file),
|
||||||
let labelValues = @[
|
$(location.line),
|
||||||
$(location.procedure),
|
]
|
||||||
$(location.file),
|
|
||||||
$(location.line),
|
|
||||||
]
|
|
||||||
|
|
||||||
var procMetrics = prometheusMetrics.mGetOrPut(labelValues, @[])
|
chronos_exec_time_total.set(metrics.totalExecTime.nanoseconds,
|
||||||
|
labelValues = locationLabels)
|
||||||
|
|
||||||
procMetrics.add(
|
chronos_run_time_total.set(metrics.totalRunTime.nanoseconds,
|
||||||
Metric(
|
labelValues = locationLabels)
|
||||||
name: name,
|
|
||||||
value: value.metricValue(),
|
|
||||||
labels: locationLabelsKeys,
|
|
||||||
labelValues: labelValues,
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
# If you don't reassign, your modifications are simply lost due to nim's
|
chronos_wall_time_total.set(metrics.totalWallTime.nanoseconds,
|
||||||
# weird var semantics.
|
labelValues = locationLabels)
|
||||||
prometheusMetrics[labelValues] = procMetrics
|
|
||||||
|
|
||||||
addLabeledMetric(
|
chronos_single_exec_time_max.set(metrics.maxSingleTime.nanoseconds,
|
||||||
"chronos_exec_time_total", metrics.totalExecTime, prometheusMetrics)
|
labelValues = locationLabels)
|
||||||
addLabeledMetric(
|
|
||||||
"chronos_run_time_total", metrics.totalRunTime, prometheusMetrics)
|
chronos_call_count_total.set(metrics.count, labelValues = locationLabels)
|
||||||
addLabeledMetric(
|
|
||||||
"chronos_wall_time_total", metrics.totalWallTime, prometheusMetrics)
|
|
||||||
addLabeledMetric(
|
|
||||||
"chronos_single_exec_time_max", metrics.maxSingleTime, prometheusMetrics)
|
|
||||||
|
|
||||||
proc collectOutlierMetrics(
|
proc collectOutlierMetrics(
|
||||||
self: AsyncProfilerInfo,
|
self: AsyncProfilerInfo,
|
||||||
profilerMetrics: seq[ProfilerMetric],
|
profilerMetrics: seq[ProfilerMetric],
|
||||||
prometheusMetrics: var Metrics,
|
|
||||||
timestampMillis: int64,
|
timestampMillis: int64,
|
||||||
): void =
|
): void =
|
||||||
## Adds summary metrics for the procs that have the highest exec time
|
## Adds summary metrics for the procs that have the highest exec time
|
||||||
|
@ -100,24 +121,12 @@ when defined(metrics):
|
||||||
if metric.totalExecTime > largestExecTime:
|
if metric.totalExecTime > largestExecTime:
|
||||||
largestExecTime = metric.totalExecTime
|
largestExecTime = metric.totalExecTime
|
||||||
|
|
||||||
prometheusMetrics[@[]].add(Metric(
|
chronos_largest_exec_time_total.set(largestExecTime.nanoseconds)
|
||||||
name: "chronos_largest_exec_time_total",
|
chronos_largest_exec_time_max.set(largestMaxExecTime.nanoseconds)
|
||||||
value: largestExecTime.metricValue(),
|
|
||||||
timestamp: timestampMillis,
|
|
||||||
))
|
|
||||||
|
|
||||||
prometheusMetrics[@[]].add(Metric(
|
proc collect*(self: AsyncProfilerInfo): void =
|
||||||
name: "chronos_largest_exec_time_max",
|
|
||||||
value: largestMaxExecTime.metricValue(),
|
|
||||||
timestamp: timestampMillis,
|
|
||||||
))
|
|
||||||
|
|
||||||
method collect*(self: AsyncProfilerInfo): Metrics =
|
|
||||||
let now = times.getTime().toMilliseconds()
|
let now = times.getTime().toMilliseconds()
|
||||||
|
|
||||||
var prometheusMetrics = Metrics()
|
|
||||||
prometheusMetrics[@[]] = newSeq[Metric]()
|
|
||||||
|
|
||||||
var currentMetrics = self.
|
var currentMetrics = self.
|
||||||
perfSampler().
|
perfSampler().
|
||||||
pairs.
|
pairs.
|
||||||
|
@ -132,14 +141,25 @@ when defined(metrics):
|
||||||
order = SortOrder.Descending
|
order = SortOrder.Descending
|
||||||
)
|
)
|
||||||
|
|
||||||
# otherwise the compiler keeps complaining of a phantom KeyError
|
self.collectOutlierMetrics(currentMetrics, now)
|
||||||
{.cast(raises:[]).}:
|
self.collectSlowestProcs(currentMetrics, now, self.k)
|
||||||
self.collectOutlierMetrics(currentMetrics, prometheusMetrics, now)
|
|
||||||
self.collectSlowestProcs(currentMetrics, prometheusMetrics, now, self.k)
|
|
||||||
|
|
||||||
prometheusMetrics
|
proc resetMetric(gauge: Gauge): void =
|
||||||
|
# We try to be as conservative as possible and not write directly to
|
||||||
|
# internal state. We do need to read from it, though.
|
||||||
|
for labelValues in gauge.metrics.keys:
|
||||||
|
gauge.set(0.int64, labelValues = labelValues)
|
||||||
|
|
||||||
var asyncProfilerInfo* {.global.} = AsyncProfilerInfo.newCollector(
|
proc reset*(self: AsyncProfilerInfo): void =
|
||||||
"async_profiler_info", "Async profiler info",
|
resetMetric(chronos_exec_time_total)
|
||||||
perfSampler = getFutureSummaryMetrics
|
resetMetric(chronos_run_time_total)
|
||||||
)
|
resetMetric(chronos_wall_time_total)
|
||||||
|
resetMetric(chronos_call_count_total)
|
||||||
|
resetMetric(chronos_single_exec_time_max)
|
||||||
|
resetMetric(chronos_largest_exec_time_total)
|
||||||
|
resetMetric(chronos_largest_exec_time_max)
|
||||||
|
|
||||||
|
proc run*(self: AsyncProfilerInfo) {.async.} =
|
||||||
|
while true:
|
||||||
|
self.collect()
|
||||||
|
await sleepAsync(timer.milliseconds(self.sampleInterval))
|
||||||
|
|
|
@ -6,7 +6,7 @@ import codex/utils/asyncprofiler
|
||||||
|
|
||||||
import ../../helpers
|
import ../../helpers
|
||||||
|
|
||||||
checksuite "asyncprofiler metrics collector":
|
suite "asyncprofiler metrics collector":
|
||||||
|
|
||||||
var locations = @[
|
var locations = @[
|
||||||
SrcLoc(procedure: "start", file: "discovery.nim", line: 174),
|
SrcLoc(procedure: "start", file: "discovery.nim", line: 174),
|
||||||
|
@ -50,36 +50,28 @@ checksuite "asyncprofiler metrics collector":
|
||||||
),
|
),
|
||||||
}.toTable
|
}.toTable
|
||||||
|
|
||||||
test "should keep track of basic worst-case exec time stats":
|
var collector = AsyncProfilerInfo.newCollector(
|
||||||
var registry = newRegistry()
|
perfSampler = proc (): MetricsSummary = sample,
|
||||||
var collector = AsyncProfilerInfo.newCollector(
|
k = 3,
|
||||||
name = "profiling_metrics",
|
)
|
||||||
help = "Metrics from the profiler",
|
|
||||||
registry = registry,
|
|
||||||
perfSampler = proc (): MetricsSummary = sample
|
|
||||||
)
|
|
||||||
|
|
||||||
check collector.valueByName("chronos_largest_exec_time_total") == 91660
|
setup:
|
||||||
check collector.valueByName("chronos_largest_exec_time_max") == 81660
|
collector.reset()
|
||||||
|
collector.collect()
|
||||||
|
|
||||||
|
test "should keep track of basic worst-case exec time stats":
|
||||||
|
check chronos_largest_exec_time_total.value == 91660
|
||||||
|
check chronos_largest_exec_time_max.value == 81660
|
||||||
|
|
||||||
test "should create labeled series for the k slowest procs in terms of totalExecTime":
|
test "should create labeled series for the k slowest procs in terms of totalExecTime":
|
||||||
var registry = newRegistry()
|
check chronos_exec_time_total.value(
|
||||||
var collector = AsyncProfilerInfo.newCollector(
|
|
||||||
name = "profiling_metrics",
|
|
||||||
help = "Metrics from the profiler",
|
|
||||||
registry = registry,
|
|
||||||
k = 3,
|
|
||||||
perfSampler = proc (): MetricsSummary = sample
|
|
||||||
)
|
|
||||||
|
|
||||||
check collector.valueByName("chronos_exec_time_total",
|
|
||||||
labelValues = @["start", "discovery.nim", "192"]) == 91660
|
labelValues = @["start", "discovery.nim", "192"]) == 91660
|
||||||
check collector.valueByName("chronos_exec_time_total",
|
check chronos_exec_time_total.value(
|
||||||
labelValues = @["start", "discovery.nim", "174"]) == 90062
|
labelValues = @["start", "discovery.nim", "174"]) == 90062
|
||||||
check collector.valueByName("chronos_exec_time_total",
|
check chronos_exec_time_total.value(
|
||||||
labelValues = @["update", "sqliteds.nim", "107"]) == 60645
|
labelValues = @["update", "sqliteds.nim", "107"]) == 60645
|
||||||
|
|
||||||
# This is out of the top-k slowest, so should not have been recorded.
|
# This is out of the top-k slowest, so should not have been recorded.
|
||||||
expect system.KeyError:
|
expect system.KeyError:
|
||||||
discard collector.valueByName("chronos_exec_time_total",
|
discard chronos_exec_time_total.value(
|
||||||
labelValues = @["query", "manager.nim", "323"])
|
labelValues = @["query", "manager.nim", "323"])
|
||||||
|
|
Loading…
Reference in New Issue