diff --git a/codex/utils/asyncprofiler/asyncprofiler.nim b/codex/utils/asyncprofiler/asyncprofiler.nim
index 711efa84..824eb559 100644
--- a/codex/utils/asyncprofiler/asyncprofiler.nim
+++ b/codex/utils/asyncprofiler/asyncprofiler.nim
@@ -31,8 +31,8 @@ type
   MetricsSummary* = Table[ptr SrcLoc, OverallMetrics]
 
 var
-  perFutureMetrics: Table[uint, FutureMetrics]
-  futureSummaryMetrics: MetricsSummary
+  perFutureMetrics {.threadvar.}: Table[uint, FutureMetrics]
+  futureSummaryMetrics {.threadvar.}: MetricsSummary
 
 proc getFutureSummaryMetrics*(): MetricsSummary {.gcsafe.} =
   ## get a copy of the table of summary metrics for all futures.
diff --git a/codex/utils/asyncprofiler/metricscollector.nim b/codex/utils/asyncprofiler/metricscollector.nim
index d46cceba..483fa121 100644
--- a/codex/utils/asyncprofiler/metricscollector.nim
+++ b/codex/utils/asyncprofiler/metricscollector.nim
@@ -8,83 +8,104 @@ import metrics
 when defined(metrics):
   type
-    AsyncProfilerInfo* = ref object of Gauge
+    AsyncProfilerInfo* = ref object of RootObj
       perfSampler: PerfSampler
+      sampleInterval: int
       k: int
 
     PerfSampler = proc (): MetricsSummary {.raises: [].}
 
     ProfilerMetric = (SrcLoc, OverallMetrics)
 
+  const locationLabels = ["proc", "file", "line"]
+
+  # Per-proc Metrics
+  declarePublicGauge(
+    chronos_exec_time_total,
+    "total time in which this proc actively occupied the event loop thread",
+    labels = locationLabels,
+  )
+
+  declarePublicGauge(
+    chronos_run_time_total,
+    "chronos_exec_time_total of this proc plus that of all its children " &
+      "(procs that this proc called and awaited for)",
+    labels = locationLabels,
+  )
+
+  declarePublicGauge(
+    chronos_wall_time_total,
+    "the amount of time elapsed from when the async proc was started to " &
+      "when it completed",
+    labels = locationLabels,
+  )
+
+  declarePublicGauge(
+    chronos_call_count_total,
+    "the total number of times this async proc was called and completed",
+    labels = locationLabels,
+  )
+
+  # Per-proc Statistics
+  declarePublicGauge(
+    chronos_single_exec_time_max,
+    "the maximum execution time for a single call of this proc",
+    labels = locationLabels,
+  )
+
+  # Global Statistics
+  declarePublicGauge(
+    chronos_largest_exec_time_total,
+    "the largest chronos_exec_time_total of all procs",
+  )
+
+  declarePublicGauge(
+    chronos_largest_exec_time_max,
+    "the largest chronos_single_exec_time_max of all procs",
+  )
+
   proc newCollector*(
     AsyncProfilerInfo: typedesc,
-    name: string,
-    help: string,
     perfSampler: PerfSampler,
     k: int = 10,
-    registry: Registry = defaultRegistry,
-  ): AsyncProfilerInfo =
-    result = AsyncProfilerInfo.newCollector(
-      name = name, help = help, registry = registry)
-    result.perfSampler = perfSampler
-    result.k = k
-
-  proc metricValue(duration: timer.Duration):
-    float64 = duration.nanoseconds.float64
+  ): AsyncProfilerInfo = AsyncProfilerInfo(perfSampler: perfSampler, k: k)
 
   proc collectSlowestProcs(
     self: AsyncProfilerInfo,
     profilerMetrics: seq[ProfilerMetric],
-    prometheusMetrics: var Metrics,
     timestampMillis: int64,
     k: int,
   ): void =
-    const locationLabelsKeys = @["proc", "file", "line"]
-
     for (i, pair) in enumerate(profilerMetrics):
       if i == k:
         break
 
       let (location, metrics) = pair
 
-      proc addLabeledMetric(name: string,
-          value: timer.Duration,
-          prometheusMetrics: var Metrics): void =
-        let labelValues = @[
-          $(location.procedure),
-          $(location.file),
-          $(location.line),
-        ]
+      let locationLabels = @[
+        $(location.procedure),
+        $(location.file),
+        $(location.line),
+      ]
 
-        var procMetrics = prometheusMetrics.mGetOrPut(labelValues, @[])
+      chronos_exec_time_total.set(metrics.totalExecTime.nanoseconds,
+        labelValues = locationLabels)
 
-        procMetrics.add(
-          Metric(
-            name: name,
-            value: value.metricValue(),
-            labels: locationLabelsKeys,
-            labelValues: labelValues,
-          )
-        )
+      chronos_run_time_total.set(metrics.totalRunTime.nanoseconds,
+        labelValues = locationLabels)
 
-        # If you don't reassign, your modifications are simply lost due to nim's
-        # weird var semantics.
-        prometheusMetrics[labelValues] = procMetrics
+      chronos_wall_time_total.set(metrics.totalWallTime.nanoseconds,
+        labelValues = locationLabels)
 
-      addLabeledMetric(
-        "chronos_exec_time_total", metrics.totalExecTime, prometheusMetrics)
-      addLabeledMetric(
-        "chronos_run_time_total", metrics.totalRunTime, prometheusMetrics)
-      addLabeledMetric(
-        "chronos_wall_time_total", metrics.totalWallTime, prometheusMetrics)
-      addLabeledMetric(
-        "chronos_single_exec_time_max", metrics.maxSingleTime, prometheusMetrics)
+      chronos_single_exec_time_max.set(metrics.maxSingleTime.nanoseconds,
+        labelValues = locationLabels)
+
+      chronos_call_count_total.set(metrics.count, labelValues = locationLabels)
 
   proc collectOutlierMetrics(
     self: AsyncProfilerInfo,
     profilerMetrics: seq[ProfilerMetric],
-    prometheusMetrics: var Metrics,
     timestampMillis: int64,
   ): void =
     ## Adds summary metrics for the procs that have the highest exec time
@@ -100,24 +121,12 @@ when defined(metrics):
       if metric.totalExecTime > largestExecTime:
         largestExecTime = metric.totalExecTime
 
-    prometheusMetrics[@[]].add(Metric(
-      name: "chronos_largest_exec_time_total",
-      value: largestExecTime.metricValue(),
-      timestamp: timestampMillis,
-    ))
+    chronos_largest_exec_time_total.set(largestExecTime.nanoseconds)
+    chronos_largest_exec_time_max.set(largestMaxExecTime.nanoseconds)
 
-    prometheusMetrics[@[]].add(Metric(
-      name: "chronos_largest_exec_time_max",
-      value: largestMaxExecTime.metricValue(),
-      timestamp: timestampMillis,
-    ))
-
-  method collect*(self: AsyncProfilerInfo): Metrics =
+  proc collect*(self: AsyncProfilerInfo): void =
     let now = times.getTime().toMilliseconds()
 
-    var prometheusMetrics = Metrics()
-    prometheusMetrics[@[]] = newSeq[Metric]()
-
     var currentMetrics = self.
       perfSampler().
       pairs.
@@ -132,14 +141,25 @@ when defined(metrics):
       order = SortOrder.Descending
     )
 
-    # otherwise the compiler keeps complaining of a phantom KeyError
-    {.cast(raises:[]).}:
-      self.collectOutlierMetrics(currentMetrics, prometheusMetrics, now)
-      self.collectSlowestProcs(currentMetrics, prometheusMetrics, now, self.k)
+    self.collectOutlierMetrics(currentMetrics, now)
+    self.collectSlowestProcs(currentMetrics, now, self.k)
 
-    prometheusMetrics
+  proc resetMetric(gauge: Gauge): void =
+    # We try to be as conservative as possible and not write directly to
+    # internal state. We do need to read from it, though.
+    for labelValues in gauge.metrics.keys:
+      gauge.set(0.int64, labelValues = labelValues)
 
-  var asyncProfilerInfo* {.global.} = AsyncProfilerInfo.newCollector(
-    "async_profiler_info", "Async profiler info",
-    perfSampler = getFutureSummaryMetrics
-  )
+  proc reset*(self: AsyncProfilerInfo): void =
+    resetMetric(chronos_exec_time_total)
+    resetMetric(chronos_run_time_total)
+    resetMetric(chronos_wall_time_total)
+    resetMetric(chronos_call_count_total)
+    resetMetric(chronos_single_exec_time_max)
+    resetMetric(chronos_largest_exec_time_total)
+    resetMetric(chronos_largest_exec_time_max)
+
+  proc run*(self: AsyncProfilerInfo) {.async.} =
+    while true:
+      self.collect()
+      await sleepAsync(timer.milliseconds(self.sampleInterval))
diff --git a/tests/codex/utils/asyncprofiler/testmetricscollector.nim b/tests/codex/utils/asyncprofiler/testmetricscollector.nim
index 8cf2151e..cd171016 100644
--- a/tests/codex/utils/asyncprofiler/testmetricscollector.nim
+++ b/tests/codex/utils/asyncprofiler/testmetricscollector.nim
@@ -6,7 +6,7 @@ import codex/utils/asyncprofiler
 
 import ../../helpers
 
-checksuite "asyncprofiler metrics collector":
+suite "asyncprofiler metrics collector":
 
   var locations = @[
     SrcLoc(procedure: "start", file: "discovery.nim", line: 174),
@@ -50,36 +50,28 @@ checksuite "asyncprofiler metrics collector":
     ),
   }.toTable
 
-  test "should keep track of basic worst-case exec time stats":
-    var registry = newRegistry()
-    var collector = AsyncProfilerInfo.newCollector(
-      name = "profiling_metrics",
-      help = "Metrics from the profiler",
-      registry = registry,
-      perfSampler = proc (): MetricsSummary = sample
-    )
+  var collector = AsyncProfilerInfo.newCollector(
+    perfSampler = proc (): MetricsSummary = sample,
+    k = 3,
+  )
 
-    check collector.valueByName("chronos_largest_exec_time_total") == 91660
-    check collector.valueByName("chronos_largest_exec_time_max") == 81660
+  setup:
+    collector.reset()
+    collector.collect()
+
+  test "should keep track of basic worst-case exec time stats":
+    check chronos_largest_exec_time_total.value == 91660
+    check chronos_largest_exec_time_max.value == 81660
 
   test "should create labeled series for the k slowest procs in terms of totalExecTime":
-    var registry = newRegistry()
-    var collector = AsyncProfilerInfo.newCollector(
-      name = "profiling_metrics",
-      help = "Metrics from the profiler",
-      registry = registry,
-      k = 3,
-      perfSampler = proc (): MetricsSummary = sample
-    )
-
-    check collector.valueByName("chronos_exec_time_total",
+    check chronos_exec_time_total.value(
      labelValues = @["start", "discovery.nim", "192"]) == 91660
-    check collector.valueByName("chronos_exec_time_total",
+    check chronos_exec_time_total.value(
      labelValues = @["start", "discovery.nim", "174"]) == 90062
-    check collector.valueByName("chronos_exec_time_total",
+    check chronos_exec_time_total.value(
      labelValues = @["update", "sqliteds.nim", "107"]) == 60645
 
     # This is out of the top-k slowest, so should not have been recorded.
     expect system.KeyError:
-      discard collector.valueByName("chronos_exec_time_total",
+      discard chronos_exec_time_total.value(
        labelValues = @["query", "manager.nim", "323"])
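
The sketch below is not part of the diff; it is a minimal, assumption-laden illustration of how an application could wire up the reworked collector now that the removed asyncProfilerInfo global no longer registers itself. initProfilerMetrics is a hypothetical helper name, the collector only exists when compiled with -d:metrics, and the diff does not show how sampleInterval is populated, so run() is assumed to be scheduled by the caller.

# Illustrative usage sketch only; names and wiring are assumptions, not part
# of the diff above.
import pkg/chronos
import codex/utils/asyncprofiler

proc initProfilerMetrics(k: int = 10): AsyncProfilerInfo =
  # getFutureSummaryMetrics is the PerfSampler that copies the {.threadvar.}
  # summary table maintained by the async profiler.
  AsyncProfilerInfo.newCollector(
    perfSampler = getFutureSummaryMetrics,
    k = k,
  )

when isMainModule:
  let profilerInfo = initProfilerMetrics()
  # collect() refreshes the chronos_* gauges once; run() repeats it forever,
  # sleeping sampleInterval milliseconds between samples. Note that the diff
  # never sets sampleInterval, so as written it defaults to 0 and the loop
  # would re-sample on every event loop turn.
  profilerInfo.collect()
  asyncSpawn profilerInfo.run()
  runForever()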