modify metrics collector so it uses standard gauges

2023-11-08 12:25:19 -03:00 · 2023-11-08 12:25:19 -03:00 · 29d36b51f2
parent b50469f6cc
commit 29d36b51f2
3 changed files with 106 additions and 94 deletions
--- a/codex/utils/asyncprofiler/asyncprofiler.nim
+++ b/codex/utils/asyncprofiler/asyncprofiler.nim
@ -31,8 +31,8 @@ type
  MetricsSummary* = Table[ptr SrcLoc, OverallMetrics]
 var
-  perFutureMetrics: Table[uint, FutureMetrics]
+  perFutureMetrics {.threadvar.}: Table[uint, FutureMetrics]
-  futureSummaryMetrics: MetricsSummary
+  futureSummaryMetrics {.threadvar.}: MetricsSummary
 proc getFutureSummaryMetrics*(): MetricsSummary {.gcsafe.} =
  ## get a copy of the table of summary metrics for all futures.
--- a/codex/utils/asyncprofiler/metricscollector.nim
+++ b/codex/utils/asyncprofiler/metricscollector.nim
@ -8,83 +8,104 @@ import metrics
 when defined(metrics):
  type
-    AsyncProfilerInfo* = ref object of Gauge
+    AsyncProfilerInfo* = ref object of RootObj
      perfSampler: PerfSampler
      sampleInterval: int
      k: int
    PerfSampler = proc (): MetricsSummary {.raises: [].}
    ProfilerMetric = (SrcLoc, OverallMetrics)
  const locationLabels = ["proc", "file", "line"]
  # Per-proc Metrics
  declarePublicGauge(
    chronos_exec_time_total,
    "total time in which this proc actively occupied the event loop thread",
    labels = locationLabels,
  )
  # Help text is split across two literals; the first one ends with a space so
  # the words do not run together once the literals are concatenated.
  declarePublicGauge(
    chronos_run_time_total,
    "chronos_exec_time_total of this proc plus of all its children (procs " &
    "that this proc called and awaited for)",
    labels = locationLabels,
  )
  # Help text is split across two literals; the first one ends with a space so
  # the words do not run together once the literals are concatenated.
  declarePublicGauge(
    chronos_wall_time_total,
    "the amount of time elapsed from when the async proc was started to when " &
    "it completed",
    labels = locationLabels,
  )
  declarePublicGauge(
    chronos_call_count_total,
    "the total number of times this async proc was called and completed",
    labels = locationLabels,
  )
  # Per-proc Statistics
  declarePublicGauge(
    chronos_single_exec_time_max,
    "the maximum execution time for a single call of this proc",
    labels = locationLabels,
  )
  # Global Statistics
  declarePublicGauge(
    chronos_largest_exec_time_total,
    "the largest chronos_exec_time_total of all procs",
  )
  declarePublicGauge(
    chronos_largest_exec_time_max,
    "the largest chronos_single_exec_time_max of all procs",
  )
  proc newCollector*(
    AsyncProfilerInfo: typedesc,
    name: string,
    help: string,
    perfSampler: PerfSampler,
    k: int = 10,
-    registry: Registry = defaultRegistry,
+  ): AsyncProfilerInfo = AsyncProfilerInfo(perfSampler: perfSampler, k: k)
  ): AsyncProfilerInfo =
    result = AsyncProfilerInfo.newCollector(
      name = name, help = help, registry = registry)
    result.perfSampler = perfSampler
    result.k = k
  proc metricValue(duration: timer.Duration): float64 =
    ## Converts `duration` to its nanosecond count, expressed as a `float64`.
    float64(duration.nanoseconds)
  proc collectSlowestProcs(
    self: AsyncProfilerInfo,
    profilerMetrics: seq[ProfilerMetric],
    prometheusMetrics: var Metrics,
    timestampMillis: int64,
    k: int,
  ): void =
    const locationLabelsKeys = @["proc", "file", "line"]
    for (i, pair) in enumerate(profilerMetrics):
      if i == k:
        break
      let (location, metrics) = pair
-      proc addLabeledMetric(name: string,
+      let locationLabels = @[
-        value: timer.Duration,
+        $(location.procedure),
-        prometheusMetrics: var Metrics): void =
+        $(location.file),
-        let labelValues = @[
+        $(location.line),
-          $(location.procedure),
+      ]
          $(location.file),
          $(location.line),
        ]
-        var procMetrics = prometheusMetrics.mGetOrPut(labelValues, @[])
+      chronos_exec_time_total.set(metrics.totalExecTime.nanoseconds,
        labelValues = locationLabels)
-        procMetrics.add(
+      chronos_run_time_total.set(metrics.totalRunTime.nanoseconds,
-          Metric(
+        labelValues = locationLabels)
            name: name,
            value: value.metricValue(),
            labels: locationLabelsKeys,
            labelValues: labelValues,
          )
        )
-        # If you don't reassign, your modifications are simply lost due to nim's
+      chronos_wall_time_total.set(metrics.totalWallTime.nanoseconds,
-        # weird var semantics.
+        labelValues = locationLabels)
        prometheusMetrics[labelValues] = procMetrics
-      addLabeledMetric(
+      chronos_single_exec_time_max.set(metrics.maxSingleTime.nanoseconds,
-        "chronos_exec_time_total", metrics.totalExecTime, prometheusMetrics)
+        labelValues = locationLabels)
-      addLabeledMetric(
+
-        "chronos_run_time_total", metrics.totalRunTime, prometheusMetrics)
+      chronos_call_count_total.set(metrics.count, labelValues = locationLabels)
      addLabeledMetric(
        "chronos_wall_time_total", metrics.totalWallTime, prometheusMetrics)
      addLabeledMetric(
        "chronos_single_exec_time_max", metrics.maxSingleTime, prometheusMetrics)
  proc collectOutlierMetrics(
    self: AsyncProfilerInfo,
    profilerMetrics: seq[ProfilerMetric],
    prometheusMetrics: var Metrics,
    timestampMillis: int64,
  ): void =
    ## Adds summary metrics for the procs that have the highest exec time
@ -100,24 +121,12 @@ when defined(metrics):
      if metric.totalExecTime > largestExecTime:
        largestExecTime = metric.totalExecTime
-    prometheusMetrics[@[]].add(Metric(
+    chronos_largest_exec_time_total.set(largestExecTime.nanoseconds)
-      name: "chronos_largest_exec_time_total",
+    chronos_largest_exec_time_max.set(largestMaxExecTime.nanoseconds)
      value: largestExecTime.metricValue(),
      timestamp: timestampMillis,
    ))
-    prometheusMetrics[@[]].add(Metric(
+  proc collect*(self: AsyncProfilerInfo): void =
      name: "chronos_largest_exec_time_max",
      value: largestMaxExecTime.metricValue(),
      timestamp: timestampMillis,
    ))
  method collect*(self: AsyncProfilerInfo): Metrics =
    let now = times.getTime().toMilliseconds()
    var prometheusMetrics = Metrics()
    prometheusMetrics[@[]] = newSeq[Metric]()
    var currentMetrics = self.
      perfSampler().
      pairs.
@ -132,14 +141,25 @@ when defined(metrics):
        order = SortOrder.Descending
      )
-    # otherwise the compiler keeps complaining of a phantom KeyError
+    self.collectOutlierMetrics(currentMetrics, now)
-    {.cast(raises:[]).}:
+    self.collectSlowestProcs(currentMetrics, now, self.k)
      self.collectOutlierMetrics(currentMetrics, prometheusMetrics, now)
      self.collectSlowestProcs(currentMetrics, prometheusMetrics, now, self.k)
-    prometheusMetrics
+  proc resetMetric(gauge: Gauge): void =
    # We try to be as conservative as possible and not write directly to
    # internal state. We do need to read from it, though.
    for labelValues in gauge.metrics.keys:
      gauge.set(0.int64, labelValues = labelValues)
-  var asyncProfilerInfo* {.global.} = AsyncProfilerInfo.newCollector(
+  proc reset*(self: AsyncProfilerInfo): void =
-    "async_profiler_info", "Async profiler info",
+    resetMetric(chronos_exec_time_total)
-    perfSampler = getFutureSummaryMetrics
+    resetMetric(chronos_run_time_total)
-  )
+    resetMetric(chronos_wall_time_total)
    resetMetric(chronos_call_count_total)
    resetMetric(chronos_single_exec_time_max)
    resetMetric(chronos_largest_exec_time_total)
    resetMetric(chronos_largest_exec_time_max)
  proc run*(self: AsyncProfilerInfo) {.async.} =
    ## Sampling loop: repeatedly calls `collect` on `self` and then sleeps
    ## asynchronously for `self.sampleInterval` milliseconds. The loop has no
    ## exit condition of its own.
    # NOTE(review): no code visible here assigns `sampleInterval`; if it stays
    # at its zero default the loop sleeps for 0 ms between collections —
    # confirm that it is set somewhere before `run` is started.
    while true:
      self.collect()
      await sleepAsync(timer.milliseconds(self.sampleInterval))
--- a/tests/codex/utils/asyncprofiler/testmetricscollector.nim
+++ b/tests/codex/utils/asyncprofiler/testmetricscollector.nim
@ -6,7 +6,7 @@ import codex/utils/asyncprofiler
 import ../../helpers
-checksuite "asyncprofiler metrics collector":
+suite "asyncprofiler metrics collector":
  var locations = @[
    SrcLoc(procedure: "start", file: "discovery.nim", line: 174),
@ -50,36 +50,28 @@ checksuite "asyncprofiler metrics collector":
    ),
  }.toTable
-  test "should keep track of basic worst-case exec time stats":
+  var collector = AsyncProfilerInfo.newCollector(
-    var registry = newRegistry()
+    perfSampler = proc (): MetricsSummary = sample,
-    var collector = AsyncProfilerInfo.newCollector(
+    k = 3,
-      name = "profiling_metrics",
+  )
      help = "Metrics from the profiler",
      registry = registry,
      perfSampler = proc (): MetricsSummary = sample
    )
-    check collector.valueByName("chronos_largest_exec_time_total") == 91660
+  setup:
-    check collector.valueByName("chronos_largest_exec_time_max") == 81660
+    collector.reset()
    collector.collect()
  test "should keep track of basic worst-case exec time stats":
    check chronos_largest_exec_time_total.value == 91660
    check chronos_largest_exec_time_max.value == 81660
  test "should create labeled series for the k slowest procs in terms of totalExecTime":
-    var registry = newRegistry()
+    check chronos_exec_time_total.value(
    var collector = AsyncProfilerInfo.newCollector(
      name = "profiling_metrics",
      help = "Metrics from the profiler",
      registry = registry,
      k = 3,
      perfSampler = proc (): MetricsSummary = sample
    )
    check collector.valueByName("chronos_exec_time_total",
      labelValues = @["start", "discovery.nim", "192"]) == 91660
-    check collector.valueByName("chronos_exec_time_total",
+    check chronos_exec_time_total.value(
      labelValues = @["start", "discovery.nim", "174"]) == 90062
-    check collector.valueByName("chronos_exec_time_total",
+    check chronos_exec_time_total.value(
      labelValues = @["update", "sqliteds.nim", "107"]) == 60645
    # This is out of the top-k slowest, so should not have been recorded.
    expect system.KeyError:
-      discard collector.valueByName("chronos_exec_time_total",
+      discard chronos_exec_time_total.value(
        labelValues = @["query", "manager.nim", "323"])