nim-codex/codex/utils/asyncprofiler/metricscollector.nim

import std/algorithm
import std/enumerate
import std/sequtils
import std/times

import asyncprofiler
import metrics

when defined(metrics):
  type
    # Supplies the current profiler summary; must not raise.
    PerfSampler = proc (): MetricsSummary {.raises: [].}

    # Supplies the current time; must not raise.
    Clock = proc (): Time {.raises: [].}

    # One profiled source location paired with its aggregated metrics.
    ProfilerMetric = (SrcLoc, OverallMetrics)

    # Collector state: where samples come from, how often they may be taken,
    # and bookkeeping about past collections.
    AsyncProfilerInfo* = ref object of RootObj
      # Called by `collect` to obtain the metrics to publish.
      perfSampler: PerfSampler
      # Minimum time between two non-forced collections.
      sampleInterval: times.Duration
      # Time source used to decide whether a collection is due.
      clock: Clock
      # Upper bound on the number of procs exported per collection.
      k: int
      # Set to true by `newCollector`; not read anywhere in this file.
      init: bool
      # Clock reading taken at the most recent collection.
      lastSample: Time
      # Number of collections performed so far.
      collections*: uint

  # Label names attached to every per-proc gauge. The matching label values
  # (procedure, file, line) are built in `collectSlowestProcs`.
  const locationLabels = ["proc", "file", "line"]

  # Per-proc Metrics
  #
  # These gauges are labeled by source location (see `locationLabels`) and are
  # written by `collectSlowestProcs`. The three time gauges receive the
  # `nanoseconds` value of the corresponding duration.
  declarePublicGauge(
    chronos_exec_time_total,
    "total time in which this proc actively occupied the event loop thread",
    labels = locationLabels,
  )

  # The continuation string starts with a space so that the concatenated help
  # text reads "(procs that ..." rather than "(procsthat ...".
  declarePublicGauge(
    chronos_run_time_total,
    "chronos_exec_time_total of this proc plus of all its children (procs" &
    " that this proc called and awaited for)",
    labels = locationLabels,
  )

  # Same here: the leading space keeps "to when it completed" readable.
  declarePublicGauge(
    chronos_wall_time_total,
    "the amount of time elapsed from when the async proc was started to when" &
    " it completed",
    labels = locationLabels,
  )

  declarePublicGauge(
    chronos_call_count_total,
    "the total number of times this async proc was called and completed",
    labels = locationLabels,
  )

  # Per-proc Statistics
  #
  # Labeled by source location like the per-proc metrics; written by
  # `collectSlowestProcs` from `maxSingleTime.nanoseconds`.
  declarePublicGauge(
    chronos_single_exec_time_max,
    "the maximum execution time for a single call of this proc",
    labels = locationLabels,
  )

  # Global Statistics
  #
  # Declared without labels; both are written by `collectOutlierMetrics` using
  # the `nanoseconds` value of the largest duration it found.
  declarePublicGauge(
    chronos_largest_exec_time_total,
    "the largest chronos_exec_time_total of all procs",
  )

  declarePublicGauge(
    chronos_largest_exec_time_max,
    "the largest chronos_single_exec_time_max of all procs",
  )

  # Keeps track of the thread initializing the module. This is the only thread
  # that will be allowed to interact with the metrics collector: `collect` and
  # `initDefault` both assert that they are running on this thread.
  let moduleInitThread = getThreadId()

  proc newCollector*(
    AsyncProfilerInfo: typedesc,
    perfSampler: PerfSampler,
    clock: Clock,
    sampleInterval: times.Duration,
    k: int = 10,
  ): AsyncProfilerInfo =
    ## Builds a collector that reads metrics through `perfSampler`, reads the
    ## time through `clock`, waits at least `sampleInterval` between
    ## non-forced collections and exports at most `k` procs per collection.
    ##
    ## `lastSample` starts at `low(Time)`; `collections` is left at its
    ## default of zero.
    AsyncProfilerInfo(
      init: true,
      k: k,
      sampleInterval: sampleInterval,
      lastSample: low(Time),
      perfSampler: perfSampler,
      clock: clock,
    )

  proc collectSlowestProcs(
    self: AsyncProfilerInfo,
    profilerMetrics: seq[ProfilerMetric],
    timestampMillis: int64,
    k: int,
  ): void =
    ## Publishes the per-proc gauges for the leading entries of
    ## `profilerMetrics`, walking the sequence in the order given and stopping
    ## once `k` entries have been exported.
    ##
    ## `self` and `timestampMillis` are accepted but not used by the body.
    for idx, entry in profilerMetrics:
      if idx == k:
        break

      let
        loc = entry[0]
        stats = entry[1]
        # Label values, in the same order as the label names
        # ("proc", "file", "line").
        labels = @[$(loc.procedure), $(loc.file), $(loc.line)]

      chronos_exec_time_total.set(
        stats.totalExecTime.nanoseconds,
        labelValues = labels,
      )

      chronos_run_time_total.set(
        stats.totalRunTime.nanoseconds,
        labelValues = labels,
      )

      chronos_wall_time_total.set(
        stats.totalWallTime.nanoseconds,
        labelValues = labels,
      )

      chronos_single_exec_time_max.set(
        stats.maxSingleTime.nanoseconds,
        labelValues = labels,
      )

      chronos_call_count_total.set(stats.count, labelValues = labels)

  proc collectOutlierMetrics(
    self: AsyncProfilerInfo,
    profilerMetrics: seq[ProfilerMetric],
    timestampMillis: int64,
  ): void =
    ## Scans every entry of `profilerMetrics` for the largest total exec time
    ## and the largest single-call exec time, then publishes both to the
    ## unlabeled global gauges. Meant to help spot outliers.
    ##
    ## With an empty input both gauges receive the `nanoseconds` value of
    ## `low(timer.Duration)`. `self` and `timestampMillis` are not used.
    var
      topTotalExec = low(timer.Duration)
      topSingleExec = low(timer.Duration)

    for entry in profilerMetrics:
      let stats = entry[1]
      if topSingleExec < stats.maxSingleTime:
        topSingleExec = stats.maxSingleTime
      if topTotalExec < stats.totalExecTime:
        topTotalExec = stats.totalExecTime

    chronos_largest_exec_time_total.set(topTotalExec.nanoseconds)
    chronos_largest_exec_time_max.set(topSingleExec.nanoseconds)

  proc collect*(self: AsyncProfilerInfo, force: bool = false): void =
    ## Samples the profiler and publishes the result to the gauges.
    ##
    ## Returns without doing anything when less than `sampleInterval` has
    ## passed since the previous collection, unless `force` is true. Otherwise
    ## it bumps `collections`, gathers the sampler output sorted by total exec
    ## time (largest first, zero-exec-time entries dropped), updates the
    ## outlier gauges and the top-`k` per-proc gauges, and records the
    ## collection time in `lastSample`.
    ##
    ## Must be called from the thread that initialized this module; this is
    ## enforced with an `assert`.

    # Calling this method from the wrong thread has happened a lot in the past,
    # so this makes sure we're not doing anything funny.
    assert getThreadId() == moduleInitThread, "You cannot call collect() " &
      "from a thread other than the one that initialized the " &
      "metricscollector module"

    let now = self.clock()
    if not force and (now - self.lastSample < self.sampleInterval):
      return

    self.collections += 1
    let currentMetrics = self.
      perfSampler().
      pairs.
      toSeq.
      map(
        proc (pair: (ptr SrcLoc, OverallMetrics)): ProfilerMetric =
          (pair[0][], pair[1])
      ).
      # We don't scoop metrics with 0 exec time as we have a limited number of
      # prometheus slots, and those are less likely to be useful in debugging
      # Chronos performance issues.
      filter(
        proc (pair: ProfilerMetric): bool =
          pair[1].totalExecTime.nanoseconds > 0
      ).
      sorted(
        proc (a, b: ProfilerMetric): int =
          cmp(a[1].totalExecTime, b[1].totalExecTime),
        order = SortOrder.Descending
      )

    # Both helpers receive the same timestamp, so convert it only once.
    let timestampMillis = now.toMilliseconds()
    self.collectOutlierMetrics(currentMetrics, timestampMillis)
    self.collectSlowestProcs(currentMetrics, timestampMillis, self.k)

    self.lastSample = now

  proc resetMetric(gauge: Gauge): void =
    ## Sets the value stored under every label combination currently present
    ## in `gauge.metrics` back to zero.
    ##
    ## The gauge's internal table is only read to enumerate the label
    ## combinations; the values themselves are written through `set` so that
    ## internal state is not modified directly.
    for labels in keys(gauge.metrics):
      gauge.set(0'i64, labelValues = labels)

  proc reset*(self: AsyncProfilerInfo): void =
    ## Zeroes all values recorded so far on every gauge declared by this
    ## module, per-proc and global alike. `self` is not used by the body.
    for gauge in [
      chronos_exec_time_total,
      chronos_run_time_total,
      chronos_wall_time_total,
      chronos_call_count_total,
      chronos_single_exec_time_max,
      chronos_largest_exec_time_total,
      chronos_largest_exec_time_max,
    ]:
      resetMetric(gauge)

  # Module-wide collector instance. It is a `ref`, so it stays nil until
  # `initDefault` assigns it.
  var asyncProfilerInfo* {.global.}: AsyncProfilerInfo

  proc initDefault*(AsyncProfilerInfo: typedesc, k: int) =
    ## Creates the global `asyncProfilerInfo` collector and hooks it up to the
    ## profiler.
    ##
    ## The collector samples through `profiler.getFutureSummaryMetrics()`,
    ## reads the time with `getTime()`, uses a 5 second sample interval and
    ## exports at most `k` procs per collection. A change callback is then
    ## registered on `profiler` that calls `collect()` on the global collector.
    ##
    ## Must be called from the thread that initialized this module; this is
    ## enforced with an `assert`.
    assert getThreadId() == moduleInitThread, "You cannot call " &
      "initDefault() from a thread other than the one that initialized the " &
      "metricscollector module."

    asyncProfilerInfo = AsyncProfilerInfo.newCollector(
      perfSampler = proc (): MetricsSummary = profiler.getFutureSummaryMetrics(),
      k = k,
      # We want to collect metrics every 5 seconds.
      sampleInterval = initDuration(seconds = 5),
      clock = proc (): Time = getTime(),
    )

    # `collect` applies the sample interval itself, so it is safe for the
    # profiler to invoke this on every change.
    profiler.setChangeCallback(proc (): void = asyncProfilerInfo.collect())
add labeled top-k slowest async procs to prometheus collector 2023-11-06 15:10:13 -03:00			`import std/algorithm`
			`import std/enumerate`
			`import std/sequtils`
add basic prometheus profiling metrics tracker 2023-11-03 19:04:37 -03:00			`import std/times`

			`import asyncprofiler`
			`import metrics`

			`when defined(metrics):`
			`type`
modify metrics collector so it uses standard gauges 2023-11-08 12:25:19 -03:00			`AsyncProfilerInfo* = ref object of RootObj`
add basic prometheus profiling metrics tracker 2023-11-03 19:04:37 -03:00			`perfSampler: PerfSampler`
add callback to eliminate the need for an async timer in metric updates 2023-11-08 14:29:30 -03:00			`sampleInterval: times.Duration`
			`clock: Clock`
add labeled top-k slowest async procs to prometheus collector 2023-11-06 15:10:13 -03:00			`k: int`
add callback to eliminate the need for an async timer in metric updates 2023-11-08 14:29:30 -03:00			`init: bool`
			`lastSample: Time`
			`collections*: uint`
add basic prometheus profiling metrics tracker 2023-11-03 19:04:37 -03:00
			`PerfSampler = proc (): MetricsSummary {.raises: [].}`

revert threadvars and add initialization to collector global instance 2023-11-08 19:53:26 -03:00			`Clock = proc (): Time {.raises: [].}`
add callback to eliminate the need for an async timer in metric updates 2023-11-08 14:29:30 -03:00
add labeled top-k slowest async procs to prometheus collector 2023-11-06 15:10:13 -03:00			`ProfilerMetric = (SrcLoc, OverallMetrics)`

modify metrics collector so it uses standard gauges 2023-11-08 12:25:19 -03:00			`const locationLabels = ["proc", "file", "line"]`

			`# Per-proc Metrics`
			`declarePublicGauge(`
			`chronos_exec_time_total,`
			`"total time in which this proc actively occupied the event loop thread",`
			`labels = locationLabels,`
			`)`

			`declarePublicGauge(`
			`chronos_run_time_total,`
			`"chronos_exec_time_total of this proc plus of all its children (procs" &`
			`"that this proc called and awaited for)",`
			`labels = locationLabels,`
			`)`

			`declarePublicGauge(`
			`chronos_wall_time_total,`
			`"the amount of time elapsed from when the async proc was started to when" &`
			`"it completed",`
			`labels = locationLabels,`
			`)`

			`declarePublicGauge(`
			`chronos_call_count_total,`
			`"the total number of times this async proc was called and completed",`
			`labels = locationLabels,`
			`)`

			`# Per-proc Statistics`
			`declarePublicGauge(`
			`chronos_single_exec_time_max,`
			`"the maximum execution time for a single call of this proc",`
			`labels = locationLabels,`
			`)`

			`# Global Statistics`
			`declarePublicGauge(`
			`chronos_largest_exec_time_total,`
			`"the largest chronos_exec_time_total of all procs",`
			`)`

			`declarePublicGauge(`
			`chronos_largest_exec_time_max,`
			`"the largest chronos_single_exec_time_max of all procs",`
			`)`

add some hacks to allow enabling profiling on specific threads, and guarding against enabling it on multiple 2023-11-09 17:07:11 -03:00			`# Keeps track of the thread initializing the module. This is the only thread`
			`# that will be allowed to interact with the metrics collector.`
add assertions to try to track threading errors 2023-11-09 14:35:50 -03:00			`let moduleInitThread = getThreadId()`

add basic prometheus profiling metrics tracker 2023-11-03 19:04:37 -03:00			`proc newCollector*(`
add tests to main test suite, add global async profiler info collector 2023-11-06 18:44:08 -03:00			`AsyncProfilerInfo: typedesc,`
add basic prometheus profiling metrics tracker 2023-11-03 19:04:37 -03:00			`perfSampler: PerfSampler,`
add callback to eliminate the need for an async timer in metric updates 2023-11-08 14:29:30 -03:00			`clock: Clock,`
			`sampleInterval: times.Duration,`
add labeled top-k slowest async procs to prometheus collector 2023-11-06 15:10:13 -03:00			`k: int = 10,`
add callback to eliminate the need for an async timer in metric updates 2023-11-08 14:29:30 -03:00			`): AsyncProfilerInfo = AsyncProfilerInfo(`
			`perfSampler: perfSampler,`
			`clock: clock,`
			`k: k,`
			`sampleInterval: sampleInterval,`
			`init: true,`
			`lastSample: low(Time),`
			`)`
add basic prometheus profiling metrics tracker 2023-11-03 19:04:37 -03:00
add labeled top-k slowest async procs to prometheus collector 2023-11-06 15:10:13 -03:00			`proc collectSlowestProcs(`
add tests to main test suite, add global async profiler info collector 2023-11-06 18:44:08 -03:00			`self: AsyncProfilerInfo,`
add labeled top-k slowest async procs to prometheus collector 2023-11-06 15:10:13 -03:00			`profilerMetrics: seq[ProfilerMetric],`
			`timestampMillis: int64,`
			`k: int,`
			`): void =`

			`for (i, pair) in enumerate(profilerMetrics):`
			`if i == k:`
			`break`

			`let (location, metrics) = pair`

modify metrics collector so it uses standard gauges 2023-11-08 12:25:19 -03:00			`let locationLabels = @[`
			`$(location.procedure),`
			`$(location.file),`
			`$(location.line),`
			`]`

			`chronos_exec_time_total.set(metrics.totalExecTime.nanoseconds,`
			`labelValues = locationLabels)`

			`chronos_run_time_total.set(metrics.totalRunTime.nanoseconds,`
			`labelValues = locationLabels)`

			`chronos_wall_time_total.set(metrics.totalWallTime.nanoseconds,`
			`labelValues = locationLabels)`

			`chronos_single_exec_time_max.set(metrics.maxSingleTime.nanoseconds,`
			`labelValues = locationLabels)`

			`chronos_call_count_total.set(metrics.count, labelValues = locationLabels)`
add labeled top-k slowest async procs to prometheus collector 2023-11-06 15:10:13 -03:00
			`proc collectOutlierMetrics(`
add tests to main test suite, add global async profiler info collector 2023-11-06 18:44:08 -03:00			`self: AsyncProfilerInfo,`
add labeled top-k slowest async procs to prometheus collector 2023-11-06 15:10:13 -03:00			`profilerMetrics: seq[ProfilerMetric],`
			`timestampMillis: int64,`
			`): void =`
			`## Adds summary metrics for the procs that have the highest exec time`
			`## (which stops the async loop) and the highest max exec time. This can`
			`## help spot outliers.`
add basic prometheus profiling metrics tracker 2023-11-03 19:04:37 -03:00
			`var largestExecTime = low(timer.Duration)`
			`var largestMaxExecTime = low(timer.Duration)`
add labeled top-k slowest async procs to prometheus collector 2023-11-06 15:10:13 -03:00
			`for (_, metric) in profilerMetrics:`
add basic prometheus profiling metrics tracker 2023-11-03 19:04:37 -03:00			`if metric.maxSingleTime > largestMaxExecTime:`
			`largestMaxExecTime = metric.maxSingleTime`
			`if metric.totalExecTime > largestExecTime:`
			`largestExecTime = metric.totalExecTime`

modify metrics collector so it uses standard gauges 2023-11-08 12:25:19 -03:00			`chronos_largest_exec_time_total.set(largestExecTime.nanoseconds)`
			`chronos_largest_exec_time_max.set(largestMaxExecTime.nanoseconds)`
add labeled top-k slowest async procs to prometheus collector 2023-11-06 15:10:13 -03:00
add callback to eliminate the need for an async timer in metric updates 2023-11-08 14:29:30 -03:00			`proc collect*(self: AsyncProfilerInfo, force: bool = false): void =`
add some hacks to allow enabling profiling on specific threads, and guarding against enabling it on multiple 2023-11-09 17:07:11 -03:00			`# Calling this method from the wrong thread has happened a lot in the past,`
			`# so this makes sure we're not doing anything funny.`
			`assert getThreadId() == moduleInitThread, "You cannot call collect() from" &`
			`" a thread other than the one that initialized the metricscolletor module"`
add assertions to try to track threading errors 2023-11-09 14:35:50 -03:00
add callback to eliminate the need for an async timer in metric updates 2023-11-08 14:29:30 -03:00			`let now = self.clock()`
			`if not force and (now - self.lastSample < self.sampleInterval):`
			`return`

			`self.collections += 1`
add labeled top-k slowest async procs to prometheus collector 2023-11-06 15:10:13 -03:00			`var currentMetrics = self.`
			`perfSampler().`
			`pairs.`
			`toSeq.`
			`map(`
			`proc (pair: (ptr SrcLoc, OverallMetrics)): ProfilerMetric =`
			`(pair[0][], pair[1])`
			`).`
add some hacks to allow enabling profiling on specific threads, and guarding against enabling it on multiple 2023-11-09 17:07:11 -03:00			`# We don't scoop metrics with 0 exec time as we have a limited number of`
			`# prometheus slots, and those are less likely to be useful in debugging`
			`# Chronos performance issues.`
			`filter(`
			`proc (pair: ProfilerMetric): bool =`
			`pair[1].totalExecTime.nanoseconds > 0`
			`).`
add labeled top-k slowest async procs to prometheus collector 2023-11-06 15:10:13 -03:00			`sorted(`
			`proc (a, b: ProfilerMetric): int =`
			`cmp(a[1].totalExecTime, b[1].totalExecTime),`
			`order = SortOrder.Descending`
add basic prometheus profiling metrics tracker 2023-11-03 19:04:37 -03:00			`)`

add callback to eliminate the need for an async timer in metric updates 2023-11-08 14:29:30 -03:00			`self.collectOutlierMetrics(currentMetrics, now.toMilliseconds())`
			`self.collectSlowestProcs(currentMetrics, now.toMilliseconds(), self.k)`

			`self.lastSample = now`
modify metrics collector so it uses standard gauges 2023-11-08 12:25:19 -03:00
			`proc resetMetric(gauge: Gauge): void =`
			`# We try to be as conservative as possible and not write directly to`
			`# internal state. We do need to read from it, though.`
			`for labelValues in gauge.metrics.keys:`
			`gauge.set(0.int64, labelValues = labelValues)`

			`proc reset*(self: AsyncProfilerInfo): void =`
			`resetMetric(chronos_exec_time_total)`
			`resetMetric(chronos_run_time_total)`
			`resetMetric(chronos_wall_time_total)`
			`resetMetric(chronos_call_count_total)`
			`resetMetric(chronos_single_exec_time_max)`
			`resetMetric(chronos_largest_exec_time_total)`
			`resetMetric(chronos_largest_exec_time_max)`

allow configuration of profiler output volume from CLI option 2023-11-09 12:42:15 -03:00			`var asyncProfilerInfo* {.global.}: AsyncProfilerInfo`

			`proc initDefault*(AsyncProfilerInfo: typedesc, k: int) =`
add some hacks to allow enabling profiling on specific threads, and guarding against enabling it on multiple 2023-11-09 17:07:11 -03:00			`assert getThreadId() == moduleInitThread, "You cannot call " &`
			`"initDefault() from a thread other than the one that initialized the " &`
			`"metricscolletor module."`
add assertions to try to track threading errors 2023-11-09 14:35:50 -03:00
allow configuration of profiler output volume from CLI option 2023-11-09 12:42:15 -03:00			`asyncProfilerInfo = AsyncProfilerInfo.newCollector(`
refactor asyncprofiler to make it more testable 2023-11-16 12:30:46 -03:00			`perfSampler = proc (): MetricsSummary = profiler.getFutureSummaryMetrics(),`
allow configuration of profiler output volume from CLI option 2023-11-09 12:42:15 -03:00			`k = k,`
add callback to eliminate the need for an async timer in metric updates 2023-11-08 14:29:30 -03:00			`# We want to collect metrics every 5 seconds.`
revert threadvars and add initialization to collector global instance 2023-11-08 19:53:26 -03:00			`sampleInterval = initDuration(seconds = 5),`
			`clock = proc (): Time = getTime(),`
add callback to eliminate the need for an async timer in metric updates 2023-11-08 14:29:30 -03:00			`)`

refactor asyncprofiler to make it more testable 2023-11-16 12:30:46 -03:00			`profiler.setChangeCallback(proc (): void = asyncProfilerInfo.collect())`