nim-chroprof/chroprof/collector.nim

## Metrics collector which allows exporting Chronos profiling metrics to
## Prometheus.

import std/algorithm
import std/enumerate
import std/sequtils
import std/tables
import std/times

import chronos/timer
import metrics

import ./api

when defined(metrics):
  type
    ChronosProfilerInfo* = ref object of RootObj
      ## State of a profiler metrics collector: where samples come from, how
      ## often they may be taken, and how many procs are exported per sample.
      # Source of profiler totals; called by `collect` on every sample.
      sampler: MetricsSampler
      # Minimum time that must elapse between two non-forced samples.
      sampleInterval: times.Duration
      # Time source used to decide whether a new sample is due.
      clock: Clock
      # Upper bound on the number of procs exported per sample.
      k: int
      # Set to `true` by `newCollector`; not read anywhere in the visible code.
      init: bool
      # Clock reading taken by the most recent sample.
      lastSample: Time
      # Number of samples actually taken (throttled calls are not counted).
      collections*: uint

    # Callback returning the profiler's current totals. `collect` iterates the
    # result with `pairs`.
    MetricsSampler = proc(): MetricsTotals {.raises: [].}

    # Callback returning the current time.
    Clock = proc(): Time {.raises: [].}

    # One (location, aggregated metrics) entry as yielded by `pairs` over the
    # sampler's result.
    FutureMetrics = (FutureType, AggregateMetrics)

  # Label names shared by every profiler gauge; the matching label values are
  # the proc name, source file and line of the profiled async proc.
  const locationLabels = ["proc", "file", "line"]

  declarePublicGauge(
    chronos_exec_time_total,
    "total time in which this proc actively occupied the event loop thread",
    labels = locationLabels,
  )

  # NOTE(review): this help text describes the metric in terms of itself; it
  # probably means "chronos_exec_time_total of this proc plus ..." - confirm
  # before rewording. Only the missing space between the two literals is fixed.
  declarePublicGauge(
    chronos_exec_time_with_children_total,
    "chronos_exec_time_with_children_total of this proc plus of all " &
      "its children (procs that this proc called and awaited for)",
    labels = locationLabels,
  )

  # The two literals are joined with `&`, so the first one must end in a space.
  declarePublicGauge(
    chronos_wall_time_total,
    "the amount of time elapsed from when the async proc was started to when " &
      "it completed",
    labels = locationLabels,
  )

  declarePublicGauge(
    chronos_call_count_total,
    "the total number of times this async proc was called and completed",
    labels = locationLabels,
  )

  # Per-proc Statistics
  declarePublicGauge(
    chronos_single_exec_time_max,
    "the maximum execution time for a single call of this proc",
    labels = locationLabels,
  )

  proc threadId(): int =
    ## Returns the id of the calling thread, or 0 when `getThreadId` is not
    ## available in this build.
    # `declared` checks that the symbol exists. `defined(getThreadId)` only
    # checks for a `-d:getThreadId` compile-time flag, so it always selected
    # the `else` branch and made every thread comparison trivially true.
    when declared(getThreadId):
      getThreadId()
    else:
      0

  # Keeps track of the thread initializing the module. This is the only thread
  # that will be allowed to interact with the metrics collector.
  let moduleInitThread = threadId()

  proc newCollector*(
      ChronosProfilerInfo: typedesc,
      sampler: MetricsSampler,
      clock: Clock,
      sampleInterval: times.Duration,
      k: int = 10,
  ): ChronosProfilerInfo =
    ## Builds a collector that pulls profiler totals from `sampler`, reads the
    ## time from `clock`, samples at most once per `sampleInterval` unless
    ## forced, and exports at most `k` procs per sample.
    ##
    ## The new collector has `init` set and `lastSample` at `low(Time)`.
    result = ChronosProfilerInfo(
      lastSample: low(Time),
      init: true,
      sampleInterval: sampleInterval,
      k: k,
      clock: clock,
      sampler: sampler,
    )

  proc collectSlowestProcs(
      self: ChronosProfilerInfo,
      profilerMetrics: seq[FutureMetrics],
      timestampMillis: int64,
      k: int,
  ): void =
    ## Publishes the leading entries of `profilerMetrics` to the profiler
    ## gauges, labelled by proc name, file and line. Entries are taken in the
    ## order given and publishing stops once `k` of them have been exported.
    ##
    ## `self` and `timestampMillis` are accepted but not used by this proc.
    for idx, entry in profilerMetrics:
      # `idx` counts the entries already exported.
      if idx == k:
        break

      let
        location = entry[0]
        stats = entry[1]
        labels = @[$(location.procedure), $(location.file), $(location.line)]

      chronos_exec_time_total.set(stats.execTime.nanoseconds, labelValues = labels)

      chronos_exec_time_with_children_total.set(
        stats.execTimeWithChildren.nanoseconds, labelValues = labels
      )

      chronos_wall_time_total.set(
        stats.wallClockTime.nanoseconds, labelValues = labels
      )

      chronos_single_exec_time_max.set(
        stats.execTimeMax.nanoseconds, labelValues = labels
      )

      chronos_call_count_total.set(stats.callCount.int64, labelValues = labels)

  proc collect*(self: ChronosProfilerInfo, force: bool = false): void =
    ## Samples the profiler and publishes the procs with the largest
    ## `execTimeWithChildren` to the profiler gauges (at most `self.k`).
    ##
    ## Does nothing when less than `sampleInterval` has elapsed since the last
    ## sample, unless `force` is `true`. Each sample that is actually taken
    ## increments `collections` and updates `lastSample`.
    ##
    ## Asserts that the caller runs on the thread that initialized this module.
    # Calling this method from the wrong thread has happened a lot in the past,
    # so this makes sure we're not doing anything funny.
    assert threadId() == moduleInitThread,
      "You cannot call collect() from" &
        " a thread other than the one that initialized the metrics collector module"

    let currentTime = self.clock()
    if not force and (currentTime - self.lastSample < self.sampleInterval):
      return

    self.collections += 1

    let sampled = self.sampler().pairs.toSeq

    # We don't scoop metrics with 0 exec time as we have a limited number of
    # prometheus slots, and those are less likely to be useful in debugging
    # Chronos performance issues.
    let nonZero = sampled.filter(
      proc(pair: FutureMetrics): bool =
        pair[1].execTimeWithChildren.nanoseconds > 0
    )

    # Slowest first, so that the export keeps the most expensive procs.
    let currentMetrics = nonZero.sorted(
      proc(a, b: FutureMetrics): int =
        cmp(a[1].execTimeWithChildren, b[1].execTimeWithChildren)
      ,
      order = SortOrder.Descending,
    )

    self.collectSlowestProcs(
      currentMetrics, currentTime.toMilliseconds(), self.k
    )

    self.lastSample = currentTime

  proc resetMetric(gauge: Gauge): void =
    ## Sets every metric entry currently held by `gauge` to zero.
    # The label values are read from the gauge's internal `metrics` storage,
    # but all writes go through the public `set` so that internal state is
    # never modified directly.
    for series in gauge.metrics:
      for sample in series:
        gauge.set(0.int64, labelValues = sample.labelValues)

  proc reset*(self: ChronosProfilerInfo): void =
    ## Zeroes every entry of all five profiler gauges. The gauges are
    ## module-level, so `self` is not consulted.
    let gauges = [
      chronos_exec_time_total,
      chronos_exec_time_with_children_total,
      chronos_wall_time_total,
      chronos_call_count_total,
      chronos_single_exec_time_max,
    ]
    for gauge in gauges:
      resetMetric(gauge)

  # Module-level collector instance; assigned by `enableProfilerMetrics` and
  # `nil` until then.
  var asyncProfilerInfo* {.global.}: ChronosProfilerInfo

  proc enableProfilerMetrics*(k: int) =
    ## Creates the module-level collector and hooks it into the profiler.
    ##
    ## The collector samples `getMetrics` against the wall clock (`getTime`),
    ## at most once every 5 seconds, and exports at most `k` procs per sample.
    ## A collection is attempted every time a profiled future completes.
    ##
    ## Asserts that the caller runs on the thread that initialized this module.
    assert threadId() == moduleInitThread,
      "You cannot call enableProfilerMetrics() from a thread other than" &
        " the one that initialized the metrics collector module."

    asyncProfilerInfo = ChronosProfilerInfo.newCollector(
      sampler = getMetrics,
      k = k,
      # We want to collect metrics every 5 seconds.
      sampleInterval = initDuration(seconds = 5),
      clock = proc(): Time =
        getTime()
      ,
    )

    enableProfiling(
      proc(e: Event) {.nimcall, gcsafe, raises: [].} =
        # The callback must be `gcsafe` but touches the global collector, so
        # the check is overridden for this block.
        {.cast(gcsafe).}:
          # `collect` applies its own interval check, so calling it on every
          # completion does not sample more often than `sampleInterval`.
          if e.newState == ExtendedFutureState.Completed:
            asyncProfilerInfo.collect()
    )