Adding metrics (#203)

* adding inflight discovery metric

* adding metrics instructions and default dashboard

* spelling fixes
This commit is contained in:
Dmitriy Ryajov 2022-08-23 10:11:21 -06:00 committed by GitHub
parent 3d823dcbc6
commit 4bc701652f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 1288 additions and 1 deletions

View File

@ -12,6 +12,7 @@ import std/sequtils
import pkg/chronos import pkg/chronos
import pkg/chronicles import pkg/chronicles
import pkg/libp2p import pkg/libp2p
import pkg/metrics
import ../protobuf/presence import ../protobuf/presence
@ -27,6 +28,8 @@ import ./pendingblocks
logScope: logScope:
topics = "codex discovery engine" topics = "codex discovery engine"
declareGauge(codex_inflight_discovery, "inflight discovery requests")
const const
DefaultConcurrentDiscRequests = 10 DefaultConcurrentDiscRequests = 10
DefaultConcurrentAdvertRequests = 10 DefaultConcurrentAdvertRequests = 10
@ -104,12 +107,15 @@ proc advertiseTaskLoop(b: DiscoveryEngine) {.async.} =
continue continue
try: try:
trace "Advertising block", cid = $cid
let request = b.discovery.provide(cid) let request = b.discovery.provide(cid)
b.inFlightAdvReqs[cid] = request b.inFlightAdvReqs[cid] = request
codex_inflight_discovery.set(b.inFlightAdvReqs.len.int64)
trace "Advertising block", cid = $cid, inflight = b.inFlightAdvReqs.len
await request await request
finally: finally:
b.inFlightAdvReqs.del(cid) b.inFlightAdvReqs.del(cid)
codex_inflight_discovery.set(b.inFlightAdvReqs.len.int64)
trace "Advertised block", cid = $cid, inflight = b.inFlightAdvReqs.len
except CatchableError as exc: except CatchableError as exc:
trace "Exception in advertise task runner", exc = exc.msg trace "Exception in advertise task runner", exc = exc.msg
@ -141,6 +147,7 @@ proc discoveryTaskLoop(b: DiscoveryEngine) {.async.} =
.wait(DefaultDiscoveryTimeout) .wait(DefaultDiscoveryTimeout)
b.inFlightDiscReqs[cid] = request b.inFlightDiscReqs[cid] = request
codex_inflight_discovery.set(b.inFlightAdvReqs.len.int64)
let let
peers = await request peers = await request
@ -149,6 +156,7 @@ proc discoveryTaskLoop(b: DiscoveryEngine) {.async.} =
await allFinished(peers.mapIt( b.network.dialPeer(it.data)))) await allFinished(peers.mapIt( b.network.dialPeer(it.data))))
finally: finally:
b.inFlightDiscReqs.del(cid) b.inFlightDiscReqs.del(cid)
codex_inflight_discovery.set(b.inFlightAdvReqs.len.int64)
except CatchableError as exc: except CatchableError as exc:
trace "Exception in discovery task runner", exc = exc.msg trace "Exception in discovery task runner", exc = exc.msg

43
metrics/README.md Normal file
View File

@ -0,0 +1,43 @@
# Codex Metrics and Dashboard
> This readme should help you to get started with collecting and visualizing metrics exposed by the Codex process.
## Metrics
Metrics are collected using the [nim-metrics](https://github.com/status-im/nim-metrics) backend and should be enabled with the `--metrics` flag. By default, metrics are exposed on the `localhost:8008/metrics` endpoint.
Use the `--metrics-address` and `--metrics-port` flags to adjust the address and port as necessary.
## General guidelines for adding new metrics
Metrics are useful to monitor the health of the process and should aid in identifying and debugging potential issues that would be hard to notice otherwise.
All Codex metrics should be prefixed with the `codex_` prefix to be able to differentiate them from metrics exposed by other subsystems. For example, libp2p metrics are generally prefixed with the `libp2p_` prefix.
Metrics can be added on an as-needed basis; however, keep in mind the potential overhead they might introduce. In particular, be careful with labels, as they will generate as many metrics as there are labels for a specific collector. If a metric or a set of metrics is expensive, it is usually advisable to put it behind a compile-time flag.
## Prometheus and Grafana
The exposed metrics can be aggregated by the [Prometheus](https://prometheus.io/) monitoring system and additionally graphed through [Grafana](https://grafana.com/).
This directory contains both the default `prometheus.yml` config file and a basic `codex-grafana-dashboard.json` file that can be augmented with additional panels and metrics on an as-needed basis.
Additionally, please consider installing the [node_exporter](https://github.com/prometheus/node_exporter) agent to collect machine-level metrics such as overall memory, process, networking, disk I/O, etc.
### Using the Grafana dashboard
To use the dashboard, open Grafana and head to `Dashboards`, then hit the `Import` button at the top right, next to `New Dashboard` and `New Folder`.
![](assets/main.png)
This will take you to the import page.
![](assets/import.png)
Use any one of the presented methods (upload the JSON file, load it from a URL, or copy-paste the JSON into the text box) to upload the `codex-grafana-dashboard.json` file.
Finally, you'll be presented with the following screen where you can change the name and the `UID` of the imported dashboard. This is only necessary if there is already a dashboard with the same name or `UID`.
![](./assets/imported.png)
Once imported, the dashboard should show up on the main dashboard page.

BIN
metrics/assets/import.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 84 KiB

BIN
metrics/assets/imported.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 128 KiB

BIN
metrics/assets/main.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 76 KiB

File diff suppressed because it is too large Load Diff

10
metrics/prometheus.yml Normal file
View File

@ -0,0 +1,10 @@
# Default Prometheus configuration for scraping a locally running Codex node.
global:
  # How often every target below is scraped.
  scrape_interval: 12s

scrape_configs:
  # Codex process metrics; 8008 matches the default metrics port described in
  # the README (adjust if `--metrics-address`/`--metrics-port` are changed).
  - job_name: "codex"
    static_configs:
      - targets: ['127.0.0.1:8008']

  # Machine-level metrics from the optional node_exporter agent.
  - job_name: "node_exporter"
    static_configs:
      - targets: ['127.0.0.1:9100']