Prometheus & Grafana refactoring

- moved "process_dashboard.nim" to "tools/"
- README: made Witti the documented testnet and added instructions for getting metrics out of the local node
- moved Prometheus config file generation into its own script
- the static Grafana dashboard definition now covers all nodes, using a variable; only the remote testnet dashboards need to be dynamically generated
- "launch_local_testnet.sh" no longer needs a "--grafana" option
2020-06-10 17:21:32 +02:00 · 2020-06-10 17:21:32 +02:00 · e2025c5752
parent e7febc2e2b
commit e2025c5752
13 changed files with 333 additions and 297 deletions
--- a/.gitignore
+++ b/.gitignore
@ -33,6 +33,8 @@ build/

 /local_testnet_data*/

+# Prometheus db
+/data
 # Grafana dashboards
 /docker/*.json

--- a/Makefile
+++ b/Makefile
@ -38,7 +38,7 @@ TOOLS_DIRS := \
 	ncli \
 	nbench \
 	research \
-	tests/simulation
+	tools
 TOOLS_CSV := $(subst $(SPACE),$(COMMA),$(TOOLS))

 .PHONY: \
--- a/README.md
+++ b/README.md
@ -102,16 +102,29 @@ apt install build-essential git libpcre3-dev

 Nimbus connects to any of the testnets published in the [eth2-clients/eth2-testnets repo](https://github.com/eth2-clients/eth2-testnets/tree/master/nimbus).

-Once the [prerequisites](#prerequisites) are installed you can connect to testnet0 with the following commands:
+Once the [prerequisites](#prerequisites) are installed you can connect to the [Witti testnet](https://github.com/goerli/witti) with the following commands:

 ```bash
 git clone https://github.com/status-im/nim-beacon-chain
 cd nim-beacon-chain
-make testnet0        # This will build Nimbus and all other dependencies
-                     # and connect you to testnet0
+make witti           # This will build Nimbus and all other dependencies
+                     # and connect you to Witti
 ```

-The testnets are restarted once per week, usually on Monday evenings (UTC)) and integrate the changes for the past week.
+### Getting metrics from a local testnet client
+
+```bash
+# The primitive HTTP server that serves the metrics is considered insecure, so it has to be enabled explicitly:
+make NIMFLAGS="-d:insecure" witti
+```
+
+You can now see the raw metrics at http://127.0.0.1:8008/metrics, but they are not very useful in this form, so let's feed them to a Prometheus instance:
+
+```bash
+prometheus --config.file=build/data/shared_witti/prometheus.yml
+```
+
+For some pretty pictures, get [Grafana](https://grafana.com/) up and running, then import the dashboard definition from "grafana/beacon\_nodes\_Grafana\_dashboard.json".

 ## Interop (for other Eth2 clients)

@ -178,8 +191,8 @@ The [generic instructions from the Nimbus repo](https://github.com/status-im/nim
 Specific steps:

 ```bash
-# This will generate the Prometheus config and the Grafana dashboard on the fly,
-# based on the number of nodes (which you can control by passing something like NODES=6 to `make`).
+# This will generate the Prometheus config on the fly, based on the number of
+# nodes (which you can control by passing something like NODES=6 to `make`).
 # The `-d:insecure` flag starts an HTTP server from which the Prometheus daemon will pull the metrics.
 make VALIDATORS=192 NODES=6 USER_NODES=0 NIMFLAGS="-d:insecure" eth2_network_simulation

@ -188,7 +201,7 @@ cd tests/simulation/prometheus
 prometheus
 ```

-The dashboard you need to import in Grafana is "tests/simulation/beacon-chain-sim-all-nodes-Grafana-dashboard.json".
+The dashboard you need to import in Grafana is "grafana/beacon\_nodes\_Grafana\_dashboard.json".

 ![monitoring dashboard](./media/monitoring.png)

--- a/tests/simulation/beacon-chain-sim-node0-Grafana-dashboard.json
+++ b/tests/simulation/beacon-chain-sim-node0-Grafana-dashboard.json
@ -101,27 +101,27 @@
      "steppedLine": false,
      "targets": [
        {
-          "expr": "rate(process_cpu_seconds_total{node=\"0\"}[2s]) * 100",
+          "expr": "rate(process_cpu_seconds_total{node=\"${node}\"}[2s]) * 100",
          "legendFormat": "CPU usage %",
          "refId": "A"
        },
        {
-          "expr": "process_open_fds{node=\"0\"}",
+          "expr": "process_open_fds{node=\"${node}\"}",
          "legendFormat": "open file descriptors",
          "refId": "C"
        },
        {
-          "expr": "process_resident_memory_bytes{node=\"0\"}",
+          "expr": "process_resident_memory_bytes{node=\"${node}\"}",
          "legendFormat": "RSS",
          "refId": "D"
        },
        {
-          "expr": "nim_gc_mem_bytes{node=\"0\"}",
+          "expr": "nim_gc_mem_bytes{node=\"${node}\"}",
          "legendFormat": "Nim GC mem total",
          "refId": "F"
        },
        {
-          "expr": "nim_gc_mem_occupied_bytes{node=\"0\"}",
+          "expr": "nim_gc_mem_occupied_bytes{node=\"${node}\"}",
          "legendFormat": "Nim GC mem used",
          "refId": "G"
        }
@ -130,7 +130,7 @@
      "timeFrom": null,
      "timeRegions": [],
      "timeShift": null,
-      "title": "resources #0",
+      "title": "resources #${node}",
      "tooltip": {
        "shared": true,
        "sort": 0,
@ -210,12 +210,12 @@
      "steppedLine": false,
      "targets": [
        {
-          "expr": "libp2p_open_bufferstream{node=\"0\"}",
+          "expr": "libp2p_open_bufferstream{node=\"${node}\"}",
          "legendFormat": "BufferStream",
          "refId": "A"
        },
        {
-          "expr": "libp2p_open_connection{node=\"0\"}",
+          "expr": "libp2p_open_connection{node=\"${node}\"}",
          "legendFormat": "Connection",
          "refId": "B"
        }
@ -224,7 +224,7 @@
      "timeFrom": null,
      "timeRegions": [],
      "timeShift": null,
-      "title": "open streams #0",
+      "title": "open streams #${node}",
      "tooltip": {
        "shared": true,
        "sort": 0,
@ -304,13 +304,13 @@
      "steppedLine": false,
      "targets": [
        {
-          "expr": "beacon_current_validators{node=\"0\"}",
+          "expr": "beacon_current_validators{node=\"${node}\"}",
          "interval": "",
          "legendFormat": "current validators",
          "refId": "A"
        },
        {
-          "expr": "beacon_current_live_validators{node=\"0\"}",
+          "expr": "beacon_current_live_validators{node=\"${node}\"}",
          "interval": "",
          "legendFormat": "current live validators",
          "refId": "B"
@ -320,7 +320,7 @@
      "timeFrom": null,
      "timeRegions": [],
      "timeShift": null,
-      "title": "validators #0",
+      "title": "validators #${node}",
      "tooltip": {
        "shared": true,
        "sort": 0,
@ -405,7 +405,7 @@
      "steppedLine": false,
      "targets": [
        {
-          "expr": "nim_gc_heap_instance_occupied_bytes{node=\"0\"}",
+          "expr": "nim_gc_heap_instance_occupied_bytes{node=\"${node}\"}",
          "interval": "",
          "legendFormat": "{{type_name}}",
          "refId": "A"
@ -415,7 +415,7 @@
      "timeFrom": null,
      "timeRegions": [],
      "timeShift": null,
-      "title": "GC heap objects #0",
+      "title": "GC heap objects #${node}",
      "tooltip": {
        "shared": true,
        "sort": 0,
@ -493,7 +493,7 @@
      "steppedLine": false,
      "targets": [
        {
-          "expr": "beacon_state_data_cache_hits_total{node=\"0\"} * 100 / (beacon_state_data_cache_hits_total{node=\"0\"} + beacon_state_data_cache_misses_total{node=\"0\"})",
+          "expr": "beacon_state_data_cache_hits_total{node=\"${node}\"} * 100 / (beacon_state_data_cache_hits_total{node=\"${node}\"} + beacon_state_data_cache_misses_total{node=\"${node}\"})",
          "interval": "",
          "legendFormat": "cache hit rate",
          "refId": "A"
@ -503,7 +503,7 @@
      "timeFrom": null,
      "timeRegions": [],
      "timeShift": null,
-      "title": "pool.cachedStates #0",
+      "title": "pool.cachedStates #${node}",
      "tooltip": {
        "shared": true,
        "sort": 0,
@ -587,7 +587,7 @@
      "steppedLine": false,
      "targets": [
        {
-          "expr": "sqlite3_memory_used_bytes{node=\"0\"}",
+          "expr": "sqlite3_memory_used_bytes{node=\"${node}\"}",
          "interval": "",
          "legendFormat": "Memory used",
          "refId": "A"
@ -597,7 +597,7 @@
      "timeFrom": null,
      "timeRegions": [],
      "timeShift": null,
-      "title": "SQLite3 #0",
+      "title": "SQLite3 #${node}",
      "tooltip": {
        "shared": true,
        "sort": 0,
@ -698,14 +698,14 @@
      "tableColumn": "",
      "targets": [
        {
-          "expr": "process_resident_memory_bytes{node=\"0\"}",
+          "expr": "process_resident_memory_bytes{node=\"${node}\"}",
          "refId": "A"
        }
      ],
      "thresholds": "",
      "timeFrom": null,
      "timeShift": null,
-      "title": "RSS mem #0",
+      "title": "RSS mem #${node}",
      "type": "singlestat",
      "valueFontSize": "80%",
      "valueMaps": [
@ -781,14 +781,14 @@
      "tableColumn": "",
      "targets": [
        {
-          "expr": "rate(process_cpu_seconds_total{node=\"0\"}[2s]) * 100",
+          "expr": "rate(process_cpu_seconds_total{node=\"${node}\"}[2s]) * 100",
          "refId": "A"
        }
      ],
      "thresholds": "",
      "timeFrom": null,
      "timeShift": null,
-      "title": "CPU usage #0",
+      "title": "CPU usage #${node}",
      "type": "singlestat",
      "valueFontSize": "80%",
      "valueMaps": [
@ -864,7 +864,7 @@
      "tableColumn": "",
      "targets": [
        {
-          "expr": "beacon_slot{node=\"0\"}",
+          "expr": "beacon_slot{node=\"${node}\"}",
          "interval": "",
          "legendFormat": "",
          "refId": "A"
@ -873,7 +873,7 @@
      "thresholds": "",
      "timeFrom": null,
      "timeShift": null,
-      "title": "current slot #0",
+      "title": "current slot #${node}",
      "type": "singlestat",
      "valueFontSize": "80%",
      "valueMaps": [
@ -1034,14 +1034,14 @@
      "tableColumn": "",
      "targets": [
        {
-          "expr": "beacon_attestations_received_total{node=\"0\"}",
+          "expr": "beacon_attestations_received_total{node=\"${node}\"}",
          "refId": "A"
        }
      ],
      "thresholds": "",
      "timeFrom": null,
      "timeShift": null,
-      "title": "att'ns recv'd #0",
+      "title": "att'ns recv'd #${node}",
      "type": "singlestat",
      "valueFontSize": "80%",
      "valueMaps": [
@ -1097,13 +1097,13 @@
      "steppedLine": false,
      "targets": [
        {
-          "expr": "rate(beacon_blocks_received_total{node=\"0\"}[4s]) * 3",
+          "expr": "rate(beacon_blocks_received_total{node=\"${node}\"}[4s]) * 3",
          "interval": "",
          "legendFormat": "received",
          "refId": "B"
        },
        {
-          "expr": "rate(beacon_blocks_proposed_total{node=\"0\"}[4s]) * 3",
+          "expr": "rate(beacon_blocks_proposed_total{node=\"${node}\"}[4s]) * 3",
          "interval": "",
          "legendFormat": "proposed",
          "refId": "A"
@ -1113,7 +1113,7 @@
      "timeFrom": null,
      "timeRegions": [],
      "timeShift": null,
-      "title": "blocks #0",
+      "title": "blocks #${node}",
      "tooltip": {
        "shared": true,
        "sort": 0,
@ -1213,7 +1213,7 @@
      "tableColumn": "",
      "targets": [
        {
-          "expr": "beacon_current_epoch{node=\"0\"}",
+          "expr": "beacon_current_epoch{node=\"${node}\"}",
          "interval": "",
          "legendFormat": "",
          "refId": "A"
@ -1222,7 +1222,7 @@
      "thresholds": "",
      "timeFrom": null,
      "timeShift": null,
-      "title": "current epoch #0",
+      "title": "current epoch #${node}",
      "type": "singlestat",
      "valueFontSize": "80%",
      "valueMaps": [
@ -1297,7 +1297,7 @@
      "tableColumn": "",
      "targets": [
        {
-          "expr": "beacon_current_justified_epoch{node=\"0\"}",
+          "expr": "beacon_current_justified_epoch{node=\"${node}\"}",
          "interval": "",
          "legendFormat": "",
          "refId": "A"
@ -1306,7 +1306,7 @@
      "thresholds": "",
      "timeFrom": null,
      "timeShift": null,
-      "title": "current justified epoch #0",
+      "title": "current justified epoch #${node}",
      "type": "singlestat",
      "valueFontSize": "80%",
      "valueMaps": [
@ -1382,7 +1382,7 @@
      "tableColumn": "",
      "targets": [
        {
-          "expr": "time() - process_start_time_seconds{node=\"0\"}",
+          "expr": "time() - process_start_time_seconds{node=\"${node}\"}",
          "interval": "",
          "legendFormat": "",
          "refId": "A"
@ -1391,7 +1391,7 @@
      "thresholds": "",
      "timeFrom": null,
      "timeShift": null,
-      "title": "runtime #0",
+      "title": "runtime #${node}",
      "type": "singlestat",
      "valueFontSize": "80%",
      "valueMaps": [
@ -1467,14 +1467,14 @@
      "tableColumn": "",
      "targets": [
        {
-          "expr": "libp2p_peers{node=\"0\"}",
+          "expr": "libp2p_peers{node=\"${node}\"}",
          "refId": "A"
        }
      ],
      "thresholds": "",
      "timeFrom": null,
      "timeShift": null,
-      "title": "peers #0",
+      "title": "peers #${node}",
      "type": "singlestat",
      "valueFontSize": "80%",
      "valueMaps": [
@ -1549,7 +1549,7 @@
      "tableColumn": "",
      "targets": [
        {
-          "expr": "beacon_finalized_epoch{node=\"0\"}",
+          "expr": "beacon_finalized_epoch{node=\"${node}\"}",
          "interval": "",
          "legendFormat": "",
          "refId": "A"
@ -1558,7 +1558,7 @@
      "thresholds": "",
      "timeFrom": null,
      "timeShift": null,
-      "title": "last finalized epoch #0",
+      "title": "last finalized epoch #${node}",
      "type": "singlestat",
      "valueFontSize": "80%",
      "valueMaps": [
@ -1611,13 +1611,13 @@
      "steppedLine": false,
      "targets": [
        {
-          "expr": "rate(beacon_attestations_received_total{node=\"0\"}[4s]) * 3",
+          "expr": "rate(beacon_attestations_received_total{node=\"${node}\"}[4s]) * 3",
          "interval": "",
          "legendFormat": "received",
          "refId": "A"
        },
        {
-          "expr": "rate(beacon_attestations_sent_total{node=\"0\"}[4s]) * 3",
+          "expr": "rate(beacon_attestations_sent_total{node=\"${node}\"}[4s]) * 3",
          "interval": "",
          "legendFormat": "sent",
          "refId": "B"
@ -1627,7 +1627,7 @@
      "timeFrom": null,
      "timeRegions": [],
      "timeShift": null,
-      "title": "attestations #0",
+      "title": "attestations #${node}",
      "tooltip": {
        "shared": true,
        "sort": 0,
@ -1697,7 +1697,7 @@
      "reverseYBuckets": false,
      "targets": [
        {
-          "expr": "rate(beacon_attestation_received_seconds_from_slot_start_bucket{node=\"0\"}[4s]) * 3",
+          "expr": "rate(beacon_attestation_received_seconds_from_slot_start_bucket{node=\"${node}\"}[4s]) * 3",
          "format": "heatmap",
          "instant": false,
          "interval": "",
@ -1708,7 +1708,7 @@
      ],
      "timeFrom": null,
      "timeShift": null,
-      "title": "received attestation delay (s) #0",
+      "title": "received attestation delay (s) #${node}",
      "tooltip": {
        "show": true,
        "showHistogram": false
@ -1738,7 +1738,35 @@
  "style": "dark",
  "tags": [],
  "templating": {
-    "list": []
+    "list": [
+      {
+        "allValue": null,
+        "current": {
+          "tags": [],
+          "text": "0",
+          "value": "0"
+        },
+        "datasource": "Prometheus",
+        "definition": "label_values(process_virtual_memory_bytes,node)",
+        "hide": 0,
+        "includeAll": false,
+        "index": -1,
+        "label": null,
+        "multi": false,
+        "name": "node",
+        "options": [],
+        "query": "label_values(process_virtual_memory_bytes,node)",
+        "refresh": 1,
+        "regex": "",
+        "skipUrlSync": false,
+        "sort": 0,
+        "tagValuesQuery": "",
+        "tags": [],
+        "tagsQuery": "",
+        "type": "query",
+        "useTags": false
+      }
+    ]
  },
  "time": {
    "from": "now-15m",
@ -1759,10 +1787,10 @@
    ]
  },
  "timezone": "",
-  "title": "beacon chain sim (node0)",
-  "uid": "pgeNfj2Wz2",
+  "title": "NBC local testnet/sim (all nodes)",
+  "uid": "pgeNfj2Wz2a",
  "variables": {
    "list": []
  },
  "version": 38
-}
+}
--- a/media/monitoring.png
+++ b/media/monitoring.png
--- a/scripts/connect_to_testnet.nims
+++ b/scripts/connect_to_testnet.nims
@ -112,6 +112,9 @@ cli do (skipGoerliKey {.
      rmDir dataDir

  cd rootDir
+
+  exec &"""./scripts/make_prometheus_config.sh --nodes 1 --base-metrics-port 8008 --config-file "{dataDir}/prometheus.yml""""
+
  exec &"""nim c {nimFlags} -d:"const_preset={preset}" -o:"{beaconNodeBinary}" beacon_chain/beacon_node.nim"""

  proc execIgnoringExitCode(s: string) =
--- a/scripts/launch_local_testnet.sh
+++ b/scripts/launch_local_testnet.sh
@ -24,7 +24,7 @@ if [ ${PIPESTATUS[0]} != 4 ]; then
 fi

 OPTS="ht:n:d:"
-LONGOPTS="help,testnet:,nodes:,data-dir:,disable-htop,log-level:,grafana,base-port:,base-metrics-port:"
+LONGOPTS="help,testnet:,nodes:,data-dir:,disable-htop,log-level:,base-port:,base-metrics-port:"

 # default values
 TESTNET="1"
@ -32,7 +32,6 @@ NUM_NODES="10"
 DATA_DIR="local_testnet_data"
 USE_HTOP="1"
 LOG_LEVEL="DEBUG"
-ENABLE_GRAFANA="0"
 BASE_PORT="9000"
 BASE_METRICS_PORT="8008"

@ -51,7 +50,6 @@ CI run: $(basename $0) --disable-htop -- --verify-finalization --stop-at-epoch=5
      --base-metrics-port	bootstrap node's metrics server port (default: ${BASE_METRICS_PORT})
      --disable-htop		don't use "htop" to see the beacon_node processes
      --log-level		set the log level (default: ${LOG_LEVEL})
-      --grafana			generate Grafana dashboards (and Prometheus config file)
 EOF
 }

@ -89,10 +87,6 @@ while true; do
 			LOG_LEVEL="$2"
 			shift 2
 			;;
-		--grafana)
-			ENABLE_GRAFANA="1"
-			shift
-			;;
 		--base-port)
 			BASE_PORT="$2"
 			shift 2
@ -137,7 +131,7 @@ else
 fi

 NETWORK_NIM_FLAGS=$(scripts/load-testnet-nim-flags.sh ${NETWORK})
-$MAKE -j2 LOG_LEVEL="${LOG_LEVEL}" NIMFLAGS="-d:insecure -d:testnet_servers_image ${NETWORK_NIM_FLAGS}" beacon_node process_dashboard
+$MAKE LOG_LEVEL="${LOG_LEVEL}" NIMFLAGS="-d:insecure -d:testnet_servers_image ${NETWORK_NIM_FLAGS}" beacon_node

 ./build/beacon_node makeDeposits \
 	--quickstart-deposits=${QUICKSTART_VALIDATORS} \
@ -157,29 +151,10 @@ BOOTSTRAP_IP="127.0.0.1"
 	--bootstrap-port=${BASE_PORT} \
 	--genesis-offset=30 # Delay in seconds

-if [[ "$ENABLE_GRAFANA" == "1" ]]; then
-	# Prometheus config
-	cat > "${DATA_DIR}/prometheus.yml" <<EOF
-global:
-  scrape_interval: 1s
-
-scrape_configs:
-  - job_name: "nimbus"
-    static_configs:
-EOF
-	for NUM_NODE in $(seq 0 $(( ${NUM_NODES} - 1 ))); do
-		cat >> "${DATA_DIR}/prometheus.yml" <<EOF
-      - targets: ['127.0.0.1:$(( BASE_METRICS_PORT + NUM_NODE ))']
-        labels:
-          node: '$NUM_NODE'
-EOF
-	done
-
-	# use the exported Grafana dashboard for a single node to create one for all nodes
-	./build/process_dashboard \
-	  --in="tests/simulation/beacon-chain-sim-node0-Grafana-dashboard.json" \
-	  --out="${DATA_DIR}/local-testnet-all-nodes-Grafana-dashboard.json"
-fi
+./scripts/make_prometheus_config.sh \
+	--nodes ${NUM_NODES} \
+	--base-metrics-port ${BASE_METRICS_PORT} \
+	--config-file "${DATA_DIR}/prometheus.yml"

 # Kill child processes on Ctrl-C/SIGTERM/exit, passing the PID of this shell
 # instance as the parent and the target process name as a pattern to the
--- a/scripts/make_prometheus_config.sh
+++ b/scripts/make_prometheus_config.sh
@ -0,0 +1,92 @@
+#!/bin/bash
+
+# Copyright (c) 2020 Status Research & Development GmbH. Licensed under
+# either of:
+# - Apache License, version 2.0
+# - MIT license
+# at your option. This file may not be copied, modified, or distributed except
+# according to those terms.
+
+set -e
+
+####################
+# argument parsing #
+####################
+! getopt --test > /dev/null # the leading "!" keeps "set -e" from aborting on the non-zero exit status
+if [ ${PIPESTATUS[0]} != 4 ]; then # the enhanced getopt reports exit status 4 for "--test"
+	echo '`getopt --test` failed in this environment.'
+	exit 1 # the long options used below cannot be parsed without it
+fi
+
+OPTS="hn:" # "-n <count>" is the short form of "--nodes"; the option loop already has a "-n|--nodes)" branch
+LONGOPTS="help,nodes:,base-metrics-port:,config-file:"
+
+# default values, used for every option that is not given on the command line
+NUM_NODES="10"
+BASE_METRICS_PORT="8008"
+CONFIG_FILE="prometheus.yml"
+
+print_help() { # prints the usage text to stdout, expanding the variables as they are at call time
+	cat <<EOF
+Usage: $(basename "$0") --nodes ${NUM_NODES} --base-metrics-port ${BASE_METRICS_PORT} --config-file "${CONFIG_FILE}"
+
+  -h, --help			this help message
+  -n, --nodes			number of nodes to generate scrape targets for (default: ${NUM_NODES})
+      --base-metrics-port	bootstrap node's metrics server port (default: ${BASE_METRICS_PORT})
+      --config-file		write the Prometheus config to this file (default: ${CONFIG_FILE})
+EOF
+}
+
+! PARSED=$(getopt --options=${OPTS} --longoptions=${LONGOPTS} --name "$0" -- "$@")
+if [ ${PIPESTATUS[0]} != 0 ]; then
+	# getopt has already complained about the wrong arguments on stderr
+	exit 1
+fi
+
+# install getopt's normalized, shell-quoted output as the positional parameters
+eval set -- "$PARSED"
+while true; do
+	case "$1" in
+		-h|--help)
+			print_help
+			exit
+			;;
+		-n|--nodes) # "-n" is only reachable when OPTS declares "n:"
+			NUM_NODES="$2"
+			shift 2
+			;;
+		--base-metrics-port)
+			BASE_METRICS_PORT="$2"
+			shift 2
+			;;
+		--config-file)
+			CONFIG_FILE="$2"
+			shift 2
+			;;
+		--) # end-of-options marker emitted by getopt
+			shift
+			break
+			;;
+		*) # anything else means the option tables and this loop are out of sync
+			echo "argument parsing error"
+			print_help
+			exit 1
+	esac
+done
+mkdir -p "$(dirname "${CONFIG_FILE}")" # the target directory may not exist yet; the callers do not create it
+cat > "${CONFIG_FILE}" <<EOF # static header; ">" truncates any previous config file
+global:
+  scrape_interval: 1s
+
+scrape_configs:
+  - job_name: "nimbus"
+    static_configs:
+EOF
+for NUM_NODE in $(seq 0 $(( NUM_NODES - 1 ))); do # one target per node: node N is scraped on BASE_METRICS_PORT + N
+	cat >> "${CONFIG_FILE}" <<EOF
+      - targets: ['127.0.0.1:$(( BASE_METRICS_PORT + NUM_NODE ))']
+        labels:
+          node: '$NUM_NODE'
+EOF
+done
+
--- a/scripts/reset_testnet.sh
+++ b/scripts/reset_testnet.sh
@ -69,9 +69,8 @@ make -j2 NIMFLAGS="-d:insecure -d:testnet_servers_image ${NETWORK_NIM_FLAGS}" be
 echo "Generating Grafana dashboards for remote testnet servers"
 for testnet in 0 1; do
  ./build/process_dashboard \
-    --in="tests/simulation/beacon-chain-sim-node0-Grafana-dashboard.json" \
-    --out="docker/beacon-chain-sim-remote-testnet${testnet}-Grafana-dashboard.json" \
-    --type="remote" \
+    --in="grafana/beacon_nodes_Grafana_dashboard.json" \
+    --out="docker/remote_testnet${testnet}_Grafana_dashboard.json" \
    --testnet="${testnet}"
 done

--- a/tests/simulation/.gitignore
+++ b/tests/simulation/.gitignore
@ -1,5 +1,4 @@
 data/
 validators/
 prometheus/
-beacon-chain-sim-all-nodes-Grafana-dashboard.json

--- a/tests/simulation/process_dashboard.nim
+++ b/tests/simulation/process_dashboard.nim
@ -1,180 +0,0 @@
-import json, parseopt, strutils
-
-# usage: process_dashboard --in=node0_dashboard.json --out=all_nodes_dashboard.json --type=local --testnet=0
-type
-  OutputType = enum
-    local
-    remote
-var
-  p = initOptParser()
-  inputFileName, outputFilename: string
-  outputType = OutputType.local
-  testnet = 0
-
-while true:
-  p.next()
-  case p.kind:
-    of cmdEnd:
-      break
-    of cmdShortOption, cmdLongOption:
-      if p.key == "in":
-        inputFileName = p.val
-      elif p.key == "out":
-        outputFileName = p.val
-      elif p.key == "type":
-        outputType = parseEnum[OutputType](p.val)
-      elif p.key == "testnet":
-        testnet = p.val.parseInt()
-      else:
-        echo "unsupported argument: ", p.key
-    of cmdArgument:
-      echo "unsupported argument: ", p.key
-
-var
-  inputData = parseFile(inputFileName)
-  panels = inputData["panels"].copy()
-  outputData = inputData
-
-#############
-# variables #
-#############
-
-case outputType:
-  of OutputType.local:
-    outputData["templating"]["list"] = parseJson("""
-      [
-        {
-          "allValue": null,
-          "current": {
-            "tags": [],
-            "text": "0",
-            "value": "0"
-          },
-          "datasource": "Prometheus",
-          "definition": "label_values(process_virtual_memory_bytes,node)",
-          "hide": 0,
-          "includeAll": false,
-          "index": -1,
-          "label": null,
-          "multi": false,
-          "name": "node",
-          "options": [],
-          "query": "label_values(process_virtual_memory_bytes,node)",
-          "refresh": 1,
-          "regex": "",
-          "skipUrlSync": false,
-          "sort": 0,
-          "tagValuesQuery": "",
-          "tags": [],
-          "tagsQuery": "",
-          "type": "query",
-          "useTags": false
-        }
-      ]
-    """)
-  of OutputType.remote:
-    outputData["templating"]["list"] = parseJson("""
-      [
-        {
-          "allValue": null,
-          "current": {
-            "tags": [],
-            "text": "beacon-node-testnet""" & $testnet & """-1",
-            "value": "beacon-node-testnet""" & $testnet & """-1"
-          },
-          "datasource": "master-01.do-ams3.metrics.hq",
-          "definition": "label_values(process_virtual_memory_bytes{job=\"beacon-node-metrics\"},container)",
-          "hide": 0,
-          "includeAll": false,
-          "index": -1,
-          "label": null,
-          "multi": false,
-          "name": "container",
-          "options": [],
-          "query": "label_values(process_virtual_memory_bytes{job=\"beacon-node-metrics\"},container)",
-          "refresh": 1,
-          "regex": "/.*testnet""" & $testnet & """.*/",
-          "skipUrlSync": false,
-          "sort": 1,
-          "tagValuesQuery": "",
-          "tags": [],
-          "tagsQuery": "",
-          "type": "query",
-          "useTags": false
-        },
-        {
-          "allValue": null,
-          "current": {
-            "tags": [],
-            "text": "master-01.aws-eu-central-1a.nimbus.test",
-            "value": "master-01.aws-eu-central-1a.nimbus.test"
-          },
-          "datasource": "master-01.do-ams3.metrics.hq",
-          "definition": "label_values(process_virtual_memory_bytes{job=\"beacon-node-metrics\"},instance)",
-          "hide": 0,
-          "includeAll": false,
-          "index": -1,
-          "label": null,
-          "multi": false,
-          "name": "instance",
-          "options": [],
-          "query": "label_values(process_virtual_memory_bytes{job=\"beacon-node-metrics\"},instance)",
-          "refresh": 1,
-          "regex": "",
-          "skipUrlSync": false,
-          "sort": 1,
-          "tagValuesQuery": "",
-          "tags": [],
-          "tagsQuery": "",
-          "type": "query",
-          "useTags": false
-        }
-      ]
-    """)
-
-##########
-# panels #
-##########
-
-outputData["panels"] = %* []
-for panel in panels.mitems:
-  case outputType:
-    of OutputType.local:
-      panel["title"] = %* replace(panel["title"].getStr(), "#0", "#${node}")
-    of OutputType.remote:
-      panel["title"] = %* replace(panel["title"].getStr(), "#0", "#${container}@${instance}")
-      panel["datasource"] = newJNull()
-  if panel.hasKey("targets"):
-    var targets = panel["targets"]
-    for target in targets.mitems:
-      case outputType:
-        of OutputType.local:
-          target["expr"] = %* replace(target["expr"].getStr(), "{node=\"0\"}", "{node=\"${node}\"}")
-        of OutputType.remote:
-          # The remote Prometheus instance polls once per minute, so the
-          # minimum rate() interval is 2 minutes.
-          target["expr"] = %* multiReplace(target["expr"].getStr(),
-                                ("{node=\"0\"}", "{job=\"beacon-node-metrics\",container=\"${container}\",instance=\"${instance}\"}"),
-                                ("sum(beacon_attestations_sent_total)", "sum(beacon_attestations_sent_total{job=\"beacon-node-metrics\",container=~\"beacon-node-testnet" & $testnet & "-.\"})"),
-                                ("[2s]", "[2m]"),
-                                ("[4s]) * 3", "[2m]) * 120"))
-  outputData["panels"].add(panel)
-
-########
-# misc #
-########
-
-case outputType:
-  of OutputType.local:
-    outputData["title"] = %* "NBC local testnet/sim (all nodes)"
-    outputData["uid"] = %* (outputData["uid"].getStr() & "a")
-  of OutputType.remote:
-    outputData["title"] = %* ("Nimbus testnet" & $testnet)
-    outputData["uid"] = %* (outputData["uid"].getStr() & $testnet)
-    # our annotations only work with a 1s resolution
-    var annotation = outputData["annotations"]["list"][0].copy()
-    annotation["datasource"] = %* "-- Grafana --"
-    outputData["annotations"]["list"] = %* [annotation]
-
-writeFile(outputFilename, pretty(outputData))
-
--- a/tests/simulation/start.sh
+++ b/tests/simulation/start.sh
@ -52,25 +52,10 @@ type "$GANACHE" &>/dev/null || { echo $GANACHE is missing; USE_GANACHE="no"; }
 USE_PROMETHEUS="${LAUNCH_PROMETHEUS:-no}"
 type "$PROMETHEUS" &>/dev/null || { echo $PROMETHEUS is missing; USE_PROMETHEUS="no"; }

-# Prometheus config (continued inside the loop)
-mkdir -p "${METRICS_DIR}"
-cat > "${METRICS_DIR}/prometheus.yml" <<EOF
-global:
-  scrape_interval: 1s
-
-scrape_configs:
-  - job_name: "nimbus"
-    static_configs:
-EOF
-
-for i in $(seq $MASTER_NODE -1 $TOTAL_USER_NODES); do
-  # Prometheus config
-  cat >> "${METRICS_DIR}/prometheus.yml" <<EOF
-      - targets: ['127.0.0.1:$(( BASE_METRICS_PORT + i ))']
-        labels:
-          node: '$i'
-EOF
-done
+./scripts/make_prometheus_config.sh \
+	--nodes ${TOTAL_NODES} \
+	--base-metrics-port ${BASE_METRICS_PORT} \
+	--config-file "${METRICS_DIR}/prometheus.yml"

 COMMANDS=()

@ -110,7 +95,7 @@ if [[ "$USE_TMUX" != "no" ]]; then
  $TMUX select-window -t "${TMUX_SESSION_NAME}:sim"
 fi

-$MAKE -j3 --no-print-directory NIMFLAGS="$CUSTOM_NIMFLAGS $DEFS" LOG_LEVEL="${LOG_LEVEL:-DEBUG}" beacon_node validator_client process_dashboard deposit_contract
+$MAKE -j3 --no-print-directory NIMFLAGS="$CUSTOM_NIMFLAGS $DEFS" LOG_LEVEL="${LOG_LEVEL:-DEBUG}" beacon_node validator_client deposit_contract

 if [ ! -f "${LAST_VALIDATOR}" ]; then
  if [ "$WEB3_ARG" != "" ]; then
@ -164,12 +149,6 @@ if [ -f "${MASTER_NODE_ADDRESS_FILE}" ]; then
  rm "${MASTER_NODE_ADDRESS_FILE}"
 fi

-# use the exported Grafana dashboard for a single node to create one for all nodes
-echo Creating grafana dashboards...
-./build/process_dashboard \
-  --in="${SIM_ROOT}/beacon-chain-sim-node0-Grafana-dashboard.json" \
-  --out="${SIM_ROOT}/beacon-chain-sim-all-nodes-Grafana-dashboard.json"
-
 # Kill child processes on Ctrl-C/SIGTERM/exit, passing the PID of this shell
 # instance as the parent and the target process name as a pattern to the
 # "pkill" command.
--- a/tools/process_dashboard.nim
+++ b/tools/process_dashboard.nim
@ -0,0 +1,126 @@
+import json, parseopt, strutils
+
+# Turns the local dashboard into a remote testnet one. Usage: process_dashboard --in=local_dashboard.json --out=remote_dashboard.json --testnet=0
+var
+  p = initOptParser()
+  inputFileName, outputFilename: string
+  testnet = 0 # testnet number; stays 0 when "--testnet" is not given
+
+while true:
+  p.next()
+  case p.kind:
+    of cmdEnd:
+      break
+    of cmdShortOption, cmdLongOption:
+      if p.key == "in":
+        inputFileName = p.val
+      elif p.key == "out":
+        outputFileName = p.val # same variable as "outputFilename": Nim identifiers are style-insensitive
+      elif p.key == "testnet":
+        testnet = p.val.parseInt()
+      else:
+        echo "unsupported argument: ", p.key # only reported; parsing continues
+    of cmdArgument:
+      echo "unsupported argument: ", p.key # positional arguments are reported the same way
+
+var
+  inputData = parseFile(inputFileName) # parsed JSON tree of the dashboard given with "--in"
+  panels = inputData["panels"].copy() # deep copy, iterated later while the output's "panels" list is rebuilt
+  outputData = inputData # JsonNode is a ref type: this is an alias of inputData, not a second tree
+
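+######################################################################
+# variables: "container" and "instance" replace the local "node" one #
+######################################################################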
+
+outputData["templating"]["list"] = parseJson("""
+  [
+    {
+      "allValue": null,
+      "current": {
+        "tags": [],
+        "text": "beacon-node-testnet""" & $testnet & """-1",
+        "value": "beacon-node-testnet""" & $testnet & """-1"
+      },
+      "datasource": "master-01.do-ams3.metrics.hq",
+      "definition": "label_values(process_virtual_memory_bytes{job=\"beacon-node-metrics\"},container)",
+      "hide": 0,
+      "includeAll": false,
+      "index": -1,
+      "label": null,
+      "multi": false,
+      "name": "container",
+      "options": [],
+      "query": "label_values(process_virtual_memory_bytes{job=\"beacon-node-metrics\"},container)",
+      "refresh": 1,
+      "regex": "/.*testnet""" & $testnet & """.*/",
+      "skipUrlSync": false,
+      "sort": 1,
+      "tagValuesQuery": "",
+      "tags": [],
+      "tagsQuery": "",
+      "type": "query",
+      "useTags": false
+    },
+    {
+      "allValue": null,
+      "current": {
+        "tags": [],
+        "text": "master-01.aws-eu-central-1a.nimbus.test",
+        "value": "master-01.aws-eu-central-1a.nimbus.test"
+      },
+      "datasource": "master-01.do-ams3.metrics.hq",
+      "definition": "label_values(process_virtual_memory_bytes{job=\"beacon-node-metrics\"},instance)",
+      "hide": 0,
+      "includeAll": false,
+      "index": -1,
+      "label": null,
+      "multi": false,
+      "name": "instance",
+      "options": [],
+      "query": "label_values(process_virtual_memory_bytes{job=\"beacon-node-metrics\"},instance)",
+      "refresh": 1,
+      "regex": "",
+      "skipUrlSync": false,
+      "sort": 1,
+      "tagValuesQuery": "",
+      "tags": [],
+      "tagsQuery": "",
+      "type": "query",
+      "useTags": false
+    }
+  ]
+""")
+
+##########
+# panels #
+##########
+
+outputData["panels"] = %* [] # start from an empty list and re-add every rewritten panel from the copy
+for panel in panels.mitems:
+  panel["title"] = %* replace(panel["title"].getStr(), "${node}", "${container}@${instance}")
+  panel["datasource"] = newJNull() # NOTE(review): presumably makes the panel use the default datasource - confirm
+  if panel.hasKey("targets"):
+    var targets = panel["targets"] # ref copy: the edits below change the panel's own targets
+    for target in targets.mitems:
+      # The remote Prometheus instance polls once per minute, so the
+      # minimum rate() interval is 2 minutes.
+      target["expr"] = %* multiReplace(target["expr"].getStr(),
+                            ("{node=\"${node}\"}", "{job=\"beacon-node-metrics\",container=\"${container}\",instance=\"${instance}\"}"),
+                            ("sum(beacon_attestations_sent_total)", "sum(beacon_attestations_sent_total{job=\"beacon-node-metrics\",container=~\"beacon-node-testnet" & $testnet & "-.\"})"),
+                            ("[2s]", "[2m]"),
+                            ("[4s]) * 3", "[2m]) * 120"))
+  outputData["panels"].add(panel)
+
+########
+# misc #
+########
+
+outputData["title"] = %* ("Nimbus testnet" & $testnet)
+outputData["uid"] = %* (outputData["uid"].getStr()[0..^2] & $testnet) # drops the input uid's last character (assumes a one-character suffix, as in "pgeNfj2Wz2a") and appends the testnet number
+# our annotations only work with a 1s resolution
+var annotation = outputData["annotations"]["list"][0].copy() # only the first annotation of the input is kept
+annotation["datasource"] = %* "-- Grafana --"
+outputData["annotations"]["list"] = %* [annotation]
+
+writeFile(outputFilename, pretty(outputData)) # pretty-printed JSON, written to the "--out" path
+