chore: Added metrics to liteprotocoltester (#3002)

* Added metrics to tests
Fix liteprotocoltester Dockerfiles to COPY libnegentropy
docker compose with the waku-sim simulation now has a test performance dashboard at localhost:3033

Mention the dashboard in the README

* Update apps/liteprotocoltester/statistics.nim

Co-authored-by: Ivan FB <128452529+Ivansete-status@users.noreply.github.com>

* Indentation fix; more stable discovery of service/bootstrap nodes; defaults preset for TWN

---------

Co-authored-by: Ivan FB <128452529+Ivansete-status@users.noreply.github.com>
NagyZoltanPeter 2024-09-04 08:35:51 +02:00 committed by GitHub
parent 19feb6bd58
commit 8baf627feb
20 changed files with 1240 additions and 6169 deletions

View File

@ -1,4 +1,4 @@
START_PUBLISHING_AFTER=10
START_PUBLISHING_AFTER=45
# optional delay, in seconds, before the SENDER starts publishing
NUM_MESSAGES=0
@ -12,9 +12,9 @@ MIN_MESSAGE_SIZE=15Kb
MAX_MESSAGE_SIZE=145Kb
## for wakusim
PUBSUB=/waku/2/rs/66/0
CONTENT_TOPIC=/tester/2/light-pubsub-test/wakusim
CLUSTER_ID=66
#PUBSUB=/waku/2/rs/66/0
#CONTENT_TOPIC=/tester/2/light-pubsub-test/wakusim
#CLUSTER_ID=66
## for status.prod
#PUBSUB=/waku/2/rs/16/32
@ -22,6 +22,6 @@ CLUSTER_ID=66
#CLUSTER_ID=16
## for TWN
#PUBSUB=/waku/2/rs/1/4
#CONTENT_TOPIC=/tester/2/light-pubsub-test/twn
#CLUSTER_ID=1
PUBSUB=/waku/2/rs/1/4
CONTENT_TOPIC=/tester/2/light-pubsub-test/twn
CLUSTER_ID=1

View File

@ -29,6 +29,8 @@
COPY build/liteprotocoltester /usr/bin/
COPY apps/liteprotocoltester/run_tester_node.sh /usr/bin/
COPY ./libnegentropy.so /usr/lib/
ENTRYPOINT ["/usr/bin/run_tester_node.sh", "/usr/bin/liteprotocoltester"]

View File

@ -51,6 +51,7 @@
# Copy migration scripts for DB upgrades
COPY --from=nim-build /app/migrations/ /app/migrations/
COPY --from=nim-build /app/libnegentropy.so /usr/lib/
ENTRYPOINT ["/usr/bin/liteprotocoltester"]

View File

@ -1,35 +0,0 @@
# TESTING IMAGE --------------------------------------------------------------
## NOTICE: This is a short cut build file for ubuntu users who compiles nwaku in ubuntu distro.
## This is used for faster turnaround time for testing the compiled binary.
## Prerequisites: compiled liteprotocoltester binary in build/ directory
FROM ubuntu:noble AS prod
LABEL maintainer="jakub@status.im"
LABEL source="https://github.com/waku-org/nwaku"
LABEL description="Lite Protocol Tester: Waku light-client"
LABEL commit="unknown"
LABEL version="unknown"
# DevP2P, LibP2P, and JSON RPC ports
EXPOSE 30303 60000 8545
# Referenced in the binary
RUN apt-get update && apt-get install -y --no-install-recommends \
libgcc1 \
libpcre3 \
libpq-dev \
wget \
iproute2 \
&& rm -rf /var/lib/apt/lists/*
# Fix for 'Error loading shared library libpcre.so.3: No such file or directory'
RUN ln -s /usr/lib/libpcre.so /usr/lib/libpcre.so.3
COPY build/liteprotocoltester /usr/bin/
ENTRYPOINT ["/usr/bin/liteprotocoltester"]
# # By default just show help if called without arguments
CMD ["--help"]

View File

@ -96,6 +96,10 @@ docker compose -f docker-compose-on-simularor.yml logs -f receivernode
- Notice there is a configurable wait before the SENDER starts publishing messages, since the service nodes need some time to connect to full nodes from the simulator
- Light clients print a report on their own and the connected service node's network connectivity every 20 seconds.
#### Test monitoring
Navigate to http://localhost:3033 to see the lite-protocol-tester dashboard. Grafana listens on port 3033 and Prometheus on 9099 here to avoid clashing with the monitoring stack that waku-simulator already runs.
### Phase 3
> Run independently on a chosen waku fleet

View File

@ -62,7 +62,7 @@ services:
image: waku.liteprotocoltester:latest
build:
context: ../..
dockerfile: ./apps/liteprotocoltester/Dockerfile.liteprotocoltester.copy
dockerfile: ./apps/liteprotocoltester/Dockerfile.liteprotocoltester
deploy:
replicas: ${NUM_PUBLISHER_NODES:-3}
# ports:
@ -84,13 +84,12 @@ services:
- *rln_env
- *test_running_conditions
volumes:
- ./run_tester_node.sh:/opt/run_tester_node.sh:Z
- ${CERTS_DIR:-./certs}:/etc/letsencrypt/:Z
- ./rln_tree:/etc/rln_tree/:Z
- ./keystore:/keystore:Z
entrypoint: sh
command:
- /opt/run_tester_node.sh
- /usr/bin/run_tester_node.sh
- /usr/bin/liteprotocoltester
- SENDER
- waku-sim
@ -139,7 +138,7 @@ services:
image: waku.liteprotocoltester:latest
build:
context: ../..
dockerfile: ./apps/liteprotocoltester/Dockerfile.liteprotocoltester.copy
dockerfile: ./apps/liteprotocoltester/Dockerfile.liteprotocoltester
deploy:
replicas: ${NUM_RECEIVER_NODES:-1}
# ports:
@ -161,13 +160,12 @@ services:
- *rln_env
- *test_running_conditions
volumes:
- ./run_tester_node.sh:/opt/run_tester_node.sh:Z
- ${CERTS_DIR:-./certs}:/etc/letsencrypt/:Z
- ./rln_tree:/etc/rln_tree/:Z
- ./keystore:/keystore:Z
entrypoint: sh
command:
- /opt/run_tester_node.sh
- /usr/bin/run_tester_node.sh
- /usr/bin/liteprotocoltester
- RECEIVER
- waku-sim
@ -180,34 +178,44 @@ services:
networks:
- waku-simulator_simulation
## We have prometheus and grafana defined in waku-simulator already
# prometheus:
# image: docker.io/prom/prometheus:latest
# volumes:
# - ./monitoring/prometheus-config.yml:/etc/prometheus/prometheus.yml:Z
# command:
# - --config.file=/etc/prometheus/prometheus.yml
# ports:
# - 127.0.0.1:9090:9090
# depends_on:
# - servicenode
# waku-simulator already defines its own prometheus and grafana; this stack runs a second pair on non-clashing ports
prometheus:
image: docker.io/prom/prometheus:latest
volumes:
- ./monitoring/prometheus-config.yml:/etc/prometheus/prometheus.yml:Z
command:
- --config.file=/etc/prometheus/prometheus.yml
- --web.listen-address=:9099
# ports:
# - 127.0.0.1:9090:9090
restart: on-failure:5
depends_on:
- filter-service
- lightpush-service
- publishernode
- receivernode
networks:
- waku-simulator_simulation
# grafana:
# image: docker.io/grafana/grafana:latest
# env_file:
# - ./monitoring/configuration/grafana-plugins.env
# volumes:
# - ./monitoring/configuration/grafana.ini:/etc/grafana/grafana.ini:Z
# - ./monitoring/configuration/dashboards.yaml:/etc/grafana/provisioning/dashboards/dashboards.yaml:Z
# - ./monitoring/configuration/datasources.yaml:/etc/grafana/provisioning/datasources/datasources.yaml:Z
# - ./monitoring/configuration/dashboards:/var/lib/grafana/dashboards/:Z
# - ./monitoring/configuration/customizations/custom-logo.svg:/usr/share/grafana/public/img/grafana_icon.svg:Z
# - ./monitoring/configuration/customizations/custom-logo.svg:/usr/share/grafana/public/img/grafana_typelogo.svg:Z
# - ./monitoring/configuration/customizations/custom-logo.png:/usr/share/grafana/public/img/fav32.png:Z
# ports:
# - 0.0.0.0:3000:3000
# depends_on:
# - prometheus
grafana:
image: docker.io/grafana/grafana:latest
env_file:
- ./monitoring/configuration/grafana-plugins.env
volumes:
- ./monitoring/configuration/grafana.ini:/etc/grafana/grafana.ini:Z
- ./monitoring/configuration/dashboards.yaml:/etc/grafana/provisioning/dashboards/dashboards.yaml:Z
- ./monitoring/configuration/datasources.yaml:/etc/grafana/provisioning/datasources/datasources.yaml:Z
- ./monitoring/configuration/dashboards:/var/lib/grafana/dashboards/:Z
- ./monitoring/configuration/customizations/custom-logo.svg:/usr/share/grafana/public/img/grafana_icon.svg:Z
- ./monitoring/configuration/customizations/custom-logo.svg:/usr/share/grafana/public/img/grafana_typelogo.svg:Z
- ./monitoring/configuration/customizations/custom-logo.png:/usr/share/grafana/public/img/fav32.png:Z
ports:
- 0.0.0.0:3033:3033
restart: on-failure:5
depends_on:
- prometheus
networks:
- waku-simulator_simulation
configs:
cfg_tester_node.toml:

View File

@ -59,7 +59,7 @@ services:
image: waku.liteprotocoltester:latest
build:
context: ../..
dockerfile: ./apps/liteprotocoltester/Dockerfile.liteprotocoltester.copy
dockerfile: ./apps/liteprotocoltester/Dockerfile.liteprotocoltester
ports:
# - 30304:30304/tcp
# - 30304:30304/udp
@ -79,13 +79,12 @@ services:
- *rln_env
- *test_running_conditions
volumes:
- ./run_tester_node.sh:/opt/run_tester_node.sh:Z
- ${CERTS_DIR:-./certs}:/etc/letsencrypt/:Z
- ./rln_tree:/etc/rln_tree/:Z
- ./keystore:/keystore:Z
entrypoint: sh
command:
- /opt/run_tester_node.sh
- /usr/bin/run_tester_node.sh
- /usr/bin/liteprotocoltester
- SENDER
- servicenode
@ -99,7 +98,7 @@ services:
image: waku.liteprotocoltester:latest
build:
context: ../..
dockerfile: ./apps/liteprotocoltester/Dockerfile.liteprotocoltester.copy
dockerfile: ./apps/liteprotocoltester/Dockerfile.liteprotocoltester
ports:
# - 30304:30304/tcp
# - 30304:30304/udp
@ -125,7 +124,7 @@ services:
- ./keystore:/keystore:Z
entrypoint: sh
command:
- /opt/run_tester_node.sh
- /usr/bin/run_tester_node.sh
- /usr/bin/liteprotocoltester
- RECEIVER
- servicenode

View File

@ -17,7 +17,8 @@ import
common/utils/parse_size_units,
],
./tester_config,
./tester_message
./tester_message,
./lpt_metrics
randomize()
@ -141,12 +142,15 @@ proc publishMessages(
pubsubTopic = lightpushPubsubTopic,
hash = msgHash
inc(messagesSent)
lpt_publisher_sent_messages_count.inc()
lpt_publisher_sent_bytes.inc(amount = msgSize.int64)
else:
sentMessages[messagesSent] = (hash: msgHash, relayed: false)
failedToSendCause.mgetOrPut(wlpRes.error, 1).inc()
error "failed to publish message using lightpush",
err = wlpRes.error, hash = msgHash
inc(failedToSendCount)
lpt_publisher_failed_messages_count.inc(labelValues = [wlpRes.error])
await sleepAsync(delayMessages)

View File

@ -96,6 +96,10 @@ when isMainModule:
wakuConf.clusterId = conf.clusterId
## TODO: Depending on the tester needs we might extend here with shards, clusterId, etc...
wakuConf.metricsServer = true
wakuConf.metricsServerAddress = parseIpAddress("0.0.0.0")
wakuConf.metricsServerPort = 8003
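# NOTE: the metrics server settings above now apply to both SENDER and
# RECEIVER roles; they were previously set only further below (see the
# removed lines in the next hunk)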
if conf.testFunc == TesterFunctionality.SENDER:
wakuConf.lightpushnode = conf.serviceNode
else:
@ -108,9 +112,6 @@ when isMainModule:
wakuConf.rest = false
wakuConf.metricsServer = true
wakuConf.metricsServerAddress = parseIpAddress("0.0.0.0")
# NOTE: {.threadvar.} is used to make the global variable GC safe for the closure uses it
# It will always be called from main thread anyway.
# Ref: https://nim-lang.org/docs/manual.html#threads-gc-safety

View File

@ -0,0 +1,30 @@
## Metrics declarations for the lite-protocol-tester publisher and receiver
import metrics
export metrics
declarePublicGauge lpt_receiver_sender_peer_count, "count of sender peers"
declarePublicCounter lpt_receiver_received_messages_count,
"number of messages received per peer", ["peer"]
declarePublicCounter lpt_receiver_received_bytes,
"number of received bytes per peer", ["peer"]
declarePublicGauge lpt_receiver_missing_messages_count,
"number of missing messages per peer", ["peer"]
declarePublicCounter lpt_receiver_duplicate_messages_count,
"number of duplicate messages per peer", ["peer"]
declarePublicGauge lpt_receiver_distinct_duplicate_messages_count,
"number of distinct duplicate messages per peer", ["peer"]
declarePublicCounter lpt_publisher_sent_messages_count, "number of messages published"
declarePublicCounter lpt_publisher_failed_messages_count,
"number of messages failed to publish per failure cause", ["cause"]
declarePublicCounter lpt_publisher_sent_bytes, "number of total bytes sent"
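For orientation, a minimal usage sketch (not part of this diff) showing how these declarations are exercised by the publisher and receiver hunks elsewhere in this change; the peer label value is illustrative:

import ./lpt_metrics

proc exampleUsage() =
  # unlabelled counters: increment by one, or by an explicit amount
  lpt_publisher_sent_messages_count.inc()
  lpt_publisher_sent_bytes.inc(amount = 15_360)
  # labelled series: one time series per label value (here, per sender peer)
  lpt_receiver_received_messages_count.inc(labelValues = ["examplePeer"])
  lpt_receiver_missing_messages_count.set(labelValues = ["examplePeer"], value = 3)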

View File

@ -5,7 +5,7 @@ datasources:
type: prometheus
access: proxy
org_id: 1
url: http://prometheus:9090
url: http://prometheus:9099
is_default: true
version: 1
editable: true

View File

@ -1,9 +1,11 @@
instance_name = nwaku dashboard
instance_name = liteprotocoltester dashboard
;[dashboards.json]
;enabled = true
;path = /home/git/grafana/grafana-dashboards/dashboards
[server]
http_port = 3033
#################################### Auth ##########################
[auth]

View File

@ -1,284 +0,0 @@
pg_replication:
query: "SELECT CASE WHEN NOT pg_is_in_recovery() THEN 0 ELSE GREATEST (0, EXTRACT(EPOCH FROM (now() - pg_last_xact_replay_timestamp()))) END AS lag"
master: true
metrics:
- lag:
usage: "GAUGE"
description: "Replication lag behind master in seconds"
pg_postmaster:
query: "SELECT pg_postmaster_start_time as start_time_seconds from pg_postmaster_start_time()"
master: true
metrics:
- start_time_seconds:
usage: "GAUGE"
description: "Time at which postmaster started"
pg_stat_user_tables:
query: |
SELECT
current_database() datname,
schemaname,
relname,
seq_scan,
seq_tup_read,
idx_scan,
idx_tup_fetch,
n_tup_ins,
n_tup_upd,
n_tup_del,
n_tup_hot_upd,
n_live_tup,
n_dead_tup,
n_mod_since_analyze,
COALESCE(last_vacuum, '1970-01-01Z') as last_vacuum,
COALESCE(last_autovacuum, '1970-01-01Z') as last_autovacuum,
COALESCE(last_analyze, '1970-01-01Z') as last_analyze,
COALESCE(last_autoanalyze, '1970-01-01Z') as last_autoanalyze,
vacuum_count,
autovacuum_count,
analyze_count,
autoanalyze_count
FROM
pg_stat_user_tables
metrics:
- datname:
usage: "LABEL"
description: "Name of current database"
- schemaname:
usage: "LABEL"
description: "Name of the schema that this table is in"
- relname:
usage: "LABEL"
description: "Name of this table"
- seq_scan:
usage: "COUNTER"
description: "Number of sequential scans initiated on this table"
- seq_tup_read:
usage: "COUNTER"
description: "Number of live rows fetched by sequential scans"
- idx_scan:
usage: "COUNTER"
description: "Number of index scans initiated on this table"
- idx_tup_fetch:
usage: "COUNTER"
description: "Number of live rows fetched by index scans"
- n_tup_ins:
usage: "COUNTER"
description: "Number of rows inserted"
- n_tup_upd:
usage: "COUNTER"
description: "Number of rows updated"
- n_tup_del:
usage: "COUNTER"
description: "Number of rows deleted"
- n_tup_hot_upd:
usage: "COUNTER"
description: "Number of rows HOT updated (i.e., with no separate index update required)"
- n_live_tup:
usage: "GAUGE"
description: "Estimated number of live rows"
- n_dead_tup:
usage: "GAUGE"
description: "Estimated number of dead rows"
- n_mod_since_analyze:
usage: "GAUGE"
description: "Estimated number of rows changed since last analyze"
- last_vacuum:
usage: "GAUGE"
description: "Last time at which this table was manually vacuumed (not counting VACUUM FULL)"
- last_autovacuum:
usage: "GAUGE"
description: "Last time at which this table was vacuumed by the autovacuum daemon"
- last_analyze:
usage: "GAUGE"
description: "Last time at which this table was manually analyzed"
- last_autoanalyze:
usage: "GAUGE"
description: "Last time at which this table was analyzed by the autovacuum daemon"
- vacuum_count:
usage: "COUNTER"
description: "Number of times this table has been manually vacuumed (not counting VACUUM FULL)"
- autovacuum_count:
usage: "COUNTER"
description: "Number of times this table has been vacuumed by the autovacuum daemon"
- analyze_count:
usage: "COUNTER"
description: "Number of times this table has been manually analyzed"
- autoanalyze_count:
usage: "COUNTER"
description: "Number of times this table has been analyzed by the autovacuum daemon"
pg_statio_user_tables:
query: "SELECT current_database() datname, schemaname, relname, heap_blks_read, heap_blks_hit, idx_blks_read, idx_blks_hit, toast_blks_read, toast_blks_hit, tidx_blks_read, tidx_blks_hit FROM pg_statio_user_tables"
metrics:
- datname:
usage: "LABEL"
description: "Name of current database"
- schemaname:
usage: "LABEL"
description: "Name of the schema that this table is in"
- relname:
usage: "LABEL"
description: "Name of this table"
- heap_blks_read:
usage: "COUNTER"
description: "Number of disk blocks read from this table"
- heap_blks_hit:
usage: "COUNTER"
description: "Number of buffer hits in this table"
- idx_blks_read:
usage: "COUNTER"
description: "Number of disk blocks read from all indexes on this table"
- idx_blks_hit:
usage: "COUNTER"
description: "Number of buffer hits in all indexes on this table"
- toast_blks_read:
usage: "COUNTER"
description: "Number of disk blocks read from this table's TOAST table (if any)"
- toast_blks_hit:
usage: "COUNTER"
description: "Number of buffer hits in this table's TOAST table (if any)"
- tidx_blks_read:
usage: "COUNTER"
description: "Number of disk blocks read from this table's TOAST table indexes (if any)"
- tidx_blks_hit:
usage: "COUNTER"
description: "Number of buffer hits in this table's TOAST table indexes (if any)"
# WARNING: This set of metrics can be very expensive on a busy server as every unique query executed will create an additional time series
pg_stat_statements:
query: "SELECT t2.rolname, t3.datname, queryid, calls, ( total_plan_time + total_exec_time ) / 1000 as total_time_seconds, ( min_plan_time + min_exec_time ) / 1000 as min_time_seconds, ( max_plan_time + max_exec_time ) / 1000 as max_time_seconds, ( mean_plan_time + mean_exec_time ) / 1000 as mean_time_seconds, ( stddev_plan_time + stddev_exec_time ) / 1000 as stddev_time_seconds, rows, shared_blks_hit, shared_blks_read, shared_blks_dirtied, shared_blks_written, local_blks_hit, local_blks_read, local_blks_dirtied, local_blks_written, temp_blks_read, temp_blks_written, blk_read_time / 1000 as blk_read_time_seconds, blk_write_time / 1000 as blk_write_time_seconds FROM pg_stat_statements t1 JOIN pg_roles t2 ON (t1.userid=t2.oid) JOIN pg_database t3 ON (t1.dbid=t3.oid) WHERE t2.rolname != 'rdsadmin' AND queryid IS NOT NULL"
master: true
metrics:
- rolname:
usage: "LABEL"
description: "Name of user"
- datname:
usage: "LABEL"
description: "Name of database"
- queryid:
usage: "LABEL"
description: "Query ID"
- calls:
usage: "COUNTER"
description: "Number of times executed"
- total_time_seconds:
usage: "COUNTER"
description: "Total time spent in the statement, in milliseconds"
- min_time_seconds:
usage: "GAUGE"
description: "Minimum time spent in the statement, in milliseconds"
- max_time_seconds:
usage: "GAUGE"
description: "Maximum time spent in the statement, in milliseconds"
- mean_time_seconds:
usage: "GAUGE"
description: "Mean time spent in the statement, in milliseconds"
- stddev_time_seconds:
usage: "GAUGE"
description: "Population standard deviation of time spent in the statement, in milliseconds"
- rows:
usage: "COUNTER"
description: "Total number of rows retrieved or affected by the statement"
- shared_blks_hit:
usage: "COUNTER"
description: "Total number of shared block cache hits by the statement"
- shared_blks_read:
usage: "COUNTER"
description: "Total number of shared blocks read by the statement"
- shared_blks_dirtied:
usage: "COUNTER"
description: "Total number of shared blocks dirtied by the statement"
- shared_blks_written:
usage: "COUNTER"
description: "Total number of shared blocks written by the statement"
- local_blks_hit:
usage: "COUNTER"
description: "Total number of local block cache hits by the statement"
- local_blks_read:
usage: "COUNTER"
description: "Total number of local blocks read by the statement"
- local_blks_dirtied:
usage: "COUNTER"
description: "Total number of local blocks dirtied by the statement"
- local_blks_written:
usage: "COUNTER"
description: "Total number of local blocks written by the statement"
- temp_blks_read:
usage: "COUNTER"
description: "Total number of temp blocks read by the statement"
- temp_blks_written:
usage: "COUNTER"
description: "Total number of temp blocks written by the statement"
- blk_read_time_seconds:
usage: "COUNTER"
description: "Total time the statement spent reading blocks, in milliseconds (if track_io_timing is enabled, otherwise zero)"
- blk_write_time_seconds:
usage: "COUNTER"
description: "Total time the statement spent writing blocks, in milliseconds (if track_io_timing is enabled, otherwise zero)"
pg_process_idle:
query: |
WITH
metrics AS (
SELECT
application_name,
SUM(EXTRACT(EPOCH FROM (CURRENT_TIMESTAMP - state_change))::bigint)::float AS process_idle_seconds_sum,
COUNT(*) AS process_idle_seconds_count
FROM pg_stat_activity
WHERE state = 'idle'
GROUP BY application_name
),
buckets AS (
SELECT
application_name,
le,
SUM(
CASE WHEN EXTRACT(EPOCH FROM (CURRENT_TIMESTAMP - state_change)) <= le
THEN 1
ELSE 0
END
)::bigint AS bucket
FROM
pg_stat_activity,
UNNEST(ARRAY[1, 2, 5, 15, 30, 60, 90, 120, 300]) AS le
GROUP BY application_name, le
ORDER BY application_name, le
)
SELECT
application_name,
process_idle_seconds_sum as seconds_sum,
process_idle_seconds_count as seconds_count,
ARRAY_AGG(le) AS seconds,
ARRAY_AGG(bucket) AS seconds_bucket
FROM metrics JOIN buckets USING (application_name)
GROUP BY 1, 2, 3
metrics:
- application_name:
usage: "LABEL"
description: "Application Name"
- seconds:
usage: "HISTOGRAM"
description: "Idle time of server processes"
pg_tb_stats:
query: |
select pubsubtopic, count(*) AS messages FROM (SELECT id, array_agg(pubsubtopic ORDER BY pubsubtopic) AS pubsubtopic FROM messages GROUP BY id) sub GROUP BY pubsubtopic ORDER BY pubsubtopic;
metrics:
- pubsubtopic:
usage: "LABEL"
description: "pubsubtopic"
- messages:
usage: "GAUGE"
description: "Number of messages for the given pubsub topic"
pg_tb_messages:
query: |
SELECT
COUNT(ID)
FROM messages
metrics:
- count:
usage: "GAUGE"
description: "Row count in `messages` table"

View File

@ -1,9 +0,0 @@
auth_modules:
mypostgres:
type: userpass
userpass:
username: postgres
password: ${POSTGRES_PASSWORD}
options:
# options become key=value parameters of the DSN
sslmode: disable

View File

@ -5,6 +5,14 @@ global:
monitor: "Monitoring"
scrape_configs:
- job_name: "nwaku"
- job_name: "liteprotocoltester"
static_configs:
- targets: ["nwaku:8003"]
- targets: ["lightpush-service:8003",
"filter-service:8003",
"liteprotocoltester-publishernode-1:8003",
"liteprotocoltester-publishernode-2:8003",
"liteprotocoltester-publishernode-3:8003",
"liteprotocoltester-publishernode-4:8003",
"liteprotocoltester-publishernode-5:8003",
"liteprotocoltester-publishernode-6:8003",
"receivernode:8003"]

View File

@ -19,12 +19,12 @@ echo "STANDALONE: ${STANDALONE}"
if [ -z "${STANDALONE}" ]; then
RETRIES=${RETRIES:=10}
RETRIES=${RETRIES:=20}
while [ -z "${BOOTSTRAP_ENR}" ] && [ ${RETRIES} -ge 0 ]; do
BOOTSTRAP_ENR=$(wget -qO- http://bootstrap:8645/debug/v1/info --header='Content-Type:application/json' 2> /dev/null | sed 's/.*"enrUri":"\([^"]*\)".*/\1/');
echo "Bootstrap node not ready, retrying (retries left: ${RETRIES})"
sleep 1
sleep 3
RETRIES=$(( $RETRIES - 1 ))
done
@ -56,6 +56,7 @@ exec /usr/bin/wakunode\
--discv5-bootstrap-node=${BOOTSTRAP_ENR}\
--log-level=INFO\
--metrics-server=True\
--metrics-server-port=8003\
--metrics-server-address=0.0.0.0\
--nat=extip:${IP}\
${PUBSUB}\

View File

@ -7,7 +7,6 @@ if test -f .env; then
. $(pwd)/.env
fi
IP=$(ip a | grep "inet " | grep -Fv 127.0.0.1 | sed 's/.*inet \([^/]*\).*/\1/')
echo "I am a lite-protocol-tester node"
@ -52,10 +51,14 @@ fi
if [ "${SERIVCE_NODE_ADDR}" = "waku-sim" ]; then
DO_DETECT_SERVICENODE=1
SERIVCE_NODE_ADDR=""
MY_EXT_IP=$(ip a | grep "inet " | grep -Fv 127.0.0.1 | sed 's/.*inet \([^/]*\).*/\1/')
else
MY_EXT_IP=$(wget -qO- --no-check-certificate https://api4.ipify.org)
fi
if [ $DO_DETECT_SERVICENODE -eq 1 ]; then
RETRIES=${RETRIES:=10}
RETRIES=${RETRIES:=20}
while [ -z "${SERIVCE_NODE_ADDR}" ] && [ ${RETRIES} -ge 0 ]; do
SERVICE_DEBUG_INFO=$(wget -qO- http://${SERVICENAME}:8645/debug/v1/info --header='Content-Type:application/json' 2> /dev/null);
@ -63,7 +66,7 @@ if [ $DO_DETECT_SERVICENODE -eq 1 ]; then
SERIVCE_NODE_ADDR=$(wget -qO- http://${SERVICENAME}:8645/debug/v1/info --header='Content-Type:application/json' 2> /dev/null | sed 's/.*"listenAddresses":\["\([^"]*\)".*/\1/');
echo "Service node not ready, retrying (retries left: ${RETRIES})"
sleep 1
sleep 3
RETRIES=$(( $RETRIES - 1 ))
done
@ -112,10 +115,12 @@ fi
echo "Running binary: ${BINARY_PATH}"
echo "Tester node: ${FUNCTION}"
echo "Using service node: ${SERIVCE_NODE_ADDR}"
echo "My external IP: ${MY_EXT_IP}"
exec "${BINARY_PATH}"\
--log-level=INFO\
--service-node="${SERIVCE_NODE_ADDR}"\
--nat=extip:${MY_EXT_IP}\
${DELAY_MESSAGES}\
${NUM_MESSAGES}\
${PUBSUB}\
@ -125,5 +130,4 @@ exec "${BINARY_PATH}"\
${START_PUBLISHING_AFTER}\
${MIN_MESSAGE_SIZE}\
${MAX_MESSAGE_SIZE}
# --nat=extip:${IP}\
# --config-file=config.toml\

View File

@ -5,9 +5,10 @@ import
chronos/timer as chtimer,
chronicles,
chronos,
results
results,
libp2p/peerid
import ./tester_message
import ./tester_message, ./lpt_metrics
type
ArrivalInfo = object
@ -54,7 +55,9 @@ proc init*(T: type Statistics, expectedMessageCount: int = 1000): T =
result.received = initTable[uint32, MessageInfo](expectedMessageCount)
return result
proc addMessage*(self: var Statistics, msg: ProtocolTesterMessage, msgHash: string) =
proc addMessage*(
self: var Statistics, sender: string, msg: ProtocolTesterMessage, msgHash: string
) =
if self.allMessageCount == 0:
self.allMessageCount = msg.count
self.firstReceivedIdx = msg.index
@ -70,7 +73,7 @@ proc addMessage*(self: var Statistics, msg: ProtocolTesterMessage, msgHash: stri
prevIndex: self.helper.prevIndex,
),
)
lpt_receiver_received_bytes.inc(labelValues = [sender], amount = msg.size.int64)
if self.received.hasKeyOrPut(msg.index, currentArrived):
inc(self.duplicateCount)
self.helper.duplicates.mgetOrPut(msg.index, (msgHash, 0, msg.size)).dupCount.inc()
@ -78,6 +81,10 @@ proc addMessage*(self: var Statistics, msg: ProtocolTesterMessage, msgHash: stri
index = msg.index,
hash = msgHash,
times_duplicated = self.helper.duplicates[msg.index].dupCount
lpt_receiver_duplicate_messages_count.inc(labelValues = [sender])
lpt_receiver_distinct_duplicate_messages_count.set(
labelValues = [sender], value = self.helper.duplicates.len()
)
return
## detect misorder arrival and possible lost messages
@ -93,6 +100,10 @@ proc addMessage*(self: var Statistics, msg: ProtocolTesterMessage, msgHash: stri
self.helper.prevIndex = msg.index
self.helper.prevArrivedAt = currentArrived.info.arrivedAt
inc(self.receivedMessages)
lpt_receiver_received_messages_count.inc(labelValues = [sender])
lpt_receiver_missing_messages_count.set(
labelValues = [sender], value = (self.helper.maxIndex - self.receivedMessages).int64
)
proc addMessage*(
self: var PerPeerStatistics,
@ -103,8 +114,17 @@ proc addMessage*(
if not self.contains(peerId):
self[peerId] = Statistics.init()
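# derive a short, human-readable sender id to use as the metrics label;
# fall back to the raw sender string when it does not parse as a PeerId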
let shortSenderId = block:
let senderPeer = PeerId.init(msg.sender)
if senderPeer.isErr():
msg.sender
else:
senderPeer.get().shortLog()
discard catch:
self[peerId].addMessage(msg, msgHash)
self[peerId].addMessage(shortSenderId, msg, msgHash)
lpt_receiver_sender_peer_count.set(value = self.len)
proc lossCount*(self: Statistics): uint32 =
self.helper.maxIndex - self.receivedMessages
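# NOTE: this is the same per-peer quantity exported via
# lpt_receiver_missing_messages_count in addMessage above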