expose more libp2p performance and queuing metrics (#678)

* gossipsub: adding duplicate arrival metrics

Adding counters for received deduplicated messages and for
duplicates recognized by the seen cache. Note that duplicates that
arrive after seenTTL has expired are no longer recognized by the
cache, and are therefore not counted as duplicates here either.

* gossipsub: adding mcache (message cache used to respond to IWANT requests) stats

It is generally assumed that IWANT messages arrive while mcache still
holds the requested message. These stats help verify this assumption.

* libp2p: adding internal TX queuing stats

Messages are queued in TX before getting written on the stream,
but we have no statistics about these queues. This patch adds
some queue length and queuing time related statistics.

* adding Grafana libp2p dashboard

Adding Grafana dashboard with newly exposed metrics.

* enable libp2p_mplex_metrics in nimble test

Signed-off-by: Csaba Kiraly <csaba.kiraly@gmail.com>
This commit is contained in:
Csaba Kiraly 2022-04-06 16:00:24 +02:00 committed by GitHub
parent 868ecab54f
commit 9973b9466d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 1535 additions and 3 deletions

View File

@@ -27,7 +27,7 @@ const nimflags =
proc runTest(filename: string, verify: bool = true, sign: bool = true, proc runTest(filename: string, verify: bool = true, sign: bool = true,
moreoptions: string = "") = moreoptions: string = "") =
var excstr = "nim c --opt:speed -d:debug -d:libp2p_agents_metrics -d:libp2p_protobuf_metrics -d:libp2p_network_protocols_metrics " var excstr = "nim c --opt:speed -d:debug -d:libp2p_agents_metrics -d:libp2p_protobuf_metrics -d:libp2p_network_protocols_metrics -d:libp2p_mplex_metrics "
excstr.add(" " & getEnv("NIMFLAGS") & " ") excstr.add(" " & getEnv("NIMFLAGS") & " ")
excstr.add(" " & nimflags & " ") excstr.add(" " & nimflags & " ")
excstr.add(" -d:libp2p_pubsub_sign=" & $sign) excstr.add(" -d:libp2p_pubsub_sign=" & $sign)

View File

@@ -21,6 +21,12 @@ export connection
logScope: logScope:
topics = "libp2p mplexchannel" topics = "libp2p mplexchannel"
when defined(libp2p_mplex_metrics):
declareHistogram libp2p_mplex_qlen, "message queue length",
buckets = [0.0, 1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0]
declareCounter libp2p_mplex_qlenclose, "closed because of max queuelen"
declareHistogram libp2p_mplex_qtime, "message queuing time"
when defined(libp2p_network_protocols_metrics): when defined(libp2p_network_protocols_metrics):
declareCounter libp2p_protocols_bytes, "total sent or received bytes", ["protocol", "direction"] declareCounter libp2p_protocols_bytes, "total sent or received bytes", ["protocol", "direction"]
@@ -187,6 +193,8 @@ proc prepareWrite(s: LPChannel, msg: seq[byte]): Future[void] {.async.} =
if s.writes >= MaxWrites: if s.writes >= MaxWrites:
debug "Closing connection, too many in-flight writes on channel", debug "Closing connection, too many in-flight writes on channel",
s, conn = s.conn, writes = s.writes s, conn = s.conn, writes = s.writes
when defined(libp2p_mplex_metrics):
libp2p_mplex_qlenclose.inc()
await s.reset() await s.reset()
await s.conn.close() await s.conn.close()
return return
@@ -201,8 +209,14 @@ proc completeWrite(
try: try:
s.writes += 1 s.writes += 1
when defined(libp2p_mplex_metrics):
libp2p_mplex_qlen.observe(s.writes.int64 - 1)
libp2p_mplex_qtime.time:
await fut await fut
when defined(libp2p_network_protocols_metrics): else:
await fut
when defined(libp2p_network_protocols_metrics):
if s.tag.len > 0: if s.tag.len > 0:
libp2p_protocols_bytes.inc(msgLen.int64, labelValues=[s.tag, "out"]) libp2p_protocols_bytes.inc(msgLen.int64, labelValues=[s.tag, "out"])

View File

@ -38,6 +38,8 @@ logScope:
declareCounter(libp2p_gossipsub_failed_publish, "number of failed publish") declareCounter(libp2p_gossipsub_failed_publish, "number of failed publish")
declareCounter(libp2p_gossipsub_invalid_topic_subscription, "number of invalid topic subscriptions that happened") declareCounter(libp2p_gossipsub_invalid_topic_subscription, "number of invalid topic subscriptions that happened")
declareCounter(libp2p_gossipsub_duplicate_during_validation, "number of duplicates received during message validation") declareCounter(libp2p_gossipsub_duplicate_during_validation, "number of duplicates received during message validation")
declareCounter(libp2p_gossipsub_duplicate, "number of duplicates received")
declareCounter(libp2p_gossipsub_received, "number of messages received (deduplicated)")
proc init*(_: type[GossipSubParams]): GossipSubParams = proc init*(_: type[GossipSubParams]): GossipSubParams =
GossipSubParams( GossipSubParams(
@@ -385,9 +387,13 @@ method rpcHandler*(g: GossipSub,
g.validationSeen.withValue(msgIdSalted, seen): seen[].incl(peer) g.validationSeen.withValue(msgIdSalted, seen): seen[].incl(peer)
libp2p_gossipsub_duplicate.inc()
# onto the next message # onto the next message
continue continue
libp2p_gossipsub_received.inc()
# avoid processing messages we are not interested in # avoid processing messages we are not interested in
if msg.topicIDs.allIt(it notin g.topics): if msg.topicIDs.allIt(it notin g.topics):
debug "Dropping message of topic without subscription", msgId = shortLog(msgId), peer debug "Dropping message of topic without subscription", msgId = shortLog(msgId), peer

View File

@@ -25,6 +25,7 @@ declareGauge(libp2p_gossipsub_no_peers_topics, "number of topics in mesh with no
declareGauge(libp2p_gossipsub_low_peers_topics, "number of topics in mesh with at least one but below dlow peers") declareGauge(libp2p_gossipsub_low_peers_topics, "number of topics in mesh with at least one but below dlow peers")
declareGauge(libp2p_gossipsub_healthy_peers_topics, "number of topics in mesh with at least dlow peers (but below dhigh)") declareGauge(libp2p_gossipsub_healthy_peers_topics, "number of topics in mesh with at least dlow peers (but below dhigh)")
declareCounter(libp2p_gossipsub_above_dhigh_condition, "number of above dhigh pruning branches ran", labels = ["topic"]) declareCounter(libp2p_gossipsub_above_dhigh_condition, "number of above dhigh pruning branches ran", labels = ["topic"])
declareSummary(libp2p_gossipsub_mcache_hit, "ratio of successful IWANT message cache lookups")
proc grafted*(g: GossipSub, p: PubSubPeer, topic: string) {.raises: [Defect].} = proc grafted*(g: GossipSub, p: PubSubPeer, topic: string) {.raises: [Defect].} =
g.withPeerStats(p.peerId) do (stats: var PeerStats): g.withPeerStats(p.peerId) do (stats: var PeerStats):
@@ -276,12 +277,15 @@ proc handleIWant*(g: GossipSub,
trace "peer sent iwant", peer, messageID = mid trace "peer sent iwant", peer, messageID = mid
let msg = g.mcache.get(mid) let msg = g.mcache.get(mid)
if msg.isSome: if msg.isSome:
libp2p_gossipsub_mcache_hit.observe(1)
# avoid spam # avoid spam
if peer.iWantBudget > 0: if peer.iWantBudget > 0:
messages.add(msg.get()) messages.add(msg.get())
dec peer.iWantBudget dec peer.iWantBudget
else: else:
break break
else:
libp2p_gossipsub_mcache_hit.observe(0)
return messages return messages
proc commitMetrics(metrics: var MeshMetrics) {.raises: [Defect].} = proc commitMetrics(metrics: var MeshMetrics) {.raises: [Defect].} =

File diff suppressed because it is too large Load Diff