Improve selecting of nodes in neighborhood gossip (#1072)

Also allow concurrent neighborhood gossip jobs when seeding data
into the network.
Update Grafana dashboard for two additional metrics regarding
lookups in neighborhood gossip.
This commit is contained in:
Kim De Mey 2022-05-07 13:50:16 +02:00 committed by GitHub
parent 5467abed8f
commit 69366e1880
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 208 additions and 48 deletions

View File

@ -66,7 +66,7 @@
"text": {},
"textMode": "auto"
},
"pluginVersion": "7.5.9",
"pluginVersion": "7.5.11",
"targets": [
{
"exemplar": true,
@ -128,7 +128,7 @@
"text": {},
"textMode": "auto"
},
"pluginVersion": "7.5.9",
"pluginVersion": "7.5.11",
"targets": [
{
"exemplar": true,
@ -188,7 +188,7 @@
"text": {},
"textMode": "auto"
},
"pluginVersion": "7.5.9",
"pluginVersion": "7.5.11",
"targets": [
{
"exemplar": true,
@ -247,7 +247,7 @@
"text": {},
"textMode": "auto"
},
"pluginVersion": "7.5.9",
"pluginVersion": "7.5.11",
"targets": [
{
"exemplar": true,
@ -306,7 +306,7 @@
"text": {},
"textMode": "auto"
},
"pluginVersion": "7.5.9",
"pluginVersion": "7.5.11",
"targets": [
{
"exemplar": true,
@ -355,7 +355,7 @@
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.5.9",
"pluginVersion": "7.5.11",
"pointradius": 2,
"points": false,
"renderer": "flot",
@ -459,7 +459,7 @@
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.5.9",
"pluginVersion": "7.5.11",
"pointradius": 2,
"points": false,
"renderer": "flot",
@ -603,7 +603,7 @@
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.5.9",
"pluginVersion": "7.5.11",
"pointradius": 2,
"points": false,
"renderer": "flot",
@ -707,7 +707,7 @@
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.5.9",
"pluginVersion": "7.5.11",
"pointradius": 2,
"points": false,
"renderer": "flot",
@ -811,7 +811,7 @@
"showUnfilled": true,
"text": {}
},
"pluginVersion": "7.5.9",
"pluginVersion": "7.5.11",
"targets": [
{
"exemplar": true,
@ -871,7 +871,7 @@
"showUnfilled": true,
"text": {}
},
"pluginVersion": "7.5.9",
"pluginVersion": "7.5.11",
"targets": [
{
"exemplar": true,
@ -923,7 +923,7 @@
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.5.9",
"pluginVersion": "7.5.11",
"pointradius": 2,
"points": false,
"renderer": "flot",
@ -1036,7 +1036,7 @@
"showUnfilled": true,
"text": {}
},
"pluginVersion": "7.5.9",
"pluginVersion": "7.5.11",
"targets": [
{
"exemplar": true,
@ -1094,7 +1094,7 @@
"showUnfilled": true,
"text": {}
},
"pluginVersion": "7.5.9",
"pluginVersion": "7.5.11",
"targets": [
{
"exemplar": true,
@ -1146,7 +1146,7 @@
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.5.9",
"pluginVersion": "7.5.11",
"pointradius": 2,
"points": false,
"renderer": "flot",
@ -1266,7 +1266,7 @@
"showUnfilled": true,
"text": {}
},
"pluginVersion": "7.5.9",
"pluginVersion": "7.5.11",
"targets": [
{
"exemplar": true,
@ -1318,7 +1318,7 @@
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.5.9",
"pluginVersion": "7.5.11",
"pointradius": 2,
"points": false,
"renderer": "flot",
@ -1414,7 +1414,7 @@
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.5.9",
"pluginVersion": "7.5.11",
"pointradius": 2,
"points": false,
"renderer": "flot",
@ -1518,7 +1518,7 @@
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.5.9",
"pluginVersion": "7.5.11",
"pointradius": 2,
"points": false,
"renderer": "flot",
@ -1605,7 +1605,7 @@
"y": 48
},
"hiddenSeries": false,
"id": 8,
"id": 46,
"legend": {
"avg": false,
"current": false,
@ -1622,7 +1622,7 @@
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.5.9",
"pluginVersion": "7.5.11",
"pointradius": 2,
"points": false,
"renderer": "flot",
@ -1633,17 +1633,25 @@
"targets": [
{
"exemplar": true,
"expr": "portal_message_decoding_failures_total",
"expr": "rate(portal_gossip_with_lookup_total[$__rate_interval])",
"interval": "",
"legendFormat": "",
"legendFormat": "portal_gossip_with_lookup[{{protocol_id}}]",
"refId": "A"
},
{
"exemplar": true,
"expr": "rate(portal_gossip_without_lookup_total[$__rate_interval])",
"hide": false,
"interval": "",
"legendFormat": "portal_gossip_without_lookup[{{protocol_id}}]",
"refId": "B"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Portal message decoding failures",
"title": "Neighborhood gossip node lookups",
"tooltip": {
"shared": true,
"sort": 0,
@ -1659,7 +1667,7 @@
},
"yaxes": [
{
"$$hashKey": "object:595",
"$$hashKey": "object:97",
"format": "short",
"label": null,
"logBase": 1,
@ -1668,7 +1676,7 @@
"show": true
},
{
"$$hashKey": "object:596",
"$$hashKey": "object:98",
"format": "short",
"label": null,
"logBase": 1,
@ -1718,7 +1726,7 @@
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.5.9",
"pluginVersion": "7.5.11",
"pointradius": 2,
"points": false,
"renderer": "flot",
@ -1786,6 +1794,102 @@
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": null,
"fieldConfig": {
"defaults": {},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 56
},
"hiddenSeries": false,
"id": 8,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.5.11",
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"exemplar": true,
"expr": "portal_message_decoding_failures_total",
"interval": "",
"legendFormat": "",
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Portal message decoding failures",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"$$hashKey": "object:595",
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"$$hashKey": "object:596",
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
@ -1822,7 +1926,7 @@
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.5.9",
"pluginVersion": "7.5.11",
"pointradius": 2,
"points": false,
"renderer": "flot",
@ -1926,7 +2030,7 @@
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.5.9",
"pluginVersion": "7.5.11",
"pointradius": 2,
"points": false,
"renderer": "flot",
@ -1987,7 +2091,7 @@
}
}
],
"refresh": false,
"refresh": "5s",
"schemaVersion": 27,
"style": "dark",
"tags": [],
@ -2002,5 +2106,5 @@
"timezone": "",
"title": "Fluffy Dashboard",
"uid": "iWQQPuPnkadsf",
"version": 4
"version": 7
}

View File

@ -59,6 +59,13 @@ declareCounter portal_gossip_offers_successful,
declareCounter portal_gossip_offers_failed,
"Portal wire protocol failed content offers from neighborhood gossip",
labels = ["protocol_id"]
declareCounter portal_gossip_with_lookup,
"Portal wire protocol neighborhood gossip that required a node lookup",
labels = ["protocol_id"]
declareCounter portal_gossip_without_lookup,
"Portal wire protocol neighborhood gossip that did not require a node lookup",
labels = ["protocol_id"]
# Note: These metrics are to get some idea of how many ENRs are sent on average.
# Relevant issue: https://github.com/ethereum/portal-network-specs/issues/136
@ -1014,19 +1021,49 @@ proc neighborhoodGossip*(
let contentId = contentIdOpt.get()
# Doing a lookup over the network to get the very closest nodes to the
# content, instead of looking only at our own routing table. This should give
# a bigger rate of success in case the content is not known yet and avoid
# data being stopped in its propagation. However, perhaps this causes issues
# in data getting propagated in a wider id range.
let closestNodes = await p.lookup(NodeId(contentId))
# For selecting the closest nodes to whom to gossip the content a mixed
# approach is taken:
# 1. Select the closest neighbours in the routing table
# 2. Check if the radius is known for these nodes and whether they are
# in range of the content to be offered.
# 3. If at least n (= 8) nodes are in range, offer these nodes the content
# (max nodes set at 8).
# 4. If fewer than n nodes are in range, do a node lookup, and offer the nodes
# returned from the lookup the content (max nodes set at 8)
#
# Compared to looking only at nodes in our own routing table, this should
# give a higher rate of success and avoid the data being stopped in its
# propagation, while at the same time avoiding unnecessary node lookups.
# It might still cause issues in data getting propagated in a wider id range.
# Selecting closest 8 nodes to offer data
for node in closestNodes[0..<min(closestNodes.len, 8)]:
# Note: opportunistically not checking if the radius of the node is known
# and thus if the node is in radius with the content.
let req = OfferRequest(dst: node, kind: Direct, contentList: contentList)
await p.offerQueue.addLast(req)
const maxGossipNodes = 8
let closestLocalNodes = p.routingTable.neighbours(
NodeId(contentId), k = 16, seenOnly = true)
var gossipNodes: seq[Node]
for node in closestLocalNodes:
let radius = p.radiusCache.get(node.id)
if radius.isSome():
if p.inRange(node.id, radius.unsafeGet(), contentId):
gossipNodes.add(node)
if gossipNodes.len >= 8: # use local nodes for gossip
portal_gossip_without_lookup.inc(labelValues = [$p.protocolId])
for node in gossipNodes[0..<min(gossipNodes.len, maxGossipNodes)]:
let req = OfferRequest(dst: node, kind: Direct, contentList: contentList)
await p.offerQueue.addLast(req)
else: # use looked up nodes for gossip
portal_gossip_with_lookup.inc(labelValues = [$p.protocolId])
let closestNodes = await p.lookup(NodeId(contentId))
for node in closestNodes[0..<min(closestNodes.len, maxGossipNodes)]:
# Note: opportunistically not checking if the radius of the node is known
# and thus if the node is in radius with the content. Reason is, these
# should really be the closest nodes in the DHT, and thus are most likely
# going to be in range of the requested content.
let req = OfferRequest(dst: node, kind: Direct, contentList: contentList)
await p.offerQueue.addLast(req)
proc processContent(
stream: PortalStream, contentKeys: ContentKeysList, content: seq[byte])

View File

@ -151,6 +151,22 @@ proc populateHistoryDb*(
proc propagateHistoryDb*(
p: PortalProtocol, dataFile: string, verify = false):
Future[Result[void, string]] {.async.} =
const concurrentGossips = 20
var gossipQueue =
newAsyncQueue[(ContentKeysList, seq[byte])](concurrentGossips)
var gossipWorkers: seq[Future[void]]
proc gossipWorker(p: PortalProtocol) {.async.} =
# Worker loop: repeatedly takes one (content keys, content) pair from
# `gossipQueue` and awaits `neighborhoodGossip` for it before taking the next.
# The loop has no exit condition of its own.
while true:
# Suspends here until an item is available in the queue.
let (keys, content) = await gossipQueue.popFirst()
await p.neighborhoodGossip(keys, content)
for i in 0 ..< concurrentGossips:
gossipWorkers.add(gossipWorker(p))
let blockData = readBlockDataTable(dataFile)
if blockData.isOk():
@ -162,12 +178,9 @@ proc propagateHistoryDb*(
if p.inRange(contentId):
p.contentDB.put(contentId, value[1])
await p.neighborhoodGossip(
ContentKeysList(@[encode(value[0])]), value[1])
await gossipQueue.addLast(
(ContentKeysList(@[encode(value[0])]), value[1]))
# Need to be sure that all offers were started. TODO: this is not great.
while not p.offerQueueEmpty():
await sleepAsync(500.milliseconds)
return ok()
else:
return err(blockData.error)

View File

@ -188,6 +188,12 @@ procSuite "Portal testnet tests":
check (await clients[0].portal_history_propagate(dataFile))
await clients[0].close()
# Note: Sleeping to make a test work is never great. Here it is needed
# because the data needs to propagate over the nodes. What one could do is
# add a json-rpc debug proc that returns whether the offer queue is empty or
# not. And then poll every node until all nodes have an empty queue.
await sleepAsync(10.seconds)
let blockData = readBlockDataTable(dataFile)
check blockData.isOk()