Add concurrency to the content offers of neighborhoodGossip proc (#1027)

* Add concurrency to the content offers of neighborhoodGossip proc

And remove some whitespace

* Remove more whitespace and adjust for 80 char line limit

* Update fluffy grafana dashboard to include gossip offer results
This commit is contained in:
Kim De Mey 2022-04-01 18:01:50 +02:00 committed by GitHub
parent 84ff179cd9
commit 9d656e99c4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 299 additions and 36 deletions

View File

@ -16,7 +16,7 @@
"editable": true,
"gnetId": null,
"graphTooltip": 0,
"id": 12,
"id": 13,
"links": [],
"panels": [
{
@ -1397,7 +1397,7 @@
"y": 40
},
"hiddenSeries": false,
"id": 8,
"id": 44,
"legend": {
"avg": false,
"current": false,
@ -1425,17 +1425,25 @@
"targets": [
{
"exemplar": true,
"expr": "portal_message_decoding_failures_total",
"expr": "rate(portal_gossip_offers_successful_total[$__rate_interval])",
"interval": "",
"legendFormat": "",
"legendFormat": "portal_gossip_offers_successful[{{protocol_id}}]",
"refId": "A"
},
{
"exemplar": true,
"expr": "rate(portal_gossip_offers_failed_total[$__rate_interval])",
"hide": false,
"interval": "",
"legendFormat": "portal_gossip_offers_failed[{{protocol_id}}]",
"refId": "B"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Portal message decoding failures",
"title": "Neighborhood gossip content offers",
"tooltip": {
"shared": true,
"sort": 0,
@ -1451,7 +1459,7 @@
},
"yaxes": [
{
"$$hashKey": "object:595",
"$$hashKey": "object:4139",
"format": "short",
"label": null,
"logBase": 1,
@ -1460,7 +1468,7 @@
"show": true
},
{
"$$hashKey": "object:596",
"$$hashKey": "object:4140",
"format": "short",
"label": null,
"logBase": 1,
@ -1578,6 +1586,102 @@
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": null,
"fieldConfig": {
"defaults": {},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 48
},
"hiddenSeries": false,
"id": 8,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.5.9",
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"exemplar": true,
"expr": "portal_message_decoding_failures_total",
"interval": "",
"legendFormat": "",
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Portal message decoding failures",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"$$hashKey": "object:595",
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"$$hashKey": "object:596",
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
@ -1640,7 +1744,9 @@
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "uTP outgoing connections",
"tooltip": {
"shared": true,
@ -1742,7 +1848,9 @@
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "uTP Packets",
"tooltip": {
"shared": true,
@ -1781,9 +1889,105 @@
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": null,
"fieldConfig": {
"defaults": {},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 69
},
"hiddenSeries": false,
"id": 42,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.5.9",
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"exemplar": true,
"expr": "utp_established_connections",
"interval": "",
"legendFormat": "",
"refId": "A"
}
],
"refresh": "5s",
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "uTP established connections",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"$$hashKey": "object:3811",
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"$$hashKey": "object:3812",
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
}
],
"refresh": false,
"schemaVersion": 27,
"style": "dark",
"tags": [],
@ -1797,6 +2001,6 @@
"timepicker": {},
"timezone": "",
"title": "Fluffy Dashboard",
"uid": "iWQQPuPnk",
"version": 9
"uid": "iWQQPuPnkadsf",
"version": 4
}

View File

@ -38,7 +38,9 @@ func encodeKey(k: ContentKey): (ByteList, ContentId) =
let keyEncoded = encode(k)
return (keyEncoded, toContentId(keyEncoded))
func getEncodedKeyForContent(cType: ContentType, chainId: uint16, hash: BlockHash): (ByteList, ContentId) =
func getEncodedKeyForContent(
cType: ContentType, chainId: uint16, hash: BlockHash):
(ByteList, ContentId) =
let contentKeyType = ContentKeyType(chainId: chainId, blockHash: hash)
let contentKey =
@ -52,15 +54,16 @@ func getEncodedKeyForContent(cType: ContentType, chainId: uint16, hash: BlockHas
return encodeKey(contentKey)
proc validateHeaderBytes*(bytes: seq[byte], hash: BlockHash): Option[BlockHeader] =
proc validateHeaderBytes*(
bytes: seq[byte], hash: BlockHash): Option[BlockHeader] =
try:
var rlp = rlpFromBytes(bytes)
let blockHeader = rlp.read(BlockHeader)
if not (blockHeader.blockHash() == hash):
# TODO: Header with different hash than expected maybe we should punish peer which sent
# us this ?
# TODO: Header with different hash than expected, maybe we should punish
# the peer which sent us this?
return none(BlockHeader)
return some(blockHeader)
@ -69,7 +72,9 @@ proc validateHeaderBytes*(bytes: seq[byte], hash: BlockHash): Option[BlockHeader
# TODO add some logging about failed decoding
return none(BlockHeader)
proc validateBodyBytes*(bytes: seq[byte], txRoot: KeccakHash, ommersHash: KeccakHash): Option[BlockBody] =
proc validateBodyBytes*(
bytes: seq[byte], txRoot: KeccakHash, ommersHash: KeccakHash):
Option[BlockBody] =
try:
var rlp = rlpFromBytes(bytes)
@ -80,7 +85,8 @@ proc validateBodyBytes*(bytes: seq[byte], txRoot: KeccakHash, ommersHash: Keccak
if txRoot != calculatedTxRoot or ommersHash != calculatedOmmersHash:
# we got block body (bundle of transactions and uncles) which do not match
# header. For now just ignore it, but maybe we should penalize peer sending us such data?
# header. For now just ignore it, but maybe we should penalize peer
# sending us such data?
return none(BlockBody)
return some(blockBody)
@ -89,7 +95,8 @@ proc validateBodyBytes*(bytes: seq[byte], txRoot: KeccakHash, ommersHash: Keccak
# TODO add some logging about failed decoding
return none(BlockBody)
proc getContentFromDb(h: HistoryNetwork, T: type, contentId: ContentId): Option[T] =
proc getContentFromDb(
h: HistoryNetwork, T: type, contentId: ContentId): Option[T] =
if h.portalProtocol.inRange(contentId):
let contentFromDB = h.contentDB.get(contentId)
if contentFromDB.isSome():
@ -98,14 +105,17 @@ proc getContentFromDb(h: HistoryNetwork, T: type, contentId: ContentId): Option[
let content = rlp.read(T)
return some(content)
except CatchableError as e:
# Content in db should always have valid formatting, so this should not happen
# Content in db should always have valid formatting, so this should not
# happen
raiseAssert(e.msg)
else:
return none(T)
else:
return none(T)
proc getBlockHeader*(h: HistoryNetwork, chainId: uint16, hash: BlockHash): Future[Option[BlockHeader]] {.async.} =
proc getBlockHeader*(
h: HistoryNetwork, chainId: uint16, hash: BlockHash):
Future[Option[BlockHeader]] {.async.} =
let (keyEncoded, contentId) = getEncodedKeyForContent(blockHeader, chainId, hash)
let maybeHeaderFromDb = h.getContentFromDb(BlockHeader, contentId)
@ -128,7 +138,9 @@ proc getBlockHeader*(h: HistoryNetwork, chainId: uint16, hash: BlockHash): Futur
return maybeHeader
proc getBlock*(h: HistoryNetwork, chainId: uint16, hash: BlockHash): Future[Option[Block]] {.async.} =
proc getBlock*(
h: HistoryNetwork, chainId: uint16, hash: BlockHash):
Future[Option[Block]] {.async.} =
let maybeHeader = await h.getBlockHeader(chainId, hash)
if maybeHeader.isNone():

View File

@ -53,6 +53,12 @@ declareHistogram portal_content_keys_offered,
declareHistogram portal_content_keys_accepted,
"Portal wire protocol amount of content keys per accept message received",
labels = ["protocol_id"], buckets = contentKeysBuckets
declareCounter portal_gossip_offers_successful,
"Portal wire protocol successful content offers from neighborhood gossip",
labels = ["protocol_id"]
declareCounter portal_gossip_offers_failed,
"Portal wire protocol failed content offers from neighborhood gossip",
labels = ["protocol_id"]
# Note: These metrics are to get some idea on how many enrs are send on average.
# Relevant issue: https://github.com/ethereum/portal-network-specs/issues/136
@ -93,6 +99,20 @@ const
16 # HMAC
discv5MaxSize = 1280
# These are the concurrent offers per Portal wire protocol that is running.
# Using the `offerQueue` allows for limiting the amount of offers sent and
# thus how many streams can be started.
# TODO:
# More thought needs to go into this as it is currently on a per network
# basis. Keep it simple like that? Or limit it better at the stream transport
# level? In the latter case, this might still need to be checked/blocked at
# the very start of sending the offer, because blocking/waiting too long
# between the received accept message and actually starting the stream and
# sending data could give issues due to timeouts on the other side.
# And then there are still limits to be applied also for FindContent and the
# incoming directions.
concurrentOffers = 50
type
ToContentIdHandler* =
proc(contentKey: ByteList): Option[ContentId] {.raises: [Defect], gcsafe.}
@ -114,6 +134,8 @@ type
revalidateLoop: Future[void]
stream*: PortalStream
radiusCache: RadiusCache
offerQueue: AsyncQueue[(Node, ContentKeysList)]
offerWorkers: seq[Future[void]]
PortalResult*[T] = Result[T, cstring]
@ -362,7 +384,8 @@ proc new*(T: type PortalProtocol,
toContentId: toContentId,
dataRadius: dataRadius,
bootstrapRecords: @bootstrapRecords,
radiusCache: RadiusCache.init(256))
radiusCache: RadiusCache.init(256),
offerQueue: newAsyncQueue[(Node, ContentKeysList)](concurrentOffers))
proto.baseProtocol.registerTalkProtocol(@(proto.protocolId), proto).expect(
"Only one protocol should have this id")
@ -606,6 +629,19 @@ proc offer*(p: PortalProtocol, dst: Node, contentKeys: ContentKeysList):
else:
return err("No accept response")
proc offerWorker(p: PortalProtocol) {.async.} =
  ## Long-running worker task: repeatedly pops a (node, contentKeys) pair from
  ## `p.offerQueue` and sends the offer to that node, updating the gossip-offer
  ## metrics with the result.
  ## Runs until its future is cancelled (done in `stop`); several of these run
  ## concurrently, bounded by `concurrentOffers`.
  while true:
    let (node, contentKeys) = await p.offerQueue.popFirst()
    # Result is only used for metrics; a failed offer is not retried.
    let res = await p.offer(node, contentKeys)
    if res.isOk():
      portal_gossip_offers_successful.inc(labelValues = [$p.protocolId])
    else:
      portal_gossip_offers_failed.inc(labelValues = [$p.protocolId])
proc offerQueueEmpty*(p: PortalProtocol): bool =
  ## Returns true when no offers are waiting in the queue.
  ## Note: an item is popped from the queue before the worker processes it, so
  ## an empty queue does not imply that all in-flight offers have completed.
  p.offerQueue.empty()
proc neighborhoodGossip*(p: PortalProtocol, contentKeys: ContentKeysList) {.async.} =
let contentKey = contentKeys[0] # for now only 1 item is considered
let contentIdOpt = p.toContentId(contentKey)
@ -622,8 +658,7 @@ proc neighborhoodGossip*(p: PortalProtocol, contentKeys: ContentKeysList) {.asyn
NodeId(contentId), k = 6, seenOnly = false)
for node in closestNodes:
# Not doing anything if this fails
discard await p.offer(node, contentKeys)
await p.offerQueue.addLast((node, contentKeys))
proc processContent(
stream: PortalStream, contentKeys: ContentKeysList, content: seq[byte])
@ -947,12 +982,19 @@ proc start*(p: PortalProtocol) =
p.refreshLoop = refreshLoop(p)
p.revalidateLoop = revalidateLoop(p)
for i in 0 ..< concurrentOffers:
p.offerWorkers.add(offerWorker(p))
proc stop*(p: PortalProtocol) =
if not p.revalidateLoop.isNil:
p.revalidateLoop.cancel()
if not p.refreshLoop.isNil:
p.refreshLoop.cancel()
for worker in p.offerWorkers:
worker.cancel()
p.offerWorkers = @[]
proc resolve*(p: PortalProtocol, id: NodeId): Future[Option[Node]] {.async.} =
## Resolve a `Node` based on provided `NodeId`.
##

View File

@ -154,6 +154,11 @@ proc propagateHistoryDb*(
# TODO: This call will get the content we just stored in the db, so it
# might be an improvement to directly pass it.
await p.neighborhoodGossip(ContentKeysList(@[encode(value[0])]))
# Need to be sure that all offers were started. TODO: this is not great.
while not p.offerQueueEmpty():
error "WAITING FOR OFFER QUEUE EMPTY"
await sleepAsync(500.milliseconds)
return ok()
else:
return err(blockData.error)