Beacon sync blocks queue mgmnt update (#3108)

* Update comments and colouring of example metrics display

* Update comment for import/download serialisation flag

details
   When importing starts while peers are actively downloading, the
   system tends to lose download peers, most probably due to high
   system activity. This in turn adds extra waiting time for finding
   and connecting to new download peers.

   For this reason, importing and downloading are serialised:
   downloading does not take place while importing.
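
   As a minimal sketch of that serialisation (not the actual
   implementation), the idea boils down to an advisory flag on the sync
   context, mirroring the `blockImportOk` flag handled in the `runDaemon`
   diff below; the `SyncCtx` type and the helper procs here are
   hypothetical placeholders:

     type
       SyncCtx = ref object
         blockImportOk: bool          # true while block import is running

     proc importStagedBlocks(ctx: SyncCtx) =
       ## Hypothetical stand-in for the real block import step.
       discard

     proc downloadNextBatch(ctx: SyncCtx) =
       ## Hypothetical per-peer download step.
       if ctx.blockImportOk:
         return                       # peers go idle while importing runs
       discard                        # otherwise fetch the next batch

     proc runImport(ctx: SyncCtx) =
       ## Serialise importing against downloading: raise the flag, import,
       ## and reset the flag even if the import step raises.
       ctx.blockImportOk = true
       defer: ctx.blockImportOk = false
       ctx.importStagedBlocks()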

* Update comment on import start condition

details
  When importing starts while peers are actively downloading, the
  system tends to lose download peers, most probably due to high
  system activity. This in turn adds extra waiting time for finding
  and connecting to new download peers.

  For this reason, importing does not start before the staged blocks
  queue is filled up. The ramp-up time for filling the queue is only a
  fraction of the potential waiting time incurred when losing peers.
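
  As a rough sketch of that gate, mirroring the `blocksStagedCanImportOk()`
  check further down in this diff (the `BlocksQueue` type here is a
  simplified stand-in for the real context object):

    type
      BlocksQueue = object
        staged: seq[int]         # placeholder for staged block records
        stagedLenHwm: int        # queue-length high water mark
        nBuddies: int            # number of active download peers

    proc blocksStagedCanImportOk(q: BlocksQueue): bool =
      ## Import may start right away if there are no download peers to
      ## lose; otherwise only once the staged queue has reached the HWM.
      if q.nBuddies == 0:
        return true
      return q.stagedLenHwm <= q.staged.len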

* Update comment on header or block fetch conversation via eth/XX

* Increase staged blocks queue

why
  Better overall throughput for slightly increased memory usage
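
  As a rough back-of-the-envelope check, using the ~35MiB upper bound per
  staged batch record quoted in the constants file below (these figures
  are estimates, not measurements):

    const
      approxBatchMiB   = 35                    # upper bound per queue record
      oldQueueBoundMiB = 4 * approxBatchMiB    # previous default: ~140MiB
      newQueueBoundMiB = 8 * approxBatchMiB    # new default:      ~280MiB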

* Reduce header queue limits

why
  In regular circumstances, the header queue holds only a few records
  most of the time. Longer queues build up with unwieldy peers (bogus
  data, timeouts, etc.) if they happen to lock the lowest record,
  thereby temporarily preventing the queue from being serialised.
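
  As a minimal sketch of the low/high water mark mechanism these limits
  feed into (per the comments in the constants file below; the queue type
  and the truncation strategy are simplified placeholders):

    const
      headersStagedQueueLengthLwm = 16   # new low water mark (was 32)
      headersStagedQueueLengthHwm = 24   # new high water mark (was 48)

    type
      HeaderQueue = object
        records: seq[int]                # placeholder for staged records

    proc maybeFlush(q: var HeaderQueue) =
      ## If a stalled peer lets the queue grow past the HWM, shrink it
      ## back below the LWM; the dropped part is re-fetched from scratch.
      if q.records.len > headersStagedQueueLengthHwm:
        q.records.setLen(headersStagedQueueLengthLwm - 1)
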
Jordan Hrycaj 2025-02-25 12:48:46 +00:00 committed by GitHub
parent b4226b66e4
commit 0f89c1f901
7 changed files with 88 additions and 59 deletions

View File

@@ -134,25 +134,8 @@
         "sort": "none"
       }
     },
-    "pluginVersion": "11.5.1",
+    "pluginVersion": "11.5.2",
     "targets": [
-      {
-        "datasource": {
-          "type": "prometheus",
-          "uid": "bdvimuocv3fggc"
-        },
-        "disableTextWrap": false,
-        "editorMode": "builder",
-        "expr": "nec_execution_head{instance=\"172.16.210.1:9099\", job=\"mainnet\"}",
-        "fullMetaSearch": false,
-        "hide": false,
-        "includeNullMetadata": true,
-        "instant": false,
-        "legendFormat": "nec_execution_head",
-        "range": true,
-        "refId": "A",
-        "useBackend": false
-      },
       {
         "datasource": {
           "type": "prometheus",
@@ -169,6 +152,23 @@
         "range": true,
         "refId": "B",
         "useBackend": false
+      },
+      {
+        "datasource": {
+          "type": "prometheus",
+          "uid": "bdvimuocv3fggc"
+        },
+        "disableTextWrap": false,
+        "editorMode": "builder",
+        "expr": "nec_execution_head{instance=\"172.16.210.1:9099\", job=\"mainnet\"}",
+        "fullMetaSearch": false,
+        "hide": false,
+        "includeNullMetadata": true,
+        "instant": false,
+        "legendFormat": "nec_execution_head",
+        "range": true,
+        "refId": "A",
+        "useBackend": false
       }
     ],
     "title": "Base vs Latest",
@@ -194,7 +194,7 @@
         "barAlignment": 0,
         "barWidthFactor": 0.6,
         "drawStyle": "line",
-        "fillOpacity": 25,
+        "fillOpacity": 50,
         "gradientMode": "opacity",
         "hideFrom": {
           "legend": false,
@@ -244,7 +244,7 @@
           {
             "id": "color",
             "value": {
-              "fixedColor": "dark-blue",
+              "fixedColor": "super-light-blue",
              "mode": "fixed"
            }
          }
@@ -287,8 +287,26 @@
         "sort": "none"
       }
     },
-    "pluginVersion": "11.5.1",
+    "pluginVersion": "11.5.2",
     "targets": [
+      {
+        "datasource": {
+          "type": "prometheus",
+          "uid": "bdvimuocv3fggc"
+        },
+        "disableTextWrap": false,
+        "editorMode": "builder",
+        "exemplar": false,
+        "expr": "nec_sync_peers{instance=\"172.16.210.1:9099\", job=\"mainnet\"}",
+        "fullMetaSearch": false,
+        "hide": false,
+        "includeNullMetadata": true,
+        "instant": false,
+        "legendFormat": "nec_sync_peers",
+        "range": true,
+        "refId": "B",
+        "useBackend": false
+      },
       {
         "datasource": {
           "type": "prometheus",
@@ -307,24 +325,6 @@
         "range": true,
         "refId": "A",
         "useBackend": false
-      },
-      {
-        "datasource": {
-          "type": "prometheus",
-          "uid": "bdvimuocv3fggc"
-        },
-        "disableTextWrap": false,
-        "editorMode": "builder",
-        "exemplar": false,
-        "expr": "nec_sync_peers{instance=\"172.16.210.1:9099\", job=\"mainnet\"}",
-        "fullMetaSearch": false,
-        "hide": false,
-        "includeNullMetadata": true,
-        "instant": false,
-        "legendFormat": "nec_sync_peers",
-        "range": true,
-        "refId": "B",
-        "useBackend": false
       }
     ],
     "title": "Buddies vs Import Block Lists",
@@ -350,8 +350,8 @@
         "barAlignment": 0,
         "barWidthFactor": 0.6,
         "drawStyle": "line",
-        "fillOpacity": 27,
-        "gradientMode": "none",
+        "fillOpacity": 50,
+        "gradientMode": "opacity",
         "hideFrom": {
           "legend": false,
           "tooltip": false,
@@ -443,7 +443,7 @@
         "sort": "none"
       }
     },
-    "pluginVersion": "11.5.1",
+    "pluginVersion": "11.5.2",
     "targets": [
       {
         "datasource": {
@@ -625,7 +625,7 @@
         "sort": "none"
       }
     },
-    "pluginVersion": "11.5.1",
+    "pluginVersion": "11.5.2",
     "targets": [
       {
         "datasource": {
@@ -761,7 +761,7 @@
         "sort": "none"
       }
     },
-    "pluginVersion": "11.5.1",
+    "pluginVersion": "11.5.2",
     "targets": [
       {
         "datasource": {
@@ -913,7 +913,7 @@
         "sort": "none"
       }
     },
-    "pluginVersion": "11.5.1",
+    "pluginVersion": "11.5.2",
     "targets": [
       {
         "datasource": {
@@ -946,13 +946,13 @@
     "list": []
   },
   "time": {
-    "from": "now-1h",
+    "from": "2025-02-25T11:03:15.000Z",
     "to": "now"
   },
   "timepicker": {},
   "timezone": "browser",
   "title": "Mainnet Sync",
   "uid": "beavm2j26dvr4b",
-  "version": 45,
+  "version": 49,
   "weekStart": ""
 }

View File

@@ -283,7 +283,7 @@ For the *Era1*/*Era* pre-load (if any) the following extra options apply to
 To start syncing, the following additional options apply to *nimbus*:
-    --debug-beacon-blocks-queue-hwm=1000
+    --debug-beacon-blocks-queue-hwm=1500
     --debug-rocksdb-max-open-files=384
     --debug-rocksdb-write-buffer-size=50331648
     --debug-rocksdb-block-cache-size=1073741824

View File

@@ -118,10 +118,11 @@ proc runDaemon*(
   if ctx.blocksStagedCanImportOk():
     block:
-      # Set advisory flag telling that a slow/long running process will take
-      # place. So there might be some peers active. If they are waiting for
-      # a message reply, this will most probably time out as all processing
-      # power is usurped by the import task here.
+      # Set flag informing peers to go into idle mode while importing takes
+      # place. It has been observed that importing blocks and downloading
+      # at the same time does not work very well, most probably due to high
+      # system activity while importing. Peers will get lost pretty soon
+      # if they continue downloading while importing is running.
       ctx.pool.blockImportOk = true
       defer: ctx.pool.blockImportOk = false

View File

@@ -147,7 +147,21 @@ func blocksStagedCanImportOk*(ctx: BeaconCtxRef): bool =
   if ctx.pool.nBuddies == 0:
     return true
-  # Start importing if the queue is filled up enough
+  # If importing starts while peers are actively downloading, the system
+  # tends to lose download peers, most probably due to high system
+  # activity.
+  #
+  # * Typical time to download and stage a queue record is ~15s (raw
+  #   download time typically ranges ~30ms .. ~10s)
+  #
+  # * Anecdotal time to connect to a new download peer is ~5m .. ~10m
+  #
+  # This implies that staging a full queue with 4 records typically does
+  # not take more than a minute, much less if enough peers are available,
+  # while the penalty of potentially losing peers is a multiple of the
+  # queue ramp-up time.
+  #
+  # So importing does not start before the queue is filled up.
   if ctx.pool.stagedLenHwm <= ctx.blk.staged.len:
     return true

View File

@@ -53,6 +53,13 @@ proc bodiesFetch*(
       error=($e.name), msg=e.msg, bdyErrors=buddy.bdyErrors
     return err()
+  # This round trip time `elapsed` is the real time, not necessarily the
+  # time relevant for the network timeout, which would throw an exception
+  # when the maximum response time has been exceeded (typically set to 10s.)
+  #
+  # If the real round trip time `elapsed` is too long, the error score is
+  # increased. Only when the error score passes a certain threshold (for
+  # being too slow in consecutive conversations) will the peer be abandoned.
   let elapsed = Moment.now() - start
   # Evaluate result
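
A hedged sketch of the scoring idea described by the added comment above;
the threshold and the response-time limit are illustrative values only, not
the actual constants, and the `PeerScore` type is a stand-in for the error
counters kept per peer:

    import std/times

    let
      slowResponseLimit = initDuration(seconds = 10)   # illustrative only

    const
      slowErrorThreshold = 3                           # illustrative only

    type
      PeerScore = object
        slowCount: int        # consecutive conversations that were too slow

    proc registerRoundTrip(score: var PeerScore, elapsed: Duration): bool =
      ## Returns true if the peer should be abandoned. A single slow reply
      ## only bumps the error score; the peer is dropped only after the
      ## score passes the threshold of consecutive slow conversations.
      if elapsed > slowResponseLimit:
        inc score.slowCount
      else:
        score.slowCount = 0   # a sufficiently fast reply resets the streak
      result = score.slowCount >= slowErrorThreshold

In the real code the elapsed time comes from `Moment.now() - start` and the
per-peer scores are tracked via `buddy.bdyErrors` / `buddy.hdrErrors`; the
sketch only illustrates the consecutive-slowness bookkeeping.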

View File

@@ -89,6 +89,13 @@ proc headersFetchReversed*(
       hdrErrors=buddy.hdrErrors
     return err()
+  # This round trip time `elapsed` is the real time, not necessarily the
+  # time relevant for the network timeout, which would throw an exception
+  # when the maximum response time has been exceeded (typically set to 10s.)
+  #
+  # If the real round trip time `elapsed` is too long, the error score is
+  # increased. Only when the error score passes a certain threshold (for
+  # being too slow in consecutive conversations) will the peer be abandoned.
   let elapsed = Moment.now() - start
   # Evaluate result

View File

@@ -27,9 +27,9 @@ const
   ## seconds (rather than ms.) This will be further looked at to be confirmed
   ## or rejected as insignificant.
   ##
-  ## FIXME: This setting has priority over the `maxPeers` setting of the
-  ##        `BeaconSyncRef.init()` initaliser. This might be harmonised at
-  ##        a later stage.
+  ## Note:
+  ##   This setting has priority over the `maxPeers` setting of the
+  ##   `BeaconSyncRef.init()` initialiser.
   # ----------------------

@@ -80,7 +80,7 @@ const
   ## Length of the request/stage batch. Several headers are consecutively
   ## fetched and stashed together as a single record on the staged queue.
-  headersStagedQueueLengthLwm* = 32
+  headersStagedQueueLengthLwm* = 16
   ## Limit the number of records in the staged headers queue.
   ##
   ## Queue entries start accumulating if one peer stalls while fetching the

@@ -91,7 +91,7 @@ const
   ## the above problem. Currently the **magic** is to let (pseudo) threads
   ## terminate and then restart all over again.
-  headersStagedQueueLengthHwm* = 48
+  headersStagedQueueLengthHwm* = 24
   ## If this size is exceeded, the staged queue is flushed and resized to
   ## `headersStagedQueueLengthLwm-1` entries. Then contents is re-fetched
   ## from scratch.

@@ -117,7 +117,7 @@ const
   ## With an average less than 90KiB/block (on `mainnet` at block ~#22m),
   ## one arrives at a total of at most 35MiB per block batch.
-  blocksStagedHwmDefault* = 4 * nFetchBodiesBatch
+  blocksStagedHwmDefault* = 8 * nFetchBodiesBatch
   ## This is an initialiser value for `blocksStagedHwm`.
   ##
   ## If the staged block queue exceeds this many number of block objects for