diff --git a/execution_chain/sync/beacon/Grafana-example.json b/execution_chain/sync/beacon/Grafana-example.json index bebd472c6..dbb594f3e 100644 --- a/execution_chain/sync/beacon/Grafana-example.json +++ b/execution_chain/sync/beacon/Grafana-example.json @@ -134,25 +134,8 @@ "sort": "none" } }, - "pluginVersion": "11.5.1", + "pluginVersion": "11.5.2", "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "bdvimuocv3fggc" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "nec_execution_head{instance=\"172.16.210.1:9099\", job=\"mainnet\"}", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "nec_execution_head", - "range": true, - "refId": "A", - "useBackend": false - }, { "datasource": { "type": "prometheus", @@ -169,6 +152,23 @@ "range": true, "refId": "B", "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "bdvimuocv3fggc" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "nec_execution_head{instance=\"172.16.210.1:9099\", job=\"mainnet\"}", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "nec_execution_head", + "range": true, + "refId": "A", + "useBackend": false } ], "title": "Base vs Latest", @@ -194,7 +194,7 @@ "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", - "fillOpacity": 25, + "fillOpacity": 50, "gradientMode": "opacity", "hideFrom": { "legend": false, @@ -244,7 +244,7 @@ { "id": "color", "value": { - "fixedColor": "dark-blue", + "fixedColor": "super-light-blue", "mode": "fixed" } } @@ -287,8 +287,26 @@ "sort": "none" } }, - "pluginVersion": "11.5.1", + "pluginVersion": "11.5.2", "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "bdvimuocv3fggc" + }, + "disableTextWrap": false, + "editorMode": "builder", + "exemplar": false, + "expr": "nec_sync_peers{instance=\"172.16.210.1:9099\", job=\"mainnet\"}", + 
"fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "nec_sync_peers", + "range": true, + "refId": "B", + "useBackend": false + }, { "datasource": { "type": "prometheus", @@ -307,24 +325,6 @@ "range": true, "refId": "A", "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "bdvimuocv3fggc" - }, - "disableTextWrap": false, - "editorMode": "builder", - "exemplar": false, - "expr": "nec_sync_peers{instance=\"172.16.210.1:9099\", job=\"mainnet\"}", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "nec_sync_peers", - "range": true, - "refId": "B", - "useBackend": false } ], "title": "Buddies vs Import Block Lists", @@ -350,8 +350,8 @@ "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", - "fillOpacity": 27, - "gradientMode": "none", + "fillOpacity": 50, + "gradientMode": "opacity", "hideFrom": { "legend": false, "tooltip": false, @@ -443,7 +443,7 @@ "sort": "none" } }, - "pluginVersion": "11.5.1", + "pluginVersion": "11.5.2", "targets": [ { "datasource": { @@ -625,7 +625,7 @@ "sort": "none" } }, - "pluginVersion": "11.5.1", + "pluginVersion": "11.5.2", "targets": [ { "datasource": { @@ -761,7 +761,7 @@ "sort": "none" } }, - "pluginVersion": "11.5.1", + "pluginVersion": "11.5.2", "targets": [ { "datasource": { @@ -913,7 +913,7 @@ "sort": "none" } }, - "pluginVersion": "11.5.1", + "pluginVersion": "11.5.2", "targets": [ { "datasource": { @@ -946,13 +946,13 @@ "list": [] }, "time": { - "from": "now-1h", + "from": "2025-02-25T11:03:15.000Z", "to": "now" }, "timepicker": {}, "timezone": "browser", "title": "Mainnet Sync", "uid": "beavm2j26dvr4b", - "version": 45, + "version": 49, "weekStart": "" } \ No newline at end of file diff --git a/execution_chain/sync/beacon/README.md b/execution_chain/sync/beacon/README.md index b2b28571b..f57a94474 100644 --- a/execution_chain/sync/beacon/README.md +++ 
b/execution_chain/sync/beacon/README.md @@ -283,7 +283,7 @@ For the *Era1*/*Era* pre-load (if any) the following extra options apply to To start syncing, the following additional options apply to *nimbus*: - --debug-beacon-blocks-queue-hwm=1000 + --debug-beacon-blocks-queue-hwm=1500 --debug-rocksdb-max-open-files=384 --debug-rocksdb-write-buffer-size=50331648 --debug-rocksdb-block-cache-size=1073741824 diff --git a/execution_chain/sync/beacon/worker.nim b/execution_chain/sync/beacon/worker.nim index 64921b2a3..ba4d2acb0 100644 --- a/execution_chain/sync/beacon/worker.nim +++ b/execution_chain/sync/beacon/worker.nim @@ -118,10 +118,11 @@ proc runDaemon*( if ctx.blocksStagedCanImportOk(): block: - # Set advisory flag telling that a slow/long running process will take - # place. So there might be some peers active. If they are waiting for - # a message reply, this will most probably time out as all processing - # power is usurped by the import task here. + # Set flag informing peers to go into idle mode while importing takes + # place. It has been observed that importing blocks and downloading + # at the same time does not work very well, most probably due to high + # system activity while importing. Peers will get lost pretty soon after + # downloading starts if they continue downloading. ctx.pool.blockImportOk = true defer: ctx.pool.blockImportOk = false diff --git a/execution_chain/sync/beacon/worker/blocks_staged.nim b/execution_chain/sync/beacon/worker/blocks_staged.nim index ddf09c250..225914e60 100644 --- a/execution_chain/sync/beacon/worker/blocks_staged.nim +++ b/execution_chain/sync/beacon/worker/blocks_staged.nim @@ -147,7 +147,21 @@ func blocksStagedCanImportOk*(ctx: BeaconCtxRef): bool = if ctx.pool.nBuddies == 0: return true - # Start importing if the queue is filled up enough + # If importing starts while peers are actively downloading, the system + # tends to lose download peers, most probably due to high system + # activity. 
+ # + # * Typical download time to download and stage a queue record ~15s (raw + # download time typically ranges ~30ms ..~10s) + # + # * Anecdotal time to connect to a new download peer ~5m..~10m + # + # This implies that a staged full queue with 4 records typically does + # not take more than a minute, much less if enough peers are available + # while the penalty of potentially losing peers is a multiple of the + # queue ramp up time. + # + # So importing does not start before the queue is filled up. if ctx.pool.stagedLenHwm <= ctx.blk.staged.len: return true diff --git a/execution_chain/sync/beacon/worker/blocks_staged/bodies.nim b/execution_chain/sync/beacon/worker/blocks_staged/bodies.nim index d7aa187a7..338da1515 100644 --- a/execution_chain/sync/beacon/worker/blocks_staged/bodies.nim +++ b/execution_chain/sync/beacon/worker/blocks_staged/bodies.nim @@ -53,6 +53,13 @@ proc bodiesFetch*( error=($e.name), msg=e.msg, bdyErrors=buddy.bdyErrors return err() + # This round trip time `elapsed` is the real time, not necessarily the + # time relevant for network timeout which would throw an exception when + # the maximum response time has been exceeded (typically set to 10s.) + # + # If the real round trip time `elapsed` is too long, the error score is + # increased. Only once the error score passes a certain threshold (for + # being too slow in consecutive conversations) will the peer be abandoned. 
let elapsed = Moment.now() - start # Evaluate result diff --git a/execution_chain/sync/beacon/worker/headers_staged/headers.nim b/execution_chain/sync/beacon/worker/headers_staged/headers.nim index 82f8d9845..074cc88f3 100644 --- a/execution_chain/sync/beacon/worker/headers_staged/headers.nim +++ b/execution_chain/sync/beacon/worker/headers_staged/headers.nim @@ -89,6 +89,13 @@ proc headersFetchReversed*( hdrErrors=buddy.hdrErrors return err() + # This round trip time `elapsed` is the real time, not necessarily the + # time relevant for network timeout which would throw an exception when + # the maximum response time has been exceeded (typically set to 10s.) + # + # If the real round trip time `elapsed` is too long, the error score is + # increased. Only once the error score passes a certain threshold (for + # being too slow in consecutive conversations) will the peer be abandoned. let elapsed = Moment.now() - start # Evaluate result diff --git a/execution_chain/sync/beacon/worker_config.nim b/execution_chain/sync/beacon/worker_config.nim index a81d95ba5..a1ba54612 100644 --- a/execution_chain/sync/beacon/worker_config.nim +++ b/execution_chain/sync/beacon/worker_config.nim @@ -27,9 +27,9 @@ const ## seconds (rather than ms.) This will be further looked at to be confirmed ## or rejected as insignificant. ## - ## FIXME: This setting has priority over the `maxPeers` setting of the - ## `BeaconSyncRef.init()` initaliser. This might be harmonised at - ## a later stage. + ## Note: + ## This setting has priority over the `maxPeers` setting of the + ## `BeaconSyncRef.init()` initialiser. # ---------------------- @@ -80,7 +80,7 @@ const ## Length of the request/stage batch. Several headers are consecutively ## fetched and stashed together as a single record on the staged queue. - headersStagedQueueLengthLwm* = 32 + headersStagedQueueLengthLwm* = 16 ## Limit the number of records in the staged headers queue. 
## ## Queue entries start accumulating if one peer stalls while fetching the @@ -91,7 +91,7 @@ const ## the above problem. Currently the **magic** is to let (pseudo) threads ## terminate and then restart all over again. - headersStagedQueueLengthHwm* = 48 + headersStagedQueueLengthHwm* = 24 ## If this size is exceeded, the staged queue is flushed and resized to ## `headersStagedQueueLengthLwm-1` entries. Then contents is re-fetched ## from scratch. @@ -117,7 +117,7 @@ const ## With an average less than 90KiB/block (on `mainnet` at block ~#22m), ## one arrives at a total of at most 35MiB per block batch. - blocksStagedHwmDefault* = 4 * nFetchBodiesBatch + blocksStagedHwmDefault* = 8 * nFetchBodiesBatch ## This is an initialiser value for `blocksStagedHwm`. ## ## If the staged block queue exceeds this many number of block objects for