Jordan Hrycaj 430611d3bc
Beacon sync updates tbc (#2818)
* Clear rejected sync target so that it would not be processed again

* Use in-memory table to stash headers after FCU import has started

why:
  After block import has started, there is no way to save/stash block
  headers persistently. The FCU handlers always maintain a positive
  transaction level, and in some instances the current transaction is
  flushed and re-opened.

  This patch fixes an exception thrown when a block header has gone
  missing.
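
  A minimal sketch of the idea, with illustrative names only (the real
  patch uses its own types and procs from the worker modules):

    # Hypothetical sketch: once block import has started, keep headers in
    # an in-memory table instead of writing them to the database.
    import std/tables

    type
      DemoHeader = object              # stand-in for the real header type
        number: uint64
        parentHash: array[32, byte]

      HeaderStash = Table[uint64, DemoHeader]

    proc stashHeader(stash: var HeaderStash; hdr: DemoHeader) =
      ## Persisting is unsafe while the FCU handlers hold an open
      ## transaction that may be flushed and re-opened at any time, so the
      ## header is kept in memory.
      stash[hdr.number] = hdr

    proc hasStashed(stash: HeaderStash; bn: uint64): bool =
      ## Check the in-memory stash before falling back to a database
      ## lookup; this avoids the missing-header exception mentioned above.
      stash.hasKey bn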

* When resuming sync, delete stale headers and state

why:
  Deleting headers reclaims some persistent space that would otherwise be
  lost. Deleting the state after resuming prevents race conditions.

* On a clean start, hibernate the sync `daemon` entity before the first
  update from the CL

details:
  Only reduced services are running (see the sketch after this list):
  * accept FCU from the CL
  * fetch the finalised header after accepting the FCU (the CL provides the
    hash only)
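
  A hedged sketch of that reduced-service gate (all names below are
  hypothetical, not the actual handler names):

    # While hibernating, everything except forkchoice updates from the CL
    # is ignored; the first usable FCU wakes the syncer up.
    type DemoSyncCtx = ref object
      hibernate: bool                  # true until the first usable FCU

    proc onForkchoiceUpdate(ctx: DemoSyncCtx; finalisedHash: array[32, byte]) =
      if ctx.hibernate:
        # Reduced service: accept the FCU, resolve the finalised header
        # from the hash the CL provides, then leave hibernation.
        ctx.hibernate = false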

* Improve text/meaning of some log messages

* Revisit error handling for useless peers

why:
  A peer is abandoned if its error score is too high. This was not
  properly handled in a fringe case where the error was detected at
  staging time while fetching via eth/xx was ok.
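
  The gist of the error-score rule, as an illustrative sketch (threshold
  and field names are made up):

    const demoErrThreshold = 2         # hypothetical error-score limit

    type DemoPeerStats = object
      nRespErrors: int                 # bumped on every failed response

    proc registerError(stats: var DemoPeerStats): bool =
      ## Returns true when the peer should be abandoned (zombified),
      ## regardless of whether the error surfaced while fetching via
      ## eth/xx or later at staging time.
      stats.nRespErrors.inc
      demoErrThreshold < stats.nRespErrors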

* Clarify `break` meaning by using labelled `break` statements
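
  In Nim, a `block` can be named and left explicitly, which documents what
  a `break` inside nested loops actually terminates (the names below are
  illustrative):

    block fetchHeadersBody:
      for blockNumber in 100u64 .. 200u64:
        if blockNumber == 150:
          break fetchHeadersBody       # leaves the named block, not just the loop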

* Fix the commit action when the sync target has been reached

why:
  The sync target block number might precede the latest FCU block number.
  This happens when the engine API squeezes in requests to execute and
  import subsequent blocks.

  This patch fixes an assert thrown when, after reaching the target, the
  latest FCU block number is higher than the expected target block number.
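
  A hedged sketch of the idea (not the actual patch code): instead of
  asserting that the reached target equals the latest FCU block number,
  the commit point tolerates the FCU number having moved ahead:

    proc commitPoint(target, latestFcu: uint64): uint64 =
      # The engine API may have imported blocks past the original target,
      # so the latest FCU number can legitimately be higher.
      max(target, latestFcu)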

* Update TODO list
2024-11-01 19:18:41 +00:00


# Nimbus
# Copyright (c) 2023-2024 Status Research & Development GmbH
# Licensed and distributed under either of
#   * MIT license (license terms in the root directory or at
#     https://opensource.org/licenses/MIT).
#   * Apache v2 license (license terms in the root directory or at
#     https://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed
# except according to those terms.

{.push raises:[].}

import
  pkg/[chronicles, chronos],
  pkg/eth/[common, p2p],
  pkg/stew/[interval_set, sorted_set],
  ../../common,
  ./worker/[blocks_staged, headers_staged, headers_unproc, start_stop, update],
  ./worker_desc

# ------------------------------------------------------------------------------
# Private functions
# ------------------------------------------------------------------------------

proc headersToFetchOk(buddy: BeaconBuddyRef): bool =
  0 < buddy.ctx.headersUnprocTotal() and
    buddy.ctrl.running and
    not buddy.ctx.poolMode

proc bodiesToFetchOk(buddy: BeaconBuddyRef): bool =
  buddy.ctx.blocksStagedFetchOk() and
    buddy.ctrl.running and
    not buddy.ctx.poolMode

proc napUnlessSomethingToFetch(buddy: BeaconBuddyRef): Future[bool] {.async.} =
  ## When idle, save cpu cycles waiting for something to do.
  if buddy.ctx.pool.blockImportOk or      # currently importing blocks
     buddy.ctx.hibernate or               # not activated yet?
     not (buddy.headersToFetchOk() or     # something on TODO list
          buddy.bodiesToFetchOk()):
    await sleepAsync workerIdleWaitInterval
    return true
  else:
    return false

# ------------------------------------------------------------------------------
# Public start/stop and admin functions
# ------------------------------------------------------------------------------

proc setup*(ctx: BeaconCtxRef; info: static[string]): bool =
  ## Global set up
  ctx.setupRpcMagic()

  # Load initial state from database if there is any
  ctx.setupDatabase info

  # Debugging stuff, might be an empty template
  ctx.setupTicker()
  true

proc release*(ctx: BeaconCtxRef; info: static[string]) =
  ## Global clean up
  ctx.destroyRpcMagic()
  ctx.destroyTicker()

proc start*(buddy: BeaconBuddyRef; info: static[string]): bool =
  ## Initialise worker peer
  let peer = buddy.peer
  if runsThisManyPeersOnly <= buddy.ctx.pool.nBuddies:
    if not buddy.ctx.hibernate: debug info & ": peers limit reached", peer
    return false
  if not buddy.startBuddy():
    if not buddy.ctx.hibernate: debug info & ": failed", peer
    return false
  if not buddy.ctx.hibernate: debug info & ": new peer", peer
  true

proc stop*(buddy: BeaconBuddyRef; info: static[string]) =
  ## Clean up this peer
  if not buddy.ctx.hibernate: debug info & ": release peer", peer=buddy.peer,
    ctrl=buddy.ctrl.state, nInvocations=buddy.only.nMultiLoop,
    lastIdleGap=buddy.only.multiRunIdle.toStr
  buddy.stopBuddy()

# ------------------------------------------------------------------------------
# Public functions
# ------------------------------------------------------------------------------

proc runDaemon*(ctx: BeaconCtxRef; info: static[string]) {.async.} =
  ## Global background job that will be re-started as long as the variable
  ## `ctx.daemon` is set `true`. If that job was stopped due to re-setting
  ## `ctx.daemon` to `false`, it will only be restarted after the flag is set
  ## `true` again, and not before there is some activity on the `runPool()`,
  ## `runSingle()`, or `runMulti()` functions.
  ##
  ## On a fresh start, the flag `ctx.daemon` will not be set `true` before the
  ## first usable request from the CL (via RPC) stumbles in.
  ##
  # Check for possible header layout and body request changes
  ctx.updateSyncStateLayout info
  if ctx.hibernate:
    return

  ctx.updateBlockRequests info

  # Execute staged block records.
  if ctx.blocksStagedCanImportOk():

    block:
      # Set advisory flag telling that a slow/long running process will take
      # place. So there might be some peers active. If they are waiting for
      # a message reply, this will most probably time out as all processing
      # power is usurped by the import task here.
      ctx.pool.blockImportOk = true
      defer: ctx.pool.blockImportOk = false

      # Import from staged queue.
      while await ctx.blocksStagedImport(info):
        if not ctx.daemon:
          # Implied by external sync shutdown?
          return

  # At the end of the cycle, leave time to trigger refill headers/blocks
  await sleepAsync daemonWaitInterval

proc runPool*(
    buddy: BeaconBuddyRef;
    last: bool;
    laps: int;
    info: static[string];
      ): bool =
  ## Once started, the function `runPool()` is called for all worker peers in
  ## sequence as long as this function returns `false`. There will be no other
  ## `runPeer()` functions activated while `runPool()` is active.
  ##
  ## This procedure is started if the global flag `buddy.ctx.poolMode` is set
  ## `true` (default is `false`.) The flag will be automatically reset before
  ## the loop starts. Re-setting it again results in repeating the loop. The
  ## argument `laps` (starting with `0`) indicates the current lap of the
  ## repeated loops.
  ##
  ## The argument `last` is set `true` if the last entry is reached.
  ##
  ## Note that this function does not run in `async` mode.
  ##
  buddy.ctx.headersStagedReorg info
  true # stop

proc runPeer*(buddy: BeaconBuddyRef; info: static[string]) {.async.} =
  ## This peer worker method is repeatedly invoked (exactly once per peer)
  ## while the `buddy.ctrl.poolMode` flag is set `false`.
  ##
  let peer = buddy.peer
  if 0 < buddy.only.nMultiLoop:                 # statistics/debugging
    buddy.only.multiRunIdle = Moment.now() - buddy.only.stoppedMultiRun
  buddy.only.nMultiLoop.inc                     # statistics/debugging

  # Update consensus header target when needed. It comes with a finalised
  # header hash where we need to complete the block number.
  await buddy.headerStagedUpdateTarget info

  if not await buddy.napUnlessSomethingToFetch():
    #
    # Layout of a triple of linked header chains (see `README.md`)
    # ::
    #   0                C                     D                H
    #   | <--- [0,C] --> | <----- (C,D) -----> | <-- [D,H] ---> |
    #   o----------------o---------------------o----------------o--->
    #   | <-- linked --> | <-- unprocessed --> | <-- linked --> |
    #
    # This function is run concurrently for fetching the next batch of
    # headers and stashing them on the database. Each concurrently running
    # actor works as follows:
    #
    # * Get a range of block numbers from the `unprocessed` range `(C,D)`.
    # * Fetch headers for this range (as much as one can get).
    # * Stash them on the database.
    # * Rinse and repeat.
    #
    # The block number ranges concurrently taken from `(C,D)` are chosen
    # from the upper end. So exactly one of the actors has a range
    # `[whatever,D-1]` adjacent to `[D,H]`. Call this actor the lead actor.
    #
    # For the lead actor, headers can be downloaded all by the hashes as
    # the parent hash for the header with block number `D` is known. All
    # other non-lead actors will download headers by block number only
    # and stage them to be re-ordered and stashed on the database when ready.
    #
    # Once the lead actor stashes the downloaded headers, the other staged
    # headers will also be stashed on the database until there is a gap or
    # the stashed headers are exhausted.
    #
    # Due to the nature of the `async` logic, the current lead actor will
    # stay lead when fetching the next range of block numbers.
    #
    while buddy.headersToFetchOk():
      # * Get unprocessed range from pool
      # * Fetch headers for this range (as much as one can get)
      # * Verify that a block is contiguous, chained by parent hash, etc.
      # * Stash this range on the staged queue on the pool
      if await buddy.headersStagedCollect info:
        # * Save updated state and headers
        # * Decrease the dangling left boundary `D` of the trusted range `[D,H]`
        discard buddy.ctx.headersStagedProcess info

    # Fetch bodies and combine them with headers to blocks to be staged. These
    # staged blocks are then executed by the daemon process (no `peer` needed.)
    while buddy.bodiesToFetchOk():
      discard await buddy.blocksStagedCollect info

    # Note that it is important **not** to leave this function to be
    # re-invoked by the scheduler unless necessary. While the time gap
    # until restarting is typically a few millisecs, there are always
    # outliers which well exceed several seconds. This seems to let
    # remote peers run into timeouts.

  buddy.only.stoppedMultiRun = Moment.now()     # statistics/debugging

# ------------------------------------------------------------------------------
# End
# ------------------------------------------------------------------------------