Jordan Hrycaj bd42ebb193
Snap sync refactor accounts healing (#1392)
* Relocated mothballing (i.e. swap-in preparation) logic

details:
  Mothballing was previously tested & started after downloading
  account ranges in `range_fetch_accounts`.

  Whenever the current download or healing stops because of a pivot
  change, swap-in preparation is needed (otherwise some storage slots
  may get lost when swap-in takes place).

  Also, `execSnapSyncAction()` has been moved back to `pivot_helper`.
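
  A minimal sketch of the relocated check (stub types only; the real
  thing lives in `runMulti()` in the module below, operating on the
  pivot environment object):

```nim
# Sketch with made-up stub types, mirroring the check in `runMulti()`.
type PivotEnv = ref object
  archived: bool # set when a pivot change retired this environment

proc pivotMothball(env: PivotEnv) =
  discard # swap-in preparation would happen here

proc afterSyncAction(env: PivotEnv) =
  if env.archived:
    env.pivotMothball() # prepare swap-in before leaving this pivot
```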

* Reorganised source file directories

details:
  Grouped pivot focused modules into `pivot` directory

* Renamed `checkNodes`, `sickSubTries` as `nodes.check`, `nodes.missing`

why:
  Both lists are typically used together as a pair. Renaming
  `sickSubTries` reflects moving away from a healing centric view
  towards a swap-in attitude.

* Multi times coverage recording

details:
  Per pivot, account ranges are accumulated into a coverage range set.
  This set will eventually contain a single range of account hashes
  [0..2^256] which amounts to 100% capacity.

  A counter has been added that is incremented whenever max capacity is
  reached. The accumulated range is then reset to empty.

  The effect of this setting is that the coverage can be evenly
  duplicated, so a 200% coverage would not accumulate on one particular
  region.
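
  A minimal sketch of this book-keeping, assuming a plain fraction in
  place of the real `NodeTagRangeSet` (type and proc names here are
  made up):

```nim
# Hypothetical stand-in for the coverage recording described above.
type CoverageDesc = object
  covered: float    # fraction of the account hash space covered so far
  fullCycles: int   # number of times 100% capacity was reached

proc addCoverage(desc: var CoverageDesc; fraction: float) =
  ## Accumulate a freshly downloaded account range. Whenever the set
  ## reaches full capacity, increment the counter and reset to empty.
  desc.covered += fraction
  while 1.0 <= desc.covered:
    desc.covered -= 1.0
    desc.fullCycles.inc

proc totalCoverage(desc: CoverageDesc): float =
  ## e.g. 2.0 means the account hash space was covered twice over
  desc.fullCycles.float + desc.covered
```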

* Update range length comparisons (mod 2^256)

why:
  A range interval can have size 1..2^256 as it cannot be empty by
  definition, whereas a set of range intervals can hold 0..2^256
  points. As the scalar range is a residue class modulo 2^256, the
  residue class 0 means length 2^256 for a range interval, but can mean
  0 or 2^256 for the number of points in a set of range intervals.
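
  The toy comparison below illustrates the convention, scaled down from
  residues mod 2^256 to mod 2^8 so that plain `uint8` arithmetic
  suffices (illustrative only, not library code):

```nim
# Interval lengths as residues mod 2^8: the residue 0 encodes the
# maximal length 2^8 because an interval cannot be empty.
proc ivLenLeq(a, b: uint8): bool =
  if a == b: true
  elif a == 0: false   # `a` has the maximal length, nothing is longer
  elif b == 0: true    # `b` has the maximal length
  else: a <= b

# For the number of points in a *set* of intervals, the residue 0 can
# really mean empty, so it has to be disambiguated by context.
```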

* Generalised `hexaryEnvelopeDecompose()`

details:
  Compute the complement of the union of some (processed) intervals and
  express this complement as a list of envelopes of sub-tries.

  This facility is directly applicable to swap-in book-keeping.
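
  A toy version of that complement step over a small integer key space
  (the proc name is made up; in the real code each gap would then be
  expressed as a list of sub-trie envelopes):

```nim
# Given disjoint, ascending, processed [lo,hi] intervals over the key
# space 0..maxKey, return the unprocessed gaps, i.e. the complement of
# their union.
proc complement(processed: seq[(int, int)]; maxKey: int): seq[(int, int)] =
  var nextLo = 0
  for (lo, hi) in processed:
    if nextLo < lo:
      result.add (nextLo, lo - 1)
    nextLo = hi + 1
  if nextLo <= maxKey:
    result.add (nextLo, maxKey)

assert complement(@[(2, 3), (7, 9)], 15) == @[(0, 1), (4, 6), (10, 15)]
```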

* Re-factor `swapIn()`

why:
  Good idea, but the implementation was baloney. The main algorithm is
  now based on the generalised version of `hexaryEnvelopeDecompose()`
  which has been derived from this implementation.

* Refactor `healAccounts()` using `hexaryEnvelopeDecompose()` as main driver

why:
  Previously, the hexary trie was searched recursively for dangling
  nodes, which has poor worst-case performance already when the trie is
  reasonably populated.

  The function `hexaryEnvelopeDecompose()` is an order of magnitude
  faster because it does not peruse existing sub-tries in order to find
  missing nodes, although its result is not fully compatible with that
  of the previous function.

  So recursive search is used in a limited mode only when the decomposer
  will not deliver a useful result.
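
  A hypothetical outline of that control flow (all names are stand-ins,
  not the real API):

```nim
type HealJob = tuple[lo, hi: int]

proc pickHealJobs(decomposed: seq[HealJob];
                  slowScan: proc(): seq[HealJob]): seq[HealJob] =
  ## Prefer the decomposer output; fall back to the (limited-mode)
  ## recursive trie search only when nothing useful was delivered.
  if 0 < decomposed.len:
    decomposed      # fast path: no sub-trie perusal needed
  else:
    slowScan()      # slow path: recursive search for dangling nodes
```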

* Logging & maintenance fixes

details:
  Preparation for abandoning the buddy-global healing variables `node`,
  `resumeCtx`, and `lockTriePerusal`. These variables are centred
  around trie perusal, which will be run on the back burner in favour
  of `hexaryEnvelopeDecompose()`, already used for accounts healing.
2022-12-19 21:22:09 +00:00


# Nimbus
# Copyright (c) 2021 Status Research & Development GmbH
# Licensed under either of
#  * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or
#    http://www.apache.org/licenses/LICENSE-2.0)
#  * MIT license ([LICENSE-MIT](LICENSE-MIT) or
#    http://opensource.org/licenses/MIT)
# at your option. This file may not be copied, modified, or distributed
# except according to those terms.

import
  std/[hashes, options, sets, strutils],
  chronicles,
  chronos,
  eth/[common, p2p],
  stew/[interval_set, keyed_queue],
  ../../db/select_backend,
  ../../utils/prettify,
  ../misc/best_pivot,
  ".."/[protocol, sync_desc],
  ./worker/[pivot, ticker],
  ./worker/com/com_error,
  ./worker/db/[hexary_desc, snapdb_desc, snapdb_pivot],
  "."/[constants, range_desc, worker_desc]

{.push raises: [Defect].}

logScope:
  topics = "snap-buddy"

const
  extraTraceMessages = false or true
    ## Enabled additional logging noise

# ------------------------------------------------------------------------------
# Private helpers: integration of pivot finder
# ------------------------------------------------------------------------------

proc pivot(ctx: SnapCtxRef): BestPivotCtxRef =
  # Getter
  ctx.data.pivotFinderCtx.BestPivotCtxRef

proc `pivot=`(ctx: SnapCtxRef; val: BestPivotCtxRef) =
  # Setter
  ctx.data.pivotFinderCtx = val

proc pivot(buddy: SnapBuddyRef): BestPivotWorkerRef =
  # Getter
  buddy.data.pivotFinder.BestPivotWorkerRef

proc `pivot=`(buddy: SnapBuddyRef; val: BestPivotWorkerRef) =
  # Setter
  buddy.data.pivotFinder = val

# ------------------------------------------------------------------------------
# Private functions
# ------------------------------------------------------------------------------

proc recoveryStepContinue(ctx: SnapCtxRef): Future[bool] {.async.} =
  let recov = ctx.data.recovery
  if recov.isNil:
    return false

  let
    checkpoint =
      "#" & $recov.state.header.blockNumber & "(" & $recov.level & ")"
    topLevel = recov.level == 0
    env = block:
      let rc = ctx.data.pivotTable.eq recov.state.header.stateRoot
      if rc.isErr:
        error "Recovery pivot context gone", checkpoint, topLevel
        return false
      rc.value

  # Cosmetics: allow other processes (e.g. ticker) to log the current recovery
  # state. There is no other intended purpose of this wait state.
  await sleepAsync 1100.milliseconds

  #when extraTraceMessages:
  #  trace "Recovery continued ...", checkpoint, topLevel,
  #    nAccounts=recov.state.nAccounts, nDangling=recov.state.dangling.len

  # Update pivot data from recovery checkpoint
  env.recoverPivotFromCheckpoint(ctx, topLevel)

  # Fetch next recovery record if there is any
  if recov.state.predecessor.isZero:
    #when extraTraceMessages:
    #  trace "Recovery done", checkpoint, topLevel
    return false
  let rc = ctx.data.snapDb.recoverPivot(recov.state.predecessor)
  if rc.isErr:
    when extraTraceMessages:
      trace "Recovery stopped at pivot stale checkpoint", checkpoint, topLevel
    return false

  # Set up next level pivot checkpoint
  ctx.data.recovery = SnapRecoveryRef(
    state: rc.value,
    level: recov.level + 1)

  # Push onto pivot table and continue recovery (i.e. do not stop it yet)
  ctx.data.pivotTable.update(
    ctx.data.recovery.state.header, ctx, reverse=true)

  return true # continue recovery

proc updateSinglePivot(buddy: SnapBuddyRef): Future[bool] {.async.} =
  ## Helper, negotiate pivot unless present
  if buddy.pivot.pivotHeader.isOk:
    return true

  let
    ctx = buddy.ctx
    peer = buddy.peer
    env = ctx.data.pivotTable.lastValue.get(otherwise = nil)
    nMin = if env.isNil: none(BlockNumber)
           else: some(env.stateHeader.blockNumber)

  if await buddy.pivot.pivotNegotiate(nMin):
    var header = buddy.pivot.pivotHeader.value

    # Check whether there is no environment change needed
    when pivotEnvStopChangingIfComplete:
      let rc = ctx.data.pivotTable.lastValue
      if rc.isOk and rc.value.storageDone:
        # No need to change
        when extraTraceMessages:
          trace "No need to change snap pivot", peer,
            pivot=("#" & $rc.value.stateHeader.blockNumber),
            stateRoot=rc.value.stateHeader.stateRoot,
            multiOk=buddy.ctrl.multiOk, runState=buddy.ctrl.state
        return true

    buddy.ctx.data.pivotTable.update(header, buddy.ctx)

    info "Snap pivot initialised", peer, pivot=("#" & $header.blockNumber),
      multiOk=buddy.ctrl.multiOk, runState=buddy.ctrl.state

    return true

# ------------------------------------------------------------------------------
# Public start/stop and admin functions
# ------------------------------------------------------------------------------

proc setup*(ctx: SnapCtxRef; tickerOK: bool): bool =
  ## Global set up
  ctx.data.coveredAccounts = NodeTagRangeSet.init()
  ctx.data.snapDb =
    if ctx.data.dbBackend.isNil: SnapDbRef.init(ctx.chain.db.db)
    else: SnapDbRef.init(ctx.data.dbBackend)
  ctx.pivot = BestPivotCtxRef.init(ctx.data.rng)
  ctx.pivot.pivotRelaxedMode(enable = true)
  if tickerOK:
    ctx.data.ticker = TickerRef.init(ctx.data.pivotTable.tickerStats(ctx))
  else:
    trace "Ticker is disabled"

  # Check for recovery mode
  if not ctx.data.noRecovery:
    let rc = ctx.data.snapDb.recoverPivot()
    if rc.isOk:
      ctx.data.recovery = SnapRecoveryRef(state: rc.value)
      ctx.daemon = true

      # Set up early initial pivot
      ctx.data.pivotTable.update(ctx.data.recovery.state.header, ctx)
      trace "Recovery started",
        checkpoint=("#" & $ctx.data.pivotTable.topNumber() & "(0)")
      if not ctx.data.ticker.isNil:
        ctx.data.ticker.startRecovery()
  true

proc release*(ctx: SnapCtxRef) =
  ## Global clean up
  ctx.pivot = nil
  if not ctx.data.ticker.isNil:
    ctx.data.ticker.stop()
    ctx.data.ticker = nil

proc start*(buddy: SnapBuddyRef): bool =
  ## Initialise worker peer
  let
    ctx = buddy.ctx
    peer = buddy.peer
  if peer.supports(protocol.snap) and
     peer.supports(protocol.eth) and
     peer.state(protocol.eth).initialized:
    buddy.pivot = BestPivotWorkerRef.init(
      buddy.ctx.pivot, buddy.ctrl, buddy.peer)
    buddy.data.errors = ComErrorStatsRef()
    if not ctx.data.ticker.isNil:
      ctx.data.ticker.startBuddy()
    return true

proc stop*(buddy: SnapBuddyRef) =
  ## Clean up this peer
  let
    ctx = buddy.ctx
    peer = buddy.peer
  buddy.ctrl.stopped = true
  buddy.pivot.clear()
  if not ctx.data.ticker.isNil:
    ctx.data.ticker.stopBuddy()

# ------------------------------------------------------------------------------
# Public functions
# ------------------------------------------------------------------------------

proc runDaemon*(ctx: SnapCtxRef) {.async.} =
  ## Enabled while `ctx.daemon` is `true`
  ##
  if not ctx.data.recovery.isNil:
    if not await ctx.recoveryStepContinue():
      # Done, stop recovery
      ctx.data.recovery = nil
      ctx.daemon = false

      # Update logging
      if not ctx.data.ticker.isNil:
        ctx.data.ticker.stopRecovery()
    return

proc runSingle*(buddy: SnapBuddyRef) {.async.} =
  ## Enabled while
  ## * `buddy.ctrl.multiOk` is `false`
  ## * `buddy.ctrl.poolMode` is `false`
  ##
  let peer = buddy.peer

  # Find pivot, probably relaxed mode enabled in `setup()`
  if not await buddy.updateSinglePivot():
    # Wait if needed, then return => repeat
    if not buddy.ctrl.stopped:
      await sleepAsync(2.seconds)
    return

  buddy.ctrl.multiOk = true

proc runPool*(buddy: SnapBuddyRef, last: bool): bool =
  ## Enabled when `buddy.ctrl.poolMode` is `true`
  ##
  let ctx = buddy.ctx
  ctx.poolMode = false
  result = true

proc runMulti*(buddy: SnapBuddyRef) {.async.} =
  ## Enabled while
  ## * `buddy.ctx.multiOk` is `true`
  ## * `buddy.ctx.poolMode` is `false`
  ##
  let
    ctx = buddy.ctx
    peer = buddy.peer

  # Set up current state root environment for accounts snapshot
  let
    env = block:
      let rc = ctx.data.pivotTable.lastValue
      if rc.isErr:
        return # nothing to do
      rc.value
    pivot = "#" & $env.stateHeader.blockNumber # for logging

  buddy.data.pivotEnv = env

  # Full sync processing based on current snapshot
  # ----------------------------------------------
  if env.storageDone:
    trace "Snap full sync -- not implemented yet", peer, pivot
    await sleepAsync(5.seconds)
    return

  # Snapshot sync processing
  # ------------------------

  # If this is a new pivot, the previous one can be cleaned up. There is no
  # point in keeping some older space consuming state data any longer.
  ctx.data.pivotTable.beforeTopMostlyClean()

  when extraTraceMessages:
    block:
      let
        nAccounts = env.nAccounts
        nSlotLists = env.nSlotLists
        processed = env.fetchAccounts.processed.fullFactor.toPC(2)
        nStoQu = env.fetchStorageFull.len + env.fetchStoragePart.len
        accHealThresh = env.healThresh.toPC(2)
      trace "Multi sync runner", peer, pivot, nAccounts, nSlotLists, processed,
        nStoQu, accHealThresh

  # This one is the syncing work horse which downloads the database
  await env.execSnapSyncAction(buddy)

  if env.archived:
    let
      peer = buddy.peer
      nAccounts = env.nAccounts
      nSlotLists = env.nSlotLists
    when extraTraceMessages:
      trace "Mothballing", peer, pivot=("#" & $env.stateHeader.blockNumber),
        nAccounts=env.nAccounts, nSlotLists=env.nSlotLists
    env.pivotMothball()
    return # pivot has changed

  block:
    # Save state so sync can be partially resumed at next start up
    let
      nAccounts = env.nAccounts
      nSlotLists = env.nSlotLists
      processed = env.fetchAccounts.processed.fullFactor.toPC(2)
      nStoQu = env.fetchStorageFull.len + env.fetchStoragePart.len
      accHealThresh = env.healThresh.toPC(2)
      rc = env.saveCheckpoint(ctx)
    if rc.isErr:
      error "Failed to save recovery checkpoint", peer, pivot, nAccounts,
        nSlotLists, processed, nStoQu, error=rc.error
    else:
      when extraTraceMessages:
        trace "Saved recovery checkpoint", peer, pivot, nAccounts, nSlotLists,
          processed, nStoQu, blobSize=rc.value, accHealThresh

  if buddy.ctrl.stopped:
    return # peer worker has gone

# ------------------------------------------------------------------------------
# End
# ------------------------------------------------------------------------------