Fix/clarify single mode for async sync scheduler (#1292)

why:
  Single mode here means that at most one such (single mode) instance is
  activated at a time, while multi mode instances for other peers are still
  allowed.

  Erroneously, multi mode instances were held back and kept waiting while
  some single mode instance was running, which reduced the number of
  parallel download peers.
Jordan Hrycaj 2022-11-09 19:16:25 +00:00 committed by GitHub
parent e14fd4b96c
commit 21837546c3
2 changed files with 75 additions and 62 deletions
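
Illustration (not part of the commit): a minimal standalone sketch of the per-lap
decision rule that the reworked scheduler loop in the diff below ends up with.
The names `SchedState`, `PeerAction` and `pickAction` are made up for this note
and do not exist in the repository; the real logic lives in `workerLoop()` and
uses the `monitorLock`, `singleRunLock` and `activeMulti` fields shown below.

  type
    SchedState = object
      monitorLock: bool     ## a pool/monitor run is active
      singleRunLock: bool   ## some single mode runner is active
      activeMulti: int      ## number of active multi mode runners

    PeerAction = enum
      paSuspend, paRunPool, paRunMulti, paRunSingle

  proc pickAction(s: SchedState; poolMode, multiOk: bool): PeerAction =
    ## Mirrors the `if/elif/else` cascade of the new loop body.
    if s.monitorLock:
      paSuspend            # a pool/monitor run is active: idle for this lap
    elif poolMode:
      paRunPool            # grab `monitorLock` and run `runPool()` over all peers
    elif multiOk:
      # multi mode peers pause only while `runSingle()` actually runs;
      # single mode no longer waits for the multi mode runners to drain
      if s.singleRunLock: paSuspend else: paRunMulti
    elif s.singleRunLock:
      paSuspend            # at most one single mode instance at a time
    else:
      paRunSingle          # grab `singleRunLock` and run `runSingle()`

  when isMainModule:
    # A single mode run may start even though multi mode runners are active.
    let s = SchedState(activeMulti: 3)
    doAssert pickAction(s, poolMode = false, multiOk = false) == paRunSingle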

@@ -274,10 +274,10 @@ proc runSingle*(buddy: SnapBuddyRef) {.async.} =
   ## This peer worker is invoked if the peer-local flag `buddy.ctrl.multiOk`
   ## is set `false` which is the default mode. This flag is updated by the
   ## worker when deemed appropriate.
-  ## * For all workers, there can be only one `runSingle()` function active
-  ##   simultaneously for all worker peers.
-  ## * There will be no `runMulti()` function active for the same worker peer
-  ##   simultaneously
+  ## * For all worker peers, there can be only one `runSingle()` function
+  ##   active simultaneously.
+  ## * There will be no `runMulti()` function active for the very same worker
+  ##   peer that runs the `runSingle()` function.
   ## * There will be no `runPool()` iterator active simultaneously.
   ##
   ## Note that this function runs in `async` mode.
@@ -329,6 +329,8 @@ proc runPool*(buddy: SnapBuddyRef, last: bool) =
     # FIXME: This check might not be needed. It will visit *every* node
     #        in the hexary trie for checking the account leaves.
+    #
+    # Note: This is insane on main net
     if buddy.checkAccountsTrieIsComplete(env):
       env.accountsState = HealerDone

@@ -50,11 +50,11 @@
 ## This worker peer method is invoked if the peer-local flag
 ## `buddy.ctrl.multiOk` is set `false` which is the default mode. This flag
 ## is updated by the worker peer when deemed appropriate.
-## + For all workers, there can be only one `runSingle()` function active
-##   simultaneously for all worker peers.
-## + There will be no `runMulti()` function active for the same worker peer
-##   simultaneously
-## + There will be no `runPool()` iterator active simultaneously.
+## + For all worker peers, there can be only one `runSingle()` function
+##   active simultaneously.
+## + There will be no `runMulti()` function active for the very same worker
+##   peer that runs the `runSingle()` function.
+## + There will be no `runPool()` iterator active.
 ##
 ## Note that this function runs in `async` mode.
 ##
@@ -99,15 +99,28 @@ type
     pool: PeerPool                  ## For starting the system
     buddies: ActiveBuddies[S,W]     ## LRU cache with worker descriptors
     tickerOk: bool                  ## Ticker logger
-    singleRunLock: bool             ## For worker initialisation
-    monitorLock: bool               ## For worker monitor
-    activeMulti: int                ## Activated runners
+    singleRunLock: bool             ## Some single mode runner is activated
+    monitorLock: bool               ## Monitor mode is activated
+    activeMulti: int                ## Number of activated runners in multi-mode

   RunnerBuddyRef[S,W] = ref object
     ## Per worker peer descriptor
     dsc: RunnerSyncRef[S,W]         ## Scheduler descriptor
     worker: BuddyRef[S,W]           ## Worker peer data

+const
+  execLoopTimeElapsedMin = 50.milliseconds
+    ## Minimum elapsed time the event loop needs for a single lap. If it
+    ## is faster, asynchronous sleep seconds are added in order to avoid
+    ## cpu overload.
+
+  execLoopTaskSwitcher = 1.nanoseconds
+    ## Asynchronous waiting time at the end of the exec loop unless some sleep
+    ## seconds were added as described by `execLoopTimeElapsedMin`, above.
+
+  execLoopPollingTime = 50.milliseconds
+    ## Single asynchronous time interval wait state for event polling
+
 # ------------------------------------------------------------------------------
 # Private helpers
 # ------------------------------------------------------------------------------
@@ -129,67 +142,65 @@ proc workerLoop[S,W](buddy: RunnerBuddyRef[S,W]) {.async.} =
     peer = worker.peer

   # Continue until stopped
-  while not worker.ctrl.stopped:
-    if dsc.monitorLock:
-      await sleepAsync(50.milliseconds)
-      continue
-
-    # Invoke `runPool()` over all buddies if requested
-    if ctx.poolMode:
-      # Grab `monitorLock` (was `false` as checked above) and wait until clear
-      # to run as the only activated instance.
-      dsc.monitorLock = true
-      block poolModeExec:
-        while 0 < dsc.activeMulti:
-          await sleepAsync(50.milliseconds)
-          if worker.ctrl.stopped:
-            break poolModeExec
-        while dsc.singleRunLock:
-          await sleepAsync(50.milliseconds)
-          if worker.ctrl.stopped:
-            break poolModeExec
-        var count = dsc.buddies.len
-        for w in dsc.buddies.nextValues:
-          count.dec
-          worker.runPool(count == 0)
-        # End `block poolModeExec`
-      dsc.monitorLock = false
-      continue
-
-    # Rotate connection table so the most used entry is at the top/right
-    # end. So zombies will end up leftish.
-    discard dsc.buddies.lruFetch(peer.hash)
-
-    # Allow task switch
-    await sleepAsync(1.milliseconds)
-    if worker.ctrl.stopped:
-      break
-
-    # Multi mode
-    if worker.ctrl.multiOk:
-      if not dsc.singleRunLock:
-        dsc.activeMulti.inc
-        # Continue doing something, work a bit
-        await worker.runMulti()
-        dsc.activeMulti.dec
-      continue
-
-    # Single mode as requested. The `multiOk` flag for this worker was just
-    # found `false` in the pervious clause.
-    if not dsc.singleRunLock:
-      # Lock single instance mode and wait for other workers to finish
-      dsc.singleRunLock = true
-      block singleModeExec:
-        while 0 < dsc.activeMulti:
-          await sleepAsync(50.milliseconds)
-          if worker.ctrl.stopped:
-            break singleModeExec
-        # Run single instance and release afterwards
-        await worker.runSingle()
-        # End `block singleModeExec`
-      dsc.singleRunLock = false
-
-  # End while
+  block taskExecLoop:
+    while worker.ctrl.running:
+      # Enforce minimum time spend on this loop
+      let startMoment = Moment.now()
+
+      if dsc.monitorLock:
+        discard # suspend some time at the end of loop body
+
+      # Invoke `runPool()` over all buddies if requested
+      elif ctx.poolMode:
+        # Grab `monitorLock` (was `false` as checked above) and wait until
+        # clear to run as the only activated instance.
+        dsc.monitorLock = true
+        while 0 < dsc.activeMulti or dsc.singleRunLock:
+          await sleepAsync execLoopPollingTime
+          if worker.ctrl.stopped:
+            dsc.monitorLock = false
+            break taskExecLoop
+        var count = dsc.buddies.len
+        for w in dsc.buddies.nextValues:
+          count.dec
+          worker.runPool(count == 0)
+        dsc.monitorLock = false
+
+      else:
+        # Rotate connection table so the most used entry is at the top/right
+        # end. So zombies will end up leftish.
+        discard dsc.buddies.lruFetch(peer.hash)
+
+        # Multi mode
+        if worker.ctrl.multiOk:
+          if not dsc.singleRunLock:
+            dsc.activeMulti.inc
+            # Continue doing something, work a bit
+            await worker.runMulti()
+            dsc.activeMulti.dec
+
+        elif dsc.singleRunLock:
+          # Some other process is running single mode
+          discard # suspend some time at the end of loop body
+
+        else:
+          # Start single instance mode by grabbing `singleRunLock` (was
+          # `false` as checked above).
+          dsc.singleRunLock = true
+          await worker.runSingle()
+          dsc.singleRunLock = false
+
+      if worker.ctrl.stopped:
+        break taskExecLoop
+
+      # Enforce minimum time spend on this loop so we never reach 100% cpu load
+      # caused by some empty sub-tasks which are out of this scheduler control.
+      let
+        elapsed = Moment.now() - startMoment
+        suspend = if execLoopTimeElapsedMin <= elapsed: execLoopTaskSwitcher
+                  else: execLoopTimeElapsedMin - elapsed
+      await sleepAsync suspend
+      # End while

   # Note that `runStart()` was dispatched in `onPeerConnected()`
   worker.runStop()
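
For reference, a standard-library-only sketch of the minimum lap time throttle
introduced above: if one lap of the exec loop finished faster than
`execLoopTimeElapsedMin`, the scheduler sleeps for the remainder, otherwise it
only yields for `execLoopTaskSwitcher`. The real loop uses chronos' `Moment.now()`
and `sleepAsync`; the `lapSuspend` helper and the use of `std/monotimes` and
`std/times` below are assumptions made for this standalone note.

  import std/[monotimes, times, os]

  let
    execLoopTimeElapsedMin = initDuration(milliseconds = 50)
    execLoopTaskSwitcher   = initDuration(nanoseconds = 1)

  proc lapSuspend(startMoment: MonoTime): Duration =
    ## How long to suspend at the end of one scheduler lap.
    let elapsed = getMonoTime() - startMoment
    if execLoopTimeElapsedMin <= elapsed: execLoopTaskSwitcher
    else: execLoopTimeElapsedMin - elapsed

  when isMainModule:
    let start = getMonoTime()
    sleep 10                                # pretend the lap body took ~10 ms
    echo "suspend for ", lapSuspend(start)  # roughly the remaining ~40 ms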