From 21837546c335c43f589144b76697dca3231a189c Mon Sep 17 00:00:00 2001
From: Jordan Hrycaj
Date: Wed, 9 Nov 2022 19:16:25 +0000
Subject: [PATCH] Fix/clarify single mode for async sync scheduler (#1292)

why:
  Single mode here means that there is only one such (single mode) instance
  activated, but multi mode instances for other peers are allowed.
  Erroneously, multi mode instances were held back waiting while some single
  mode instance was running, which reduced the number of parallel download
  peers.
---
 nimbus/sync/snap/worker.nim |  10 +--
 nimbus/sync/sync_sched.nim  | 127 ++++++++++++++++++++----------------
 2 files changed, 75 insertions(+), 62 deletions(-)

diff --git a/nimbus/sync/snap/worker.nim b/nimbus/sync/snap/worker.nim
index e0ac9709a..84d9748fa 100644
--- a/nimbus/sync/snap/worker.nim
+++ b/nimbus/sync/snap/worker.nim
@@ -274,10 +274,10 @@ proc runSingle*(buddy: SnapBuddyRef) {.async.} =
   ## This peer worker is invoked if the peer-local flag `buddy.ctrl.multiOk`
   ## is set `false` which is the default mode. This flag is updated by the
   ## worker when deemed appropriate.
-  ## * For all workers, there can be only one `runSingle()` function active
-  ##   simultaneously for all worker peers.
-  ## * There will be no `runMulti()` function active for the same worker peer
-  ##   simultaneously
+  ## * For all worker peers, there can be only one `runSingle()` function
+  ##   active simultaneously.
+  ## * There will be no `runMulti()` function active for the very same worker
+  ##   peer that runs the `runSingle()` function.
   ## * There will be no `runPool()` iterator active simultaneously.
   ##
   ## Note that this function runs in `async` mode.
@@ -329,6 +329,8 @@ proc runPool*(buddy: SnapBuddyRef, last: bool) =
 
       # FIXME: This check might not be needed. It will visit *every* node
      #        in the hexary trie for checking the account leaves.
+      #
+      #        Note: This is prohibitively expensive on mainnet.
       if buddy.checkAccountsTrieIsComplete(env):
         env.accountsState = HealerDone
 
diff --git a/nimbus/sync/sync_sched.nim b/nimbus/sync/sync_sched.nim
index 106593987..00de6701d 100644
--- a/nimbus/sync/sync_sched.nim
+++ b/nimbus/sync/sync_sched.nim
@@ -50,11 +50,11 @@
 ## This worker peer method is invoked if the peer-local flag
 ## `buddy.ctrl.multiOk` is set `false` which is the default mode. This flag
 ## is updated by the worker peer when deemed appropriate.
-## + For all workers, there can be only one `runSingle()` function active
-##   simultaneously for all worker peers.
-## + There will be no `runMulti()` function active for the same worker peer
-##   simultaneously
-## + There will be no `runPool()` iterator active simultaneously.
+## + For all worker peers, there can be only one `runSingle()` function
+##   active simultaneously.
+## + There will be no `runMulti()` function active for the very same worker
+##   peer that runs the `runSingle()` function.
+## + There will be no `runPool()` iterator active.
 ##
 ## Note that this function runs in `async` mode.
 ##
@@ -99,15 +99,28 @@ type
     pool: PeerPool                  ## For starting the system
     buddies: ActiveBuddies[S,W]     ## LRU cache with worker descriptors
     tickerOk: bool                  ## Ticker logger
-    singleRunLock: bool             ## For worker initialisation
-    monitorLock: bool               ## For worker monitor
-    activeMulti: int                ## Activated runners
+    singleRunLock: bool             ## Some single mode runner is activated
+    monitorLock: bool               ## Monitor mode is activated
+    activeMulti: int                ## Number of activated runners in multi-mode
 
   RunnerBuddyRef[S,W] = ref object
     ## Per worker peer descriptor
     dsc: RunnerSyncRef[S,W]         ## Scheduler descriptor
     worker: BuddyRef[S,W]           ## Worker peer data
 
+const
+  execLoopTimeElapsedMin = 50.milliseconds
+    ## Minimum elapsed time the event loop needs for a single lap. If it
+    ## is faster, asynchronous sleep time is added in order to avoid
+    ## CPU overload.
+
+  execLoopTaskSwitcher = 1.nanoseconds
+    ## Asynchronous waiting time at the end of the exec loop unless some
+    ## sleep time was added as described by `execLoopTimeElapsedMin`, above.
+
+  execLoopPollingTime = 50.milliseconds
+    ## Single asynchronous wait interval for event polling
+
 # ------------------------------------------------------------------------------
 # Private helpers
 # ------------------------------------------------------------------------------
@@ -129,67 +142,65 @@ proc workerLoop[S,W](buddy: RunnerBuddyRef[S,W]) {.async.} =
     peer = worker.peer
 
   # Continue until stopped
-  while not worker.ctrl.stopped:
-    if dsc.monitorLock:
-      await sleepAsync(50.milliseconds)
-      continue
+  block taskExecLoop:
+    while worker.ctrl.running:
+      # Enforce minimum time spent on this loop
+      let startMoment = Moment.now()
 
-    # Invoke `runPool()` over all buddies if requested
-    if ctx.poolMode:
-      # Grab `monitorLock` (was `false` as checked above) and wait until clear
-      # to run as the only activated instance.
-      dsc.monitorLock = true
-      block poolModeExec:
-        while 0 < dsc.activeMulti:
-          await sleepAsync(50.milliseconds)
+      if dsc.monitorLock:
+        discard # suspend some time at the end of loop body
+
+      # Invoke `runPool()` over all buddies if requested
+      elif ctx.poolMode:
+        # Grab `monitorLock` (was `false` as checked above) and wait until
+        # clear to run as the only activated instance.
+        dsc.monitorLock = true
+        while 0 < dsc.activeMulti or dsc.singleRunLock:
+          await sleepAsync execLoopPollingTime
           if worker.ctrl.stopped:
-            break poolModeExec
-        while dsc.singleRunLock:
-          await sleepAsync(50.milliseconds)
-          if worker.ctrl.stopped:
-            break poolModeExec
+            dsc.monitorLock = false
+            break taskExecLoop
         var count = dsc.buddies.len
         for w in dsc.buddies.nextValues:
           count.dec
           worker.runPool(count == 0)
-        # End `block poolModeExec`
-      dsc.monitorLock = false
-      continue
+        dsc.monitorLock = false
 
-    # Rotate connection table so the most used entry is at the top/right
-    # end. So zombies will end up leftish.
-    discard dsc.buddies.lruFetch(peer.hash)
+      else:
+        # Rotate connection table so the most used entry is at the top/right
+        # end. So zombies will end up leftish.
+        discard dsc.buddies.lruFetch(peer.hash)
 
-    # Allow task switch
-    await sleepAsync(1.milliseconds)
-    if worker.ctrl.stopped:
-      break
+        # Multi mode
+        if worker.ctrl.multiOk:
+          if not dsc.singleRunLock:
+            dsc.activeMulti.inc
+            # Continue doing something, work a bit
+            await worker.runMulti()
+            dsc.activeMulti.dec
 
-    # Multi mode
-    if worker.ctrl.multiOk:
-      if not dsc.singleRunLock:
-        dsc.activeMulti.inc
-        # Continue doing something, work a bit
-        await worker.runMulti()
-        dsc.activeMulti.dec
-      continue
+        elif dsc.singleRunLock:
+          # Some other process is running single mode
+          discard # suspend some time at the end of loop body
 
-    # Single mode as requested. The `multiOk` flag for this worker was just
-    # found `false` in the pervious clause.
-    if not dsc.singleRunLock:
-      # Lock single instance mode and wait for other workers to finish
-      dsc.singleRunLock = true
-      block singleModeExec:
-        while 0 < dsc.activeMulti:
-          await sleepAsync(50.milliseconds)
-          if worker.ctrl.stopped:
-            break singleModeExec
-        # Run single instance and release afterwards
-        await worker.runSingle()
-        # End `block singleModeExec`
-      dsc.singleRunLock = false
+        else:
+          # Start single instance mode by grabbing `singleRunLock` (was
+          # `false` as checked above).
+          dsc.singleRunLock = true
+          await worker.runSingle()
+          dsc.singleRunLock = false
 
-    # End while
+      if worker.ctrl.stopped:
+        break taskExecLoop
+
+      # Enforce a minimum lap time so we never reach 100% CPU load due to
+      # empty sub-tasks which are outside this scheduler's control.
+      let
+        elapsed = Moment.now() - startMoment
+        suspend = if execLoopTimeElapsedMin <= elapsed: execLoopTaskSwitcher
+                  else: execLoopTimeElapsedMin - elapsed
+      await sleepAsync suspend
+      # End while
 
   # Note that `runStart()` was dispatched in `onPeerConnected()`
   worker.runStop()
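
As a side note, the loop pacing added at the end of `workerLoop()` can be read in isolation. Below is a minimal sketch, not part of the patch, of that throttling idea under the same constants, assuming only the chronos `Moment`/`sleepAsync` API that the scheduler already uses; `oneLapOfWork` and `pacedLoop` are hypothetical names introduced here for illustration.

import chronos

const
  execLoopTimeElapsedMin = 50.milliseconds  # minimum duration of one loop lap
  execLoopTaskSwitcher = 1.nanoseconds      # bare task switch for laps that were slow anyway

proc oneLapOfWork() {.async.} =
  ## Hypothetical stand-in for the per-lap work, i.e. one of `runSingle()`,
  ## `runMulti()` or the `runPool()` sweep.
  discard

proc pacedLoop(nLaps: int) {.async.} =
  ## Run `nLaps` laps, never spinning faster than one lap per
  ## `execLoopTimeElapsedMin`, so an empty lap cannot busy-loop the CPU.
  for _ in 0 ..< nLaps:
    let startMoment = Moment.now()

    await oneLapOfWork()

    # Top up short laps to `execLoopTimeElapsedMin`; long laps merely yield
    # to the dispatcher so other async tasks can run.
    let
      elapsed = Moment.now() - startMoment
      suspend = if execLoopTimeElapsedMin <= elapsed: execLoopTaskSwitcher
                else: execLoopTimeElapsedMin - elapsed
    await sleepAsync suspend

when isMainModule:
  waitFor pacedLoop(3)

Even when a lap already took longer than `execLoopTimeElapsedMin`, the one-nanosecond sleep still yields to the chronos dispatcher once per lap, so a busy worker cannot starve other futures.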