Fix/clarify single mode for async sync scheduler (#1292)

why:
  Single mode here means that at most one such (single mode) instance is
  activated at a time, while multi mode instances for other peers are still
  allowed.

  Erroneously, multi mode instances were held back and kept waiting while
  some single mode instance was running, which reduced the number of
  parallel download peers.
Jordan Hrycaj 2022-11-09 19:16:25 +00:00 committed by GitHub
parent e14fd4b96c
commit 21837546c3
2 changed files with 75 additions and 62 deletions
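
Illustration (not part of the commit): a minimal standalone sketch of the per-lap
decision rule that the reworked scheduler loop in the diff below ends up with.
The names `SchedState`, `PeerAction` and `pickAction` are made up for this note
and do not exist in the repository; the real logic lives in `workerLoop()` and
uses the `monitorLock`, `singleRunLock` and `activeMulti` fields shown below.

  type
    SchedState = object
      monitorLock: bool     ## a pool/monitor run is active
      singleRunLock: bool   ## some single mode runner is active
      activeMulti: int      ## number of active multi mode runners

    PeerAction = enum
      paSuspend, paRunPool, paRunMulti, paRunSingle

  proc pickAction(s: SchedState; poolMode, multiOk: bool): PeerAction =
    ## Mirrors the `if/elif/else` cascade of the new loop body.
    if s.monitorLock:
      paSuspend            # a pool/monitor run is active: idle for this lap
    elif poolMode:
      paRunPool            # grab `monitorLock` and run `runPool()` over all peers
    elif multiOk:
      # multi mode peers pause only while `runSingle()` actually runs;
      # single mode no longer waits for the multi mode runners to drain
      if s.singleRunLock: paSuspend else: paRunMulti
    elif s.singleRunLock:
      paSuspend            # at most one single mode instance at a time
    else:
      paRunSingle          # grab `singleRunLock` and run `runSingle()`

  when isMainModule:
    # A single mode run may start even though multi mode runners are active.
    let s = SchedState(activeMulti: 3)
    doAssert pickAction(s, poolMode = false, multiOk = false) == paRunSingle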

@@ -274,10 +274,10 @@ proc runSingle*(buddy: SnapBuddyRef) {.async.} =
   ## This peer worker is invoked if the peer-local flag `buddy.ctrl.multiOk`
   ## is set `false` which is the default mode. This flag is updated by the
   ## worker when deemed appropriate.
-  ## * For all workers, there can be only one `runSingle()` function active
-  ##   simultaneously for all worker peers.
-  ## * There will be no `runMulti()` function active for the same worker peer
-  ##   simultaneously
+  ## * For all worker peers, there can be only one `runSingle()` function
+  ##   active simultaneously.
+  ## * There will be no `runMulti()` function active for the very same worker
+  ##   peer that runs the `runSingle()` function.
   ## * There will be no `runPool()` iterator active simultaneously.
   ##
   ## Note that this function runs in `async` mode.
@@ -329,6 +329,8 @@ proc runPool*(buddy: SnapBuddyRef, last: bool) =
     # FIXME: This check might not be needed. It will visit *every* node
     #        in the hexary trie for checking the account leaves.
+    #
+    # Note: This is insane on main net
     if buddy.checkAccountsTrieIsComplete(env):
       env.accountsState = HealerDone

@@ -50,11 +50,11 @@
 ## This worker peer method is invoked if the peer-local flag
 ## `buddy.ctrl.multiOk` is set `false` which is the default mode. This flag
 ## is updated by the worker peer when deemed appropriate.
-## + For all workers, there can be only one `runSingle()` function active
-##   simultaneously for all worker peers.
-## + There will be no `runMulti()` function active for the same worker peer
-##   simultaneously
-## + There will be no `runPool()` iterator active simultaneously.
+## + For all worker peers, there can be only one `runSingle()` function
+##   active simultaneously.
+## + There will be no `runMulti()` function active for the very same worker
+##   peer that runs the `runSingle()` function.
+## + There will be no `runPool()` iterator active.
 ##
 ## Note that this function runs in `async` mode.
 ##
@@ -99,15 +99,28 @@ type
     pool: PeerPool                  ## For starting the system
     buddies: ActiveBuddies[S,W]     ## LRU cache with worker descriptors
     tickerOk: bool                  ## Ticker logger
-    singleRunLock: bool             ## For worker initialisation
-    monitorLock: bool               ## For worker monitor
-    activeMulti: int                ## Activated runners
+    singleRunLock: bool             ## Some single mode runner is activated
+    monitorLock: bool               ## Monitor mode is activated
+    activeMulti: int                ## Number of activated runners in multi-mode

   RunnerBuddyRef[S,W] = ref object
     ## Per worker peer descriptor
     dsc: RunnerSyncRef[S,W]         ## Scheduler descriptor
     worker: BuddyRef[S,W]           ## Worker peer data

+const
+  execLoopTimeElapsedMin = 50.milliseconds
+    ## Minimum elapsed time the event loop needs for a single lap. If it
+    ## is faster, asynchronous sleep seconds are added in order to avoid
+    ## cpu overload.
+
+  execLoopTaskSwitcher = 1.nanoseconds
+    ## Asynchronous waiting time at the end of the exec loop unless some sleep
+    ## seconds were added as described by `execLoopTimeElapsedMin`, above.
+
+  execLoopPollingTime = 50.milliseconds
+    ## Single asynchronous time interval wait state for event polling
+
 # ------------------------------------------------------------------------------
 # Private helpers
 # ------------------------------------------------------------------------------
@@ -129,67 +142,65 @@ proc workerLoop[S,W](buddy: RunnerBuddyRef[S,W]) {.async.} =
     peer = worker.peer

   # Continue until stopped
-  while not worker.ctrl.stopped:
-    if dsc.monitorLock:
-      await sleepAsync(50.milliseconds)
-      continue
-
-    # Invoke `runPool()` over all buddies if requested
-    if ctx.poolMode:
-      # Grab `monitorLock` (was `false` as checked above) and wait until clear
-      # to run as the only activated instance.
-      dsc.monitorLock = true
-      block poolModeExec:
-        while 0 < dsc.activeMulti:
-          await sleepAsync(50.milliseconds)
-          if worker.ctrl.stopped:
-            break poolModeExec
-        while dsc.singleRunLock:
-          await sleepAsync(50.milliseconds)
-          if worker.ctrl.stopped:
-            break poolModeExec
-        var count = dsc.buddies.len
-        for w in dsc.buddies.nextValues:
-          count.dec
-          worker.runPool(count == 0)
-        # End `block poolModeExec`
-      dsc.monitorLock = false
-      continue
-
-    # Rotate connection table so the most used entry is at the top/right
-    # end. So zombies will end up leftish.
-    discard dsc.buddies.lruFetch(peer.hash)
-
-    # Allow task switch
-    await sleepAsync(1.milliseconds)
-    if worker.ctrl.stopped:
-      break
-
-    # Multi mode
-    if worker.ctrl.multiOk:
-      if not dsc.singleRunLock:
-        dsc.activeMulti.inc
-        # Continue doing something, work a bit
-        await worker.runMulti()
-        dsc.activeMulti.dec
-      continue
-
-    # Single mode as requested. The `multiOk` flag for this worker was just
-    # found `false` in the pervious clause.
-    if not dsc.singleRunLock:
-      # Lock single instance mode and wait for other workers to finish
-      dsc.singleRunLock = true
-      block singleModeExec:
-        while 0 < dsc.activeMulti:
-          await sleepAsync(50.milliseconds)
-          if worker.ctrl.stopped:
-            break singleModeExec
-        # Run single instance and release afterwards
-        await worker.runSingle()
-        # End `block singleModeExec`
-      dsc.singleRunLock = false
-
-  # End while
+  block taskExecLoop:
+    while worker.ctrl.running:
+      # Enforce minimum time spend on this loop
+      let startMoment = Moment.now()
+
+      if dsc.monitorLock:
+        discard # suspend some time at the end of loop body
+
+      # Invoke `runPool()` over all buddies if requested
+      elif ctx.poolMode:
+        # Grab `monitorLock` (was `false` as checked above) and wait until
+        # clear to run as the only activated instance.
+        dsc.monitorLock = true
+        while 0 < dsc.activeMulti or dsc.singleRunLock:
+          await sleepAsync execLoopPollingTime
+          if worker.ctrl.stopped:
+            dsc.monitorLock = false
+            break taskExecLoop
+        var count = dsc.buddies.len
+        for w in dsc.buddies.nextValues:
+          count.dec
+          worker.runPool(count == 0)
+        dsc.monitorLock = false
+
+      else:
+        # Rotate connection table so the most used entry is at the top/right
+        # end. So zombies will end up leftish.
+        discard dsc.buddies.lruFetch(peer.hash)
+
+        # Multi mode
+        if worker.ctrl.multiOk:
+          if not dsc.singleRunLock:
+            dsc.activeMulti.inc
+            # Continue doing something, work a bit
+            await worker.runMulti()
+            dsc.activeMulti.dec
+
+        elif dsc.singleRunLock:
+          # Some other process is running single mode
+          discard # suspend some time at the end of loop body
+
+        else:
+          # Start single instance mode by grabbing `singleRunLock` (was
+          # `false` as checked above).
+          dsc.singleRunLock = true
+          await worker.runSingle()
+          dsc.singleRunLock = false
+
+      if worker.ctrl.stopped:
+        break taskExecLoop
+
+      # Enforce minimum time spend on this loop so we never reach 100% cpu load
+      # caused by some empty sub-tasks which are out of this scheduler control.
+      let
+        elapsed = Moment.now() - startMoment
+        suspend = if execLoopTimeElapsedMin <= elapsed: execLoopTaskSwitcher
+                  else: execLoopTimeElapsedMin - elapsed
+      await sleepAsync suspend
+      # End while

   # Note that `runStart()` was dispatched in `onPeerConnected()`
   worker.runStop()
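
For reference, a standard-library-only sketch of the minimum lap time throttle
introduced above: if one lap of the exec loop finished faster than
`execLoopTimeElapsedMin`, the scheduler sleeps for the remainder, otherwise it
only yields for `execLoopTaskSwitcher`. The real loop uses chronos' `Moment.now()`
and `sleepAsync`; the `lapSuspend` helper and the use of `std/monotimes` and
`std/times` below are assumptions made for this standalone note.

  import std/[monotimes, times, os]

  let
    execLoopTimeElapsedMin = initDuration(milliseconds = 50)
    execLoopTaskSwitcher   = initDuration(nanoseconds = 1)

  proc lapSuspend(startMoment: MonoTime): Duration =
    ## How long to suspend at the end of one scheduler lap.
    let elapsed = getMonoTime() - startMoment
    if execLoopTimeElapsedMin <= elapsed: execLoopTaskSwitcher
    else: execLoopTimeElapsedMin - elapsed

  when isMainModule:
    let start = getMonoTime()
    sleep 10                                # pretend the lap body took ~10 ms
    echo "suspend for ", lapSuspend(start)  # roughly the remaining ~40 ms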