refactor: drop asyncSpawn, add asyncSetup/asyncTeardown

Three asyncSpawn usages removed: - sds.nim startPeriodicTasks: stored the periodic-task futures on ReliabilityManager (new field `periodicTasks: seq[FutureBase]`) so cleanup can cancel them on shutdown instead of leaking the loops against a cleared manager. - library/sds_thread/sds_thread.nim: fireSync moved BEFORE processing, then `await SdsThreadRequest.process(...)` instead of asyncSpawn'ing it. Aligns the worker with the SP-channel + lock assumption that there are no concurrent requests; caller throughput is unchanged because the caller only waits for receipt (fireSync), not processing. - tests TestBus repair callback: replaced asyncSpawn(deliverExcept...) with an explicit pending-delivery queue drained by `bus.drain()`. Integration tests no longer rely on `sleepAsync(10ms)` to let spawned deliveries finish — they await drain instead. Tests also pick up an asyncSetup/asyncTeardown pair (tests/async_unittest.nim) so suite fixtures can `await` directly. All `waitFor` in setup/teardown blocks is gone; only the top-level asyncTest wrapper still uses waitFor (once, to drive the async proc to completion). Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-07-02 13:59:41 +00:00 · 2026-05-20 18:02:49 +02:00 · 2026-05-20 18:02:49 +02:00 · 3bcc09de3c
commit 3bcc09de3c
parent 965d67cb01
7 changed files with 147 additions and 84 deletions
--- a/library/sds_thread/sds_thread.nim
+++ b/library/sds_thread/sds_thread.nim
@ -43,13 +43,18 @@ proc runSds(ctx: ptr SdsContext) {.async.} =
      error "sds thread could not receive a request"
      continue

-    ## Handle the request
-    asyncSpawn SdsThreadRequest.process(request, addr rm)
-
+    ## Ack receipt to the requester thread BEFORE processing — it only
+    ## waits for "received", not "processed", so the caller's throughput
+    ## doesn't change. Processing is then awaited (was: asyncSpawn'd),
+    ## which serializes requests on this worker. The SP channel + lock
+    ## above already assume no concurrent requests, so awaiting here
+    ## aligns the processing side with that assumption.
    let fireRes = ctx.reqReceivedSignal.fireSync()
    if fireRes.isErr():
      error "could not fireSync back to requester thread", error = fireRes.error

+    await SdsThreadRequest.process(request, addr rm)
+
 proc run(ctx: ptr SdsContext) {.thread.} =
  ## Launch sds worker
  waitFor runSds(ctx)
--- a/sds.nim
+++ b/sds.nim
@ -511,10 +511,13 @@ proc periodicRepairSweep(
    await sleepAsync(chronos.milliseconds(rm.config.repairSweepInterval.inMilliseconds))

 proc startPeriodicTasks*(rm: ReliabilityManager) =
-  ## Starts the periodic tasks for buffer sweeping and sync message sending.
-  asyncSpawn rm.periodicBufferSweep()
-  asyncSpawn rm.periodicSyncMessage()
-  asyncSpawn rm.periodicRepairSweep()
+  ## Starts the periodic background tasks (buffer sweep, sync message,
+  ## SDS-R repair sweep). The futures are kept on the manager so `cleanup`
+  ## can cancel them — without that, the loops would outlive a cleaned-up
+  ## manager and keep firing against cleared state.
+  rm.periodicTasks.add(FutureBase(rm.periodicBufferSweep()))
+  rm.periodicTasks.add(FutureBase(rm.periodicSyncMessage()))
+  rm.periodicTasks.add(FutureBase(rm.periodicRepairSweep()))

 proc resetReliabilityManager*(
    rm: ReliabilityManager
--- a/sds/sds_utils.nim
+++ b/sds/sds_utils.nim
@ -29,8 +29,16 @@ proc cleanup*(
  ## reconstructed against the same backend after cleanup, so disk state must
  ## survive. For deliberate disk wipe, use `removeChannel` or
  ## `resetReliabilityManager`.
+  ##
+  ## Periodic tasks are cancelled BEFORE acquiring the lock so that a task
+  ## currently blocked on `lock.acquire()` can unwind via CancelledError
+  ## without deadlocking against cleanup itself.
  if rm.isNil():
    return
+  for task in rm.periodicTasks:
+    if not task.finished:
+      await task.cancelAndWait()
+  rm.periodicTasks.setLen(0)
  try:
    await rm.lock.acquire()
    try:
--- a/sds/types/reliability_manager.nim
+++ b/sds/types/reliability_manager.nim
@ -21,6 +21,9 @@ type ReliabilityManager* = ref object
    ## one another at await points; the manager assumes all calls come from
    ## the same Chronos event loop (the FFI worker thread). Multi-OS-thread
    ## use is the caller's responsibility.
+  periodicTasks*: seq[FutureBase]
+    ## Handles to the background loops started by `startPeriodicTasks` so
+    ## `cleanup` can cancel them on shutdown instead of leaking them.
  onMessageReady*: proc(messageId: SdsMessageID, channelId: SdsChannelID) {.gcsafe.}
  onMessageSent*: proc(messageId: SdsMessageID, channelId: SdsChannelID) {.gcsafe.}
  onMissingDependencies*: proc(
@ -48,5 +51,6 @@ proc new*(
    participantId: participantId,
    persistence: persistence,
    lock: newAsyncLock(),
+    periodicTasks: @[],
  )
  return rm
--- a/tests/async_unittest.nim
+++ b/tests/async_unittest.nim
@ -0,0 +1,69 @@
+## Shared async-aware wrappers around `unittest` so tests in this repo can
+## `await` directly in setup/test/teardown blocks instead of sprinkling
+## `waitFor` at each call site.
+##
+## Usage:
+##
+## ```nim
+## import ./async_unittest
+##
+## suite "X":
+##   var rm: ReliabilityManager
+##
+##   asyncSetup:
+##     rm = newReliabilityManager(...).get()
+##     check (await rm.ensureChannel("ch")).isOk()
+##
+##   asyncTeardown:
+##     if not rm.isNil:
+##       await rm.cleanup()
+##
+##   asyncTest "Y":
+##     await rm.wrapOutgoingMessage(...)
+## ```
+##
+## All three blocks run inside the same async proc (per test). unittest's
+## own `setup:`/`teardown:` still work for purely synchronous fixtures.
+
+import unittest, chronos
+export unittest, chronos
+
+template asyncSetup*(body: untyped) {.dirty.} =
+  ## Async counterpart to unittest's `setup:`. Runs inside each asyncTest's
+  ## async proc, so `await` works.
+  template asyncTestSetupIMPL(): untyped {.dirty.} =
+    body
+
+template asyncTeardown*(body: untyped) {.dirty.} =
+  ## Async counterpart to unittest's `teardown:`. Runs in a `finally` so it
+  ## executes even when the test body (or setup) raises.
+  template asyncTestTeardownIMPL(): untyped {.dirty.} =
+    body
+
+template asyncTest*(name: string, body: untyped) =
+  ## Wraps a unittest `test` body in an async proc so `await` works on the
+  ## now-async ReliabilityManager API. unittest's `check` raises Exception,
+  ## which is wider than chronos's default CatchableError; the exception is
+  ## caught inside the async body, stashed, and re-raised after waitFor so
+  ## unittest's normal failure handling sees it.
+  ##
+  ## `cast(gcsafe)` is needed because suite-level vars (e.g. `var rm`) look
+  ## like globals to the async closure, but the FFI runtime is single-thread
+  ## so the "not gcsafe" warning isn't a real hazard here.
+  test name:
+    var asyncTestErr {.inject.}: ref Exception = nil
+    proc inner() {.async.} =
+      {.cast(gcsafe).}:
+        try:
+          when declared(asyncTestSetupIMPL):
+            asyncTestSetupIMPL()
+          try:
+            body
+          finally:
+            when declared(asyncTestTeardownIMPL):
+              asyncTestTeardownIMPL()
+        except Exception as e:
+          asyncTestErr = e
+    waitFor inner()
+    if asyncTestErr != nil:
+      raise asyncTestErr
--- a/tests/test_persistence.nim
+++ b/tests/test_persistence.nim
@ -1,28 +1,12 @@
-import unittest, results, chronos, std/[tables, sets, times]
+import results, std/[tables, sets, times]
 import sds
+import ./async_unittest
 import ./in_memory_persistence

 converter toParticipantID(s: string): SdsParticipantID = s.SdsParticipantID

 const testChannel = "testChannel"

-template asyncTest(name: string, body: untyped) =
-  ## Wraps a unittest `test` body in an async proc and runs it to completion.
-  ## Tests can now `await` rm.* and rm.persistence.* calls directly.
-  ## unittest's `check` raises Exception (wider than chronos's CatchableError),
-  ## so catch inside the async body and re-raise after waitFor.
-  test name:
-    var asyncTestErr {.inject.}: ref Exception = nil
-    proc inner() {.async.} =
-      {.cast(gcsafe).}:
-        try:
-          body
-        except Exception as e:
-          asyncTestErr = e
-    waitFor inner()
-    if asyncTestErr != nil:
-      raise asyncTestErr
-
 suite "Persistence: write → restart → read-back":
  asyncTest "outgoing buffer survives restart":
    let store = newInMemoryStore()
--- a/tests/test_reliability.nim
+++ b/tests/test_reliability.nim
@ -1,5 +1,6 @@
-import unittest, results, chronos, std/[times, options, tables]
+import results, std/[times, options, tables]
 import sds
+import ./async_unittest

 # Test-only convenience: implicit string → SdsParticipantID so test fixtures
 # can use string literals. Production code retains the distinct-type safety.
@ -7,28 +8,6 @@ converter toParticipantID(s: string): SdsParticipantID = s.SdsParticipantID

 const testChannel = "testChannel"

-template asyncTest(name: string, body: untyped) =
-  ## Wraps a unittest `test` body in an async proc so tests can `await` the
-  ## now-async ReliabilityManager API directly. Setup/teardown blocks still
-  ## run in the outer (sync) scope — use `waitFor` for any async calls there.
-  ## unittest's `check` raises `Exception`, which is wider than chronos's
-  ## default `CatchableError` for async procs — so we catch it inside and
-  ## re-raise after waitFor, where unittest's normal handling can see it.
-  ## cast(gcsafe) is needed because suite-level `var rm` looks like a global
-  ## to the closure capture, but the FFI runtime is single-threaded so the
-  ## "not gcsafe" warning isn't a real hazard here.
-  test name:
-    var asyncTestErr {.inject.}: ref Exception = nil
-    proc inner() {.async.} =
-      {.cast(gcsafe).}:
-        try:
-          body
-        except Exception as e:
-          asyncTestErr = e
-    waitFor inner()
-    if asyncTestErr != nil:
-      raise asyncTestErr
-
 proc seedBloom(
    rm: ReliabilityManager, channel: SdsChannelID, n: int, prefix = "noise"
 ) =
@ -43,15 +22,15 @@ proc seedBloom(
 suite "Core Operations":
  var rm: ReliabilityManager

-  setup:
+  asyncSetup:
    let rmResult = newReliabilityManager(participantId = "alice")
    check rmResult.isOk()
    rm = rmResult.get()
-    check (waitFor rm.ensureChannel(testChannel)).isOk()
+    check (await rm.ensureChannel(testChannel)).isOk()

-  teardown:
+  asyncTeardown:
    if not rm.isNil:
-      waitFor rm.cleanup()
+      await rm.cleanup()

  asyncTest "can create with default config":
    let config = defaultConfig()
@ -140,15 +119,15 @@ suite "Core Operations":
 suite "Reliability Mechanisms":
  var rm: ReliabilityManager

-  setup:
+  asyncSetup:
    let rmResult = newReliabilityManager(participantId = "alice")
    check rmResult.isOk()
    rm = rmResult.get()
-    check (waitFor rm.ensureChannel(testChannel)).isOk()
+    check (await rm.ensureChannel(testChannel)).isOk()

-  teardown:
+  asyncTeardown:
    if not rm.isNil:
-      waitFor rm.cleanup()
+      await rm.cleanup()

  asyncTest "dependency detection and resolution":
    var messageReadyCount = 0
@ -558,15 +537,15 @@ suite "Reliability Mechanisms":
 suite "Periodic Tasks & Buffer Management":
  var rm: ReliabilityManager

-  setup:
+  asyncSetup:
    let rmResult = newReliabilityManager(participantId = "alice")
    check rmResult.isOk()
    rm = rmResult.get()
-    check (waitFor rm.ensureChannel(testChannel)).isOk()
+    check (await rm.ensureChannel(testChannel)).isOk()

-  teardown:
+  asyncTeardown:
    if not rm.isNil:
-      waitFor rm.cleanup()
+      await rm.cleanup()

  asyncTest "outgoing buffer management":
    var messageSentCount = 0
@ -696,15 +675,15 @@ suite "Periodic Tasks & Buffer Management":
 suite "Special Cases Handling":
  var rm: ReliabilityManager

-  setup:
+  asyncSetup:
    let rmResult = newReliabilityManager(participantId = "alice")
    check rmResult.isOk()
    rm = rmResult.get()
-    check (waitFor rm.ensureChannel(testChannel)).isOk()
+    check (await rm.ensureChannel(testChannel)).isOk()

-  teardown:
+  asyncTeardown:
    if not rm.isNil:
-      waitFor rm.cleanup()
+      await rm.cleanup()

  asyncTest "message history limits":
    # Add messages up to max history size
@ -810,14 +789,14 @@ suite "cleanup":
 suite "Multi-Channel ReliabilityManager Tests":
  var rm: ReliabilityManager

-  setup:
+  asyncSetup:
    let rmResult = newReliabilityManager(participantId = "alice")
    check rmResult.isOk()
    rm = rmResult.get()

-  teardown:
+  asyncTeardown:
    if not rm.isNil:
-      waitFor rm.cleanup()
+      await rm.cleanup()

  asyncTest "can create multi-channel manager without channel ID":
    check rm.channels.len == 0
@ -1050,17 +1029,17 @@ suite "SDS-R: Computation Functions":
 suite "SDS-R: Repair Buffer Management":
  var rm: ReliabilityManager

-  setup:
+  asyncSetup:
    let rmResult = newReliabilityManager(
      participantId = "test-participant"
    )
    check rmResult.isOk()
    rm = rmResult.get()
-    check (waitFor rm.ensureChannel(testChannel)).isOk()
+    check (await rm.ensureChannel(testChannel)).isOk()

-  teardown:
+  asyncTeardown:
    if not rm.isNil:
-      waitFor rm.cleanup()
+      await rm.cleanup()

  asyncTest "missing deps added to outgoing repair buffer":
    var missingDepsCount = 0
@ -1580,13 +1559,13 @@ suite "SDS-R: Lifecycle and State":
 suite "SDS-R: Repair Sweep":
  var rm: ReliabilityManager

-  setup:
+  asyncSetup:
    rm = newReliabilityManager(participantId = "bob").get()
-    check (waitFor rm.ensureChannel(testChannel)).isOk()
+    check (await rm.ensureChannel(testChannel)).isOk()

-  teardown:
+  asyncTeardown:
    if not rm.isNil:
-      waitFor rm.cleanup()
+      await rm.cleanup()

  asyncTest "runRepairSweep fires onRepairReady for expired tResp":
    var fireCount = 0
@ -1666,12 +1645,16 @@ type
    delivered: Table[SdsParticipantID, seq[SdsMessageID]]
    # Log of raw message-ids placed on the wire, tagged with the source peer.
    wireLog: seq[tuple[senderId: SdsParticipantID, messageId: SdsMessageID]]
+    # Queue of (sender, bytes) the repair callback would have delivered if it
+    # could await. Drained explicitly by `bus.drain()` from the test body.
+    pending: seq[(SdsParticipantID, seq[byte])]

 proc newTestBus(): TestBus =
  TestBus(
    peers: initOrderedTable[SdsParticipantID, ReliabilityManager](),
    delivered: initTable[SdsParticipantID, seq[SdsMessageID]](),
    wireLog: @[],
+    pending: @[],
  )

 proc recordWire(bus: TestBus, senderId: SdsParticipantID, bytes: seq[byte]) {.gcsafe.} =
@ -1690,6 +1673,16 @@ proc deliverExcept(
      continue
    discard await peer.unwrapReceivedMessage(bytes)

+proc drain(bus: TestBus): Future[void] {.async.} =
+  ## Delivers every (sender, bytes) the repair callback enqueued. Loops until
+  ## the queue stays empty across one full pass — a delivery may trigger a
+  ## new repair-ready callback that re-enqueues.
+  while bus.pending.len > 0:
+    let batch = move bus.pending
+    bus.pending = @[]
+    for entry in batch:
+      await bus.deliverExcept(entry[0], entry[1], @[])
+
 proc addPeer(
    bus: TestBus,
    participantId: SdsParticipantID,
@ -1709,12 +1702,11 @@ proc addPeer(
    proc(msgId: SdsMessageID, ch: SdsChannelID) {.gcsafe.} = discard,
    proc(msgId: SdsMessageID, deps: seq[HistoryEntry], ch: SdsChannelID) {.gcsafe.} = discard,
    onRepairReady = proc(bytes: seq[byte], ch: SdsChannelID) {.gcsafe.} =
+      # The callback contract is sync, so we cannot `await` here. Enqueue the
+      # delivery and let the test drive it via `bus.drain()` instead.
      {.cast(gcsafe).}:
        busRef.recordWire(pid, bytes)
-        # Fire-and-forget delivery from a sync callback context — we cannot
-        # await here, so spawn the async delivery onto the same event loop.
-        asyncSpawn(busRef.deliverExcept(pid, bytes, @[]))
-    ,
+        busRef.pending.add((pid, bytes)),
  )
  return rm

@ -1786,9 +1778,7 @@ suite "SDS-R: Multi-Participant Integration":
    # then run her sweep. She rebroadcasts M1.
    alice.forceIncomingExpired("m1")
    await alice.runRepairSweep()
-
-    # Allow any asyncSpawn'd deliveries from the repair callback to run.
-    await sleepAsync(chronos.milliseconds(10))
+    await bus.drain()

    # Bob now has M1 and M2 delivered.
    check:
@ -1820,7 +1810,7 @@ suite "SDS-R: Multi-Participant Integration":
    # pending entry when Carol receives the rebroadcast.
    alice.forceIncomingExpired("m1")
    await alice.runRepairSweep()
-    await sleepAsync(chronos.milliseconds(10))
+    await bus.drain()

    # Carol's pending response must have been cleared by the dedup-path cleanup.
    check "m1" notin carol.channels[testChannel].incomingRepairBuffer
@ -1828,7 +1818,7 @@ suite "SDS-R: Multi-Participant Integration":
    # Even if we now force-run Carol's sweep, nothing should fire.
    let wireCountBefore = bus.wireLog.len
    await carol.runRepairSweep()
-    await sleepAsync(chronos.milliseconds(10))
+    await bus.drain()
    check bus.wireLog.len == wireCountBefore

    # Bob received exactly one rebroadcast of M1.