mirror of
https://github.com/logos-messaging/nim-sds.git
synced 2026-07-02 13:59:41 +00:00
refactor: drop asyncSpawn, add asyncSetup/asyncTeardown
Three asyncSpawn usages removed: - sds.nim startPeriodicTasks: stored the periodic-task futures on ReliabilityManager (new field `periodicTasks: seq[FutureBase]`) so cleanup can cancel them on shutdown instead of leaking the loops against a cleared manager. - library/sds_thread/sds_thread.nim: fireSync moved BEFORE processing, then `await SdsThreadRequest.process(...)` instead of asyncSpawn'ing it. Aligns the worker with the SP-channel + lock assumption that there are no concurrent requests; caller throughput is unchanged because the caller only waits for receipt (fireSync), not processing. - tests TestBus repair callback: replaced asyncSpawn(deliverExcept...) with an explicit pending-delivery queue drained by `bus.drain()`. Integration tests no longer rely on `sleepAsync(10ms)` to let spawned deliveries finish — they await drain instead. Tests also pick up an asyncSetup/asyncTeardown pair (tests/async_unittest.nim) so suite fixtures can `await` directly. All `waitFor` in setup/teardown blocks is gone; only the top-level asyncTest wrapper still uses waitFor (once, to drive the async proc to completion). Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
parent
965d67cb01
commit
3bcc09de3c
@ -43,13 +43,18 @@ proc runSds(ctx: ptr SdsContext) {.async.} =
|
||||
error "sds thread could not receive a request"
|
||||
continue
|
||||
|
||||
## Handle the request
|
||||
asyncSpawn SdsThreadRequest.process(request, addr rm)
|
||||
|
||||
## Ack receipt to the requester thread BEFORE processing — it only
|
||||
## waits for "received", not "processed", so the caller's throughput
|
||||
## doesn't change. Processing is then awaited (was: asyncSpawn'd),
|
||||
## which serializes requests on this worker. The SP channel + lock
|
||||
## above already assume no concurrent requests, so awaiting here
|
||||
## aligns the processing side with that assumption.
|
||||
let fireRes = ctx.reqReceivedSignal.fireSync()
|
||||
if fireRes.isErr():
|
||||
error "could not fireSync back to requester thread", error = fireRes.error
|
||||
|
||||
await SdsThreadRequest.process(request, addr rm)
|
||||
|
||||
proc run(ctx: ptr SdsContext) {.thread.} =
|
||||
## Launch sds worker
|
||||
waitFor runSds(ctx)
|
||||
|
||||
11
sds.nim
11
sds.nim
@ -511,10 +511,13 @@ proc periodicRepairSweep(
|
||||
await sleepAsync(chronos.milliseconds(rm.config.repairSweepInterval.inMilliseconds))
|
||||
|
||||
proc startPeriodicTasks*(rm: ReliabilityManager) =
|
||||
## Starts the periodic tasks for buffer sweeping and sync message sending.
|
||||
asyncSpawn rm.periodicBufferSweep()
|
||||
asyncSpawn rm.periodicSyncMessage()
|
||||
asyncSpawn rm.periodicRepairSweep()
|
||||
## Starts the periodic background tasks (buffer sweep, sync message,
|
||||
## SDS-R repair sweep). The futures are kept on the manager so `cleanup`
|
||||
## can cancel them — without that, the loops would outlive a cleaned-up
|
||||
## manager and keep firing against cleared state.
|
||||
rm.periodicTasks.add(FutureBase(rm.periodicBufferSweep()))
|
||||
rm.periodicTasks.add(FutureBase(rm.periodicSyncMessage()))
|
||||
rm.periodicTasks.add(FutureBase(rm.periodicRepairSweep()))
|
||||
|
||||
proc resetReliabilityManager*(
|
||||
rm: ReliabilityManager
|
||||
|
||||
@ -29,8 +29,16 @@ proc cleanup*(
|
||||
## reconstructed against the same backend after cleanup, so disk state must
|
||||
## survive. For deliberate disk wipe, use `removeChannel` or
|
||||
## `resetReliabilityManager`.
|
||||
##
|
||||
## Periodic tasks are cancelled BEFORE acquiring the lock so that a task
|
||||
## currently blocked on `lock.acquire()` can unwind via CancelledError
|
||||
## without deadlocking against cleanup itself.
|
||||
if rm.isNil():
|
||||
return
|
||||
for task in rm.periodicTasks:
|
||||
if not task.finished:
|
||||
await task.cancelAndWait()
|
||||
rm.periodicTasks.setLen(0)
|
||||
try:
|
||||
await rm.lock.acquire()
|
||||
try:
|
||||
|
||||
@ -21,6 +21,9 @@ type ReliabilityManager* = ref object
|
||||
## one another at await points; the manager assumes all calls come from
|
||||
## the same Chronos event loop (the FFI worker thread). Multi-OS-thread
|
||||
## use is the caller's responsibility.
|
||||
periodicTasks*: seq[FutureBase]
|
||||
## Handles to the background loops started by `startPeriodicTasks` so
|
||||
## `cleanup` can cancel them on shutdown instead of leaking them.
|
||||
onMessageReady*: proc(messageId: SdsMessageID, channelId: SdsChannelID) {.gcsafe.}
|
||||
onMessageSent*: proc(messageId: SdsMessageID, channelId: SdsChannelID) {.gcsafe.}
|
||||
onMissingDependencies*: proc(
|
||||
@ -48,5 +51,6 @@ proc new*(
|
||||
participantId: participantId,
|
||||
persistence: persistence,
|
||||
lock: newAsyncLock(),
|
||||
periodicTasks: @[],
|
||||
)
|
||||
return rm
|
||||
|
||||
69
tests/async_unittest.nim
Normal file
69
tests/async_unittest.nim
Normal file
@ -0,0 +1,69 @@
|
||||
## Shared async-aware wrappers around `unittest` so tests in this repo can
|
||||
## `await` directly in setup/test/teardown blocks instead of sprinkling
|
||||
## `waitFor` at each call site.
|
||||
##
|
||||
## Usage:
|
||||
##
|
||||
## ```nim
|
||||
## import ./async_unittest
|
||||
##
|
||||
## suite "X":
|
||||
## var rm: ReliabilityManager
|
||||
##
|
||||
## asyncSetup:
|
||||
## rm = newReliabilityManager(...).get()
|
||||
## check (await rm.ensureChannel("ch")).isOk()
|
||||
##
|
||||
## asyncTeardown:
|
||||
## if not rm.isNil:
|
||||
## await rm.cleanup()
|
||||
##
|
||||
## asyncTest "Y":
|
||||
## await rm.wrapOutgoingMessage(...)
|
||||
## ```
|
||||
##
|
||||
## All three blocks run inside the same async proc (per test). unittest's
|
||||
## own `setup:`/`teardown:` still work for purely synchronous fixtures.
|
||||
|
||||
import unittest, chronos
|
||||
export unittest, chronos
|
||||
|
||||
template asyncSetup*(body: untyped) {.dirty.} =
|
||||
## Async counterpart to unittest's `setup:`. Runs inside each asyncTest's
|
||||
## async proc, so `await` works.
|
||||
template asyncTestSetupIMPL(): untyped {.dirty.} =
|
||||
body
|
||||
|
||||
template asyncTeardown*(body: untyped) {.dirty.} =
|
||||
## Async counterpart to unittest's `teardown:`. Runs in a `finally` so it
|
||||
## executes even when the test body (or setup) raises.
|
||||
template asyncTestTeardownIMPL(): untyped {.dirty.} =
|
||||
body
|
||||
|
||||
template asyncTest*(name: string, body: untyped) =
|
||||
## Wraps a unittest `test` body in an async proc so `await` works on the
|
||||
## now-async ReliabilityManager API. unittest's `check` raises Exception,
|
||||
## which is wider than chronos's default CatchableError; the exception is
|
||||
## caught inside the async body, stashed, and re-raised after waitFor so
|
||||
## unittest's normal failure handling sees it.
|
||||
##
|
||||
## `cast(gcsafe)` is needed because suite-level vars (e.g. `var rm`) look
|
||||
## like globals to the async closure, but the FFI runtime is single-thread
|
||||
## so the "not gcsafe" warning isn't a real hazard here.
|
||||
test name:
|
||||
var asyncTestErr {.inject.}: ref Exception = nil
|
||||
proc inner() {.async.} =
|
||||
{.cast(gcsafe).}:
|
||||
try:
|
||||
when declared(asyncTestSetupIMPL):
|
||||
asyncTestSetupIMPL()
|
||||
try:
|
||||
body
|
||||
finally:
|
||||
when declared(asyncTestTeardownIMPL):
|
||||
asyncTestTeardownIMPL()
|
||||
except Exception as e:
|
||||
asyncTestErr = e
|
||||
waitFor inner()
|
||||
if asyncTestErr != nil:
|
||||
raise asyncTestErr
|
||||
@ -1,28 +1,12 @@
|
||||
import unittest, results, chronos, std/[tables, sets, times]
|
||||
import results, std/[tables, sets, times]
|
||||
import sds
|
||||
import ./async_unittest
|
||||
import ./in_memory_persistence
|
||||
|
||||
converter toParticipantID(s: string): SdsParticipantID = s.SdsParticipantID
|
||||
|
||||
const testChannel = "testChannel"
|
||||
|
||||
template asyncTest(name: string, body: untyped) =
|
||||
## Wraps a unittest `test` body in an async proc and runs it to completion.
|
||||
## Tests can now `await` rm.* and rm.persistence.* calls directly.
|
||||
## unittest's `check` raises Exception (wider than chronos's CatchableError),
|
||||
## so catch inside the async body and re-raise after waitFor.
|
||||
test name:
|
||||
var asyncTestErr {.inject.}: ref Exception = nil
|
||||
proc inner() {.async.} =
|
||||
{.cast(gcsafe).}:
|
||||
try:
|
||||
body
|
||||
except Exception as e:
|
||||
asyncTestErr = e
|
||||
waitFor inner()
|
||||
if asyncTestErr != nil:
|
||||
raise asyncTestErr
|
||||
|
||||
suite "Persistence: write → restart → read-back":
|
||||
asyncTest "outgoing buffer survives restart":
|
||||
let store = newInMemoryStore()
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
import unittest, results, chronos, std/[times, options, tables]
|
||||
import results, std/[times, options, tables]
|
||||
import sds
|
||||
import ./async_unittest
|
||||
|
||||
# Test-only convenience: implicit string → SdsParticipantID so test fixtures
|
||||
# can use string literals. Production code retains the distinct-type safety.
|
||||
@ -7,28 +8,6 @@ converter toParticipantID(s: string): SdsParticipantID = s.SdsParticipantID
|
||||
|
||||
const testChannel = "testChannel"
|
||||
|
||||
template asyncTest(name: string, body: untyped) =
|
||||
## Wraps a unittest `test` body in an async proc so tests can `await` the
|
||||
## now-async ReliabilityManager API directly. Setup/teardown blocks still
|
||||
## run in the outer (sync) scope — use `waitFor` for any async calls there.
|
||||
## unittest's `check` raises `Exception`, which is wider than chronos's
|
||||
## default `CatchableError` for async procs — so we catch it inside and
|
||||
## re-raise after waitFor, where unittest's normal handling can see it.
|
||||
## cast(gcsafe) is needed because suite-level `var rm` looks like a global
|
||||
## to the closure capture, but the FFI runtime is single-threaded so the
|
||||
## "not gcsafe" warning isn't a real hazard here.
|
||||
test name:
|
||||
var asyncTestErr {.inject.}: ref Exception = nil
|
||||
proc inner() {.async.} =
|
||||
{.cast(gcsafe).}:
|
||||
try:
|
||||
body
|
||||
except Exception as e:
|
||||
asyncTestErr = e
|
||||
waitFor inner()
|
||||
if asyncTestErr != nil:
|
||||
raise asyncTestErr
|
||||
|
||||
proc seedBloom(
|
||||
rm: ReliabilityManager, channel: SdsChannelID, n: int, prefix = "noise"
|
||||
) =
|
||||
@ -43,15 +22,15 @@ proc seedBloom(
|
||||
suite "Core Operations":
|
||||
var rm: ReliabilityManager
|
||||
|
||||
setup:
|
||||
asyncSetup:
|
||||
let rmResult = newReliabilityManager(participantId = "alice")
|
||||
check rmResult.isOk()
|
||||
rm = rmResult.get()
|
||||
check (waitFor rm.ensureChannel(testChannel)).isOk()
|
||||
check (await rm.ensureChannel(testChannel)).isOk()
|
||||
|
||||
teardown:
|
||||
asyncTeardown:
|
||||
if not rm.isNil:
|
||||
waitFor rm.cleanup()
|
||||
await rm.cleanup()
|
||||
|
||||
asyncTest "can create with default config":
|
||||
let config = defaultConfig()
|
||||
@ -140,15 +119,15 @@ suite "Core Operations":
|
||||
suite "Reliability Mechanisms":
|
||||
var rm: ReliabilityManager
|
||||
|
||||
setup:
|
||||
asyncSetup:
|
||||
let rmResult = newReliabilityManager(participantId = "alice")
|
||||
check rmResult.isOk()
|
||||
rm = rmResult.get()
|
||||
check (waitFor rm.ensureChannel(testChannel)).isOk()
|
||||
check (await rm.ensureChannel(testChannel)).isOk()
|
||||
|
||||
teardown:
|
||||
asyncTeardown:
|
||||
if not rm.isNil:
|
||||
waitFor rm.cleanup()
|
||||
await rm.cleanup()
|
||||
|
||||
asyncTest "dependency detection and resolution":
|
||||
var messageReadyCount = 0
|
||||
@ -558,15 +537,15 @@ suite "Reliability Mechanisms":
|
||||
suite "Periodic Tasks & Buffer Management":
|
||||
var rm: ReliabilityManager
|
||||
|
||||
setup:
|
||||
asyncSetup:
|
||||
let rmResult = newReliabilityManager(participantId = "alice")
|
||||
check rmResult.isOk()
|
||||
rm = rmResult.get()
|
||||
check (waitFor rm.ensureChannel(testChannel)).isOk()
|
||||
check (await rm.ensureChannel(testChannel)).isOk()
|
||||
|
||||
teardown:
|
||||
asyncTeardown:
|
||||
if not rm.isNil:
|
||||
waitFor rm.cleanup()
|
||||
await rm.cleanup()
|
||||
|
||||
asyncTest "outgoing buffer management":
|
||||
var messageSentCount = 0
|
||||
@ -696,15 +675,15 @@ suite "Periodic Tasks & Buffer Management":
|
||||
suite "Special Cases Handling":
|
||||
var rm: ReliabilityManager
|
||||
|
||||
setup:
|
||||
asyncSetup:
|
||||
let rmResult = newReliabilityManager(participantId = "alice")
|
||||
check rmResult.isOk()
|
||||
rm = rmResult.get()
|
||||
check (waitFor rm.ensureChannel(testChannel)).isOk()
|
||||
check (await rm.ensureChannel(testChannel)).isOk()
|
||||
|
||||
teardown:
|
||||
asyncTeardown:
|
||||
if not rm.isNil:
|
||||
waitFor rm.cleanup()
|
||||
await rm.cleanup()
|
||||
|
||||
asyncTest "message history limits":
|
||||
# Add messages up to max history size
|
||||
@ -810,14 +789,14 @@ suite "cleanup":
|
||||
suite "Multi-Channel ReliabilityManager Tests":
|
||||
var rm: ReliabilityManager
|
||||
|
||||
setup:
|
||||
asyncSetup:
|
||||
let rmResult = newReliabilityManager(participantId = "alice")
|
||||
check rmResult.isOk()
|
||||
rm = rmResult.get()
|
||||
|
||||
teardown:
|
||||
asyncTeardown:
|
||||
if not rm.isNil:
|
||||
waitFor rm.cleanup()
|
||||
await rm.cleanup()
|
||||
|
||||
asyncTest "can create multi-channel manager without channel ID":
|
||||
check rm.channels.len == 0
|
||||
@ -1050,17 +1029,17 @@ suite "SDS-R: Computation Functions":
|
||||
suite "SDS-R: Repair Buffer Management":
|
||||
var rm: ReliabilityManager
|
||||
|
||||
setup:
|
||||
asyncSetup:
|
||||
let rmResult = newReliabilityManager(
|
||||
participantId = "test-participant"
|
||||
)
|
||||
check rmResult.isOk()
|
||||
rm = rmResult.get()
|
||||
check (waitFor rm.ensureChannel(testChannel)).isOk()
|
||||
check (await rm.ensureChannel(testChannel)).isOk()
|
||||
|
||||
teardown:
|
||||
asyncTeardown:
|
||||
if not rm.isNil:
|
||||
waitFor rm.cleanup()
|
||||
await rm.cleanup()
|
||||
|
||||
asyncTest "missing deps added to outgoing repair buffer":
|
||||
var missingDepsCount = 0
|
||||
@ -1580,13 +1559,13 @@ suite "SDS-R: Lifecycle and State":
|
||||
suite "SDS-R: Repair Sweep":
|
||||
var rm: ReliabilityManager
|
||||
|
||||
setup:
|
||||
asyncSetup:
|
||||
rm = newReliabilityManager(participantId = "bob").get()
|
||||
check (waitFor rm.ensureChannel(testChannel)).isOk()
|
||||
check (await rm.ensureChannel(testChannel)).isOk()
|
||||
|
||||
teardown:
|
||||
asyncTeardown:
|
||||
if not rm.isNil:
|
||||
waitFor rm.cleanup()
|
||||
await rm.cleanup()
|
||||
|
||||
asyncTest "runRepairSweep fires onRepairReady for expired tResp":
|
||||
var fireCount = 0
|
||||
@ -1666,12 +1645,16 @@ type
|
||||
delivered: Table[SdsParticipantID, seq[SdsMessageID]]
|
||||
# Log of raw message-ids placed on the wire, tagged with the source peer.
|
||||
wireLog: seq[tuple[senderId: SdsParticipantID, messageId: SdsMessageID]]
|
||||
# Queue of (sender, bytes) the repair callback would have delivered if it
|
||||
# could await. Drained explicitly by `bus.drain()` from the test body.
|
||||
pending: seq[(SdsParticipantID, seq[byte])]
|
||||
|
||||
proc newTestBus(): TestBus =
|
||||
TestBus(
|
||||
peers: initOrderedTable[SdsParticipantID, ReliabilityManager](),
|
||||
delivered: initTable[SdsParticipantID, seq[SdsMessageID]](),
|
||||
wireLog: @[],
|
||||
pending: @[],
|
||||
)
|
||||
|
||||
proc recordWire(bus: TestBus, senderId: SdsParticipantID, bytes: seq[byte]) {.gcsafe.} =
|
||||
@ -1690,6 +1673,16 @@ proc deliverExcept(
|
||||
continue
|
||||
discard await peer.unwrapReceivedMessage(bytes)
|
||||
|
||||
proc drain(bus: TestBus): Future[void] {.async.} =
|
||||
## Delivers every (sender, bytes) the repair callback enqueued. Loops until
|
||||
## the queue stays empty across one full pass — a delivery may trigger a
|
||||
## new repair-ready callback that re-enqueues.
|
||||
while bus.pending.len > 0:
|
||||
let batch = move bus.pending
|
||||
bus.pending = @[]
|
||||
for entry in batch:
|
||||
await bus.deliverExcept(entry[0], entry[1], @[])
|
||||
|
||||
proc addPeer(
|
||||
bus: TestBus,
|
||||
participantId: SdsParticipantID,
|
||||
@ -1709,12 +1702,11 @@ proc addPeer(
|
||||
proc(msgId: SdsMessageID, ch: SdsChannelID) {.gcsafe.} = discard,
|
||||
proc(msgId: SdsMessageID, deps: seq[HistoryEntry], ch: SdsChannelID) {.gcsafe.} = discard,
|
||||
onRepairReady = proc(bytes: seq[byte], ch: SdsChannelID) {.gcsafe.} =
|
||||
# The callback contract is sync, so we cannot `await` here. Enqueue the
|
||||
# delivery and let the test drive it via `bus.drain()` instead.
|
||||
{.cast(gcsafe).}:
|
||||
busRef.recordWire(pid, bytes)
|
||||
# Fire-and-forget delivery from a sync callback context — we cannot
|
||||
# await here, so spawn the async delivery onto the same event loop.
|
||||
asyncSpawn(busRef.deliverExcept(pid, bytes, @[]))
|
||||
,
|
||||
busRef.pending.add((pid, bytes)),
|
||||
)
|
||||
return rm
|
||||
|
||||
@ -1786,9 +1778,7 @@ suite "SDS-R: Multi-Participant Integration":
|
||||
# then run her sweep. She rebroadcasts M1.
|
||||
alice.forceIncomingExpired("m1")
|
||||
await alice.runRepairSweep()
|
||||
|
||||
# Allow any asyncSpawn'd deliveries from the repair callback to run.
|
||||
await sleepAsync(chronos.milliseconds(10))
|
||||
await bus.drain()
|
||||
|
||||
# Bob now has M1 and M2 delivered.
|
||||
check:
|
||||
@ -1820,7 +1810,7 @@ suite "SDS-R: Multi-Participant Integration":
|
||||
# pending entry when Carol receives the rebroadcast.
|
||||
alice.forceIncomingExpired("m1")
|
||||
await alice.runRepairSweep()
|
||||
await sleepAsync(chronos.milliseconds(10))
|
||||
await bus.drain()
|
||||
|
||||
# Carol's pending response must have been cleared by the dedup-path cleanup.
|
||||
check "m1" notin carol.channels[testChannel].incomingRepairBuffer
|
||||
@ -1828,7 +1818,7 @@ suite "SDS-R: Multi-Participant Integration":
|
||||
# Even if we now force-run Carol's sweep, nothing should fire.
|
||||
let wireCountBefore = bus.wireLog.len
|
||||
await carol.runRepairSweep()
|
||||
await sleepAsync(chronos.milliseconds(10))
|
||||
await bus.drain()
|
||||
check bus.wireLog.len == wireCountBefore
|
||||
|
||||
# Bob received exactly one rebroadcast of M1.
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user