mirror of
https://github.com/logos-messaging/nim-ffi.git
synced 2026-06-30 13:19:31 +00:00
Co-authored-by: Gabriel Cruz <8129788+gmelodie@users.noreply.github.com>
This commit is contained in:
parent
3d7b267ef0
commit
5d49ee6b08
@ -129,12 +129,13 @@ pub struct MyTimerCtx {
|
||||
|
||||
// SAFETY: The `ptr` field points to an FFIContext owned by the Nim runtime.
|
||||
// Every call through the generated FFI proc goes through
|
||||
// `sendRequestToFFIThread` on the Nim side, which serialises every request
|
||||
// behind `ctx.lock` and dispatches handlers on a single FFI thread, so the
|
||||
// pointer is never accessed concurrently from Rust. The Nim-side reentrancy
|
||||
// guard (`onFFIThread` threadvar) prevents handlers from re-entering the
|
||||
// dispatcher and self-deadlocking. These invariants make it sound to mark
|
||||
// the wrapper as Send + Sync.
|
||||
// `sendRequestToFFIThread` on the Nim side, which only enqueues the request
|
||||
// onto a mutex-guarded MPSC queue (sound from any number of threads) and
|
||||
// wakes the single FFI thread that dispatches every handler. The context is
|
||||
// thus never mutated non-atomically from the caller's thread. The Nim-side
|
||||
// reentrancy guard (`onFFIThread` threadvar) prevents handlers from
|
||||
// re-entering the dispatcher. These invariants make it sound to mark the
|
||||
// wrapper as Send + Sync.
|
||||
unsafe impl Send for MyTimerCtx {}
|
||||
unsafe impl Sync for MyTimerCtx {}
|
||||
|
||||
|
||||
@ -505,19 +505,18 @@ proc generateApiRs*(
|
||||
)
|
||||
lines.add("// Every call through the generated FFI proc goes through")
|
||||
lines.add(
|
||||
"// `sendRequestToFFIThread` on the Nim side, which serialises every request"
|
||||
"// `sendRequestToFFIThread` on the Nim side, which only enqueues the request"
|
||||
)
|
||||
lines.add("// onto a mutex-guarded MPSC queue (sound from any number of threads) and")
|
||||
lines.add(
|
||||
"// wakes the single FFI thread that dispatches every handler. The context is"
|
||||
)
|
||||
lines.add(
|
||||
"// behind `ctx.lock` and dispatches handlers on a single FFI thread, so the"
|
||||
"// thus never mutated non-atomically from the caller's thread. The Nim-side"
|
||||
)
|
||||
lines.add(
|
||||
"// pointer is never accessed concurrently from Rust. The Nim-side reentrancy"
|
||||
)
|
||||
lines.add("// guard (`onFFIThread` threadvar) prevents handlers from re-entering the")
|
||||
lines.add(
|
||||
"// dispatcher and self-deadlocking. These invariants make it sound to mark"
|
||||
)
|
||||
lines.add("// the wrapper as Send + Sync.")
|
||||
lines.add("// reentrancy guard (`onFFIThread` threadvar) prevents handlers from")
|
||||
lines.add("// re-entering the dispatcher. These invariants make it sound to mark the")
|
||||
lines.add("// wrapper as Send + Sync.")
|
||||
lines.add("unsafe impl Send for $1 {}" % [ctxTypeName])
|
||||
lines.add("unsafe impl Sync for $1 {}" % [ctxTypeName])
|
||||
lines.add("")
|
||||
|
||||
@ -7,12 +7,13 @@
|
||||
{.passc: "-fPIC".}
|
||||
|
||||
import std/[atomics, locks, options, tables]
|
||||
import chronicles, chronos, chronos/threadsync, taskpools/channels_spsc_single, results
|
||||
import chronicles, chronos, chronos/threadsync, results
|
||||
import
|
||||
./ffi_types,
|
||||
./ffi_events,
|
||||
./ffi_handles,
|
||||
./ffi_thread_request,
|
||||
./ffi_request_queue,
|
||||
./logging,
|
||||
./cbor_serial
|
||||
|
||||
@ -22,10 +23,8 @@ type FFIContext*[T] = object
|
||||
myLib*: ptr T # main library object (Waku, LibP2P, SDS, …)
|
||||
ffiThread: Thread[(ptr FFIContext[T])]
|
||||
eventThread: Thread[(ptr FFIContext[T])]
|
||||
lock: Lock
|
||||
reqChannel: ChannelSPSCSingle[ptr FFIThreadRequest]
|
||||
reqSignal: ThreadSignalPtr
|
||||
reqReceivedSignal: ThreadSignalPtr
|
||||
reqQueueBank: RequestQueueBank # mutex-guarded MPSC ingress from foreign threads
|
||||
reqSignal: ThreadSignalPtr # wakes the FFI thread on enqueue
|
||||
stopSignal: ThreadSignalPtr
|
||||
threadExitSignal: ThreadSignalPtr
|
||||
# bounds destroyFFIContext's wait so a blocked loop cannot hang the caller
|
||||
@ -60,9 +59,10 @@ template closeAndNil(field: untyped) =
|
||||
field = nil
|
||||
|
||||
proc deinitContextResources*[T](ctx: ptr FFIContext[T]): Result[void, string] =
|
||||
## Mirror of `initContextResources`. Threads MUST be joined first;
|
||||
## fields are nil'd after close so re-init on the same slot is safe.
|
||||
ctx.lock.deinitLock()
|
||||
## Mirror of `initContextResources`. Threads MUST be joined first (FFI thread
|
||||
## drained); fields are nil'd after close so re-init on the same slot is safe.
|
||||
## `deinitRequestQueue` frees any request raced in after the final drain.
|
||||
deinitRequestQueue(ctx[].reqQueueBank)
|
||||
deinitEventRegistry(ctx[].eventRegistry)
|
||||
deinitHandleRegistry(ctx[].handles)
|
||||
deinitEventQueue(ctx[].eventQueue)
|
||||
@ -74,7 +74,6 @@ proc deinitContextResources*[T](ctx: ptr FFIContext[T]): Result[void, string] =
|
||||
discard
|
||||
else:
|
||||
closeAndNil(ctx.reqSignal)
|
||||
closeAndNil(ctx.reqReceivedSignal)
|
||||
closeAndNil(ctx.stopSignal)
|
||||
closeAndNil(ctx.threadExitSignal)
|
||||
closeAndNil(ctx.eventQueueSignal)
|
||||
@ -96,12 +95,11 @@ proc initContextResources*[T](ctx: ptr FFIContext[T]): Result[void, string] =
|
||||
## the slot (freeShared or pool.releaseSlot).
|
||||
# Nil first so deferred cleanup can't double-close a reused pool slot.
|
||||
ctx.reqSignal = nil
|
||||
ctx.reqReceivedSignal = nil
|
||||
ctx.stopSignal = nil
|
||||
ctx.threadExitSignal = nil
|
||||
ctx.eventQueueSignal = nil
|
||||
ctx.eventThreadExitSignal = nil
|
||||
ctx.lock.initLock()
|
||||
initRequestQueue(ctx[].reqQueueBank)
|
||||
initEventRegistry(ctx[].eventRegistry)
|
||||
initHandleRegistry(ctx[].handles)
|
||||
initEventQueue(ctx[].eventQueue)
|
||||
@ -116,7 +114,6 @@ proc initContextResources*[T](ctx: ptr FFIContext[T]): Result[void, string] =
|
||||
error = error
|
||||
|
||||
newSignalOrErr(ctx.reqSignal, "reqSignal")
|
||||
newSignalOrErr(ctx.reqReceivedSignal, "reqReceivedSignal")
|
||||
newSignalOrErr(ctx.stopSignal, "stopSignal")
|
||||
newSignalOrErr(ctx.threadExitSignal, "threadExitSignal")
|
||||
newSignalOrErr(ctx.eventQueueSignal, "eventQueueSignal")
|
||||
|
||||
109
ffi/ffi_request_queue.nim
Normal file
109
ffi/ffi_request_queue.nim
Normal file
@ -0,0 +1,109 @@
|
||||
## Sharded, mutex-guarded MPSC ingress for `ptr FFIThreadRequest`: foreign
|
||||
## threads enqueue without serialising against each other.
|
||||
##
|
||||
## Why sharded: one shared queue funnels all producers through a single cache
|
||||
## line, capping submit throughput. N independent queues (one per producer)
|
||||
## remove that hotspot — producers contend only when two pick the same queue.
|
||||
##
|
||||
## Each queue is an intrusive FIFO under its own `Lock`: race-free under TSAN, and
|
||||
## the request is its own node (intrusive `next`), so enqueue never allocates nor
|
||||
## touches a Nim GC heap (the cross-thread `MemRegion` hazard).
|
||||
##
|
||||
## FIFO holds per queue, not globally. Unbounded by design: submit never blocks
|
||||
## or rejects; completion comes via each request's callback.
|
||||
|
||||
import std/[atomics, locks]
|
||||
import ./ffi_thread_request
|
||||
|
||||
const
|
||||
RequestQueueCount* = 16
|
||||
## Independent ingress queues. ≥ the expected concurrent producer count keeps
|
||||
## queue collisions (hence lock contention) near zero.
|
||||
QueuePadBytes = 192
|
||||
## Pads each queue well past a cache line (128B on Apple silicon) so adjacent
|
||||
## queues' hot fields never false-share — false sharing would re-serialise
|
||||
## exactly what the sharding is meant to spread out.
|
||||
|
||||
static:
|
||||
# `myQueueIndex` maps threads to queues with an `and` mask, so the count must
|
||||
# be a power of two — otherwise the distribution silently skews onto a subset.
|
||||
doAssert (RequestQueueCount and (RequestQueueCount - 1)) == 0,
|
||||
"RequestQueueCount must be a power of two"
|
||||
|
||||
type
|
||||
RequestQueue = object
|
||||
lock: Lock
|
||||
head: ptr FFIThreadRequest ## consumer pops here (oldest)
|
||||
tail: ptr FFIThreadRequest ## producers on this queue append here (newest)
|
||||
pad: array[QueuePadBytes, byte]
|
||||
|
||||
RequestQueueBank* = object
|
||||
queues: array[RequestQueueCount, RequestQueue]
|
||||
|
||||
var gRequestQueue {.threadvar.}: int
|
||||
var gRequestQueueAssigned {.threadvar.}: bool
|
||||
var gRequestQueueCounter: Atomic[int]
|
||||
## Hands each producer thread a distinct queue round-robin on first use, so
|
||||
## queues fill evenly regardless of OS thread-id distribution.
|
||||
|
||||
proc myQueueIndex(): int {.raises: [].} =
|
||||
if not gRequestQueueAssigned:
|
||||
gRequestQueue = gRequestQueueCounter.fetchAdd(1)
|
||||
gRequestQueueAssigned = true
|
||||
return gRequestQueue and (RequestQueueCount - 1) # RequestQueueCount is a power of two
|
||||
|
||||
proc initRequestQueue*(bank: var RequestQueueBank) {.raises: [].} =
|
||||
for queue in bank.queues.mitems:
|
||||
queue.lock.initLock()
|
||||
queue.head = nil
|
||||
queue.tail = nil
|
||||
|
||||
proc deinitRequestQueue*(bank: var RequestQueueBank) {.raises: [].} =
|
||||
## Both producers and the consumer must have stopped. Frees any request still
|
||||
## queued on any queue — e.g. one a producer raced in after the FFI thread's
|
||||
## final drain — so a teardown race leaks nothing instead of dangling them.
|
||||
for queue in bank.queues.mitems:
|
||||
var request = queue.head
|
||||
while not request.isNil():
|
||||
let nextRequest = request[].next
|
||||
deleteRequest(request)
|
||||
request = nextRequest
|
||||
queue.head = nil
|
||||
queue.tail = nil
|
||||
queue.lock.deinitLock()
|
||||
|
||||
proc pushRequest*(
|
||||
bank: var RequestQueueBank, request: ptr FFIThreadRequest
|
||||
): bool {.raises: [].} =
|
||||
## Append `request` to this producer thread's queue (takes ownership). Returns
|
||||
## true only when the queue was empty: the consumer sleeps on an empty queue, so
|
||||
## that's the one push that must wake it; a missed wake just waits the 100ms poll.
|
||||
request[].next = nil
|
||||
let idx = myQueueIndex()
|
||||
withLock bank.queues[idx].lock:
|
||||
let wasEmpty = bank.queues[idx].tail.isNil()
|
||||
if bank.queues[idx].tail.isNil():
|
||||
bank.queues[idx].head = request
|
||||
else:
|
||||
bank.queues[idx].tail[].next = request
|
||||
bank.queues[idx].tail = request
|
||||
return wasEmpty
|
||||
|
||||
proc mergeQueues*(bank: var RequestQueueBank): ptr FFIThreadRequest {.raises: [].} =
|
||||
## Single-consumer: splice every queue into one chain, resetting them to empty.
|
||||
## Returns nil when all are empty; the caller then owns the chain and must read
|
||||
## each request's `next` before dispatching (dispatch frees the request).
|
||||
var head: ptr FFIThreadRequest = nil
|
||||
var tail: ptr FFIThreadRequest = nil
|
||||
for queue in bank.queues.mitems:
|
||||
withLock queue.lock:
|
||||
let h = queue.head
|
||||
if not h.isNil():
|
||||
if head.isNil():
|
||||
head = h
|
||||
else:
|
||||
tail[].next = h
|
||||
tail = queue.tail
|
||||
queue.head = nil
|
||||
queue.tail = nil
|
||||
return head
|
||||
@ -4,50 +4,44 @@
|
||||
## and the `onFFIThread` threadvar. Companion to `event_thread.nim`.
|
||||
##
|
||||
## Responsibilities:
|
||||
## - Receive `FFIThreadRequest`s from foreign threads via `reqChannel` and
|
||||
## dispatch them through the user-registered handler table.
|
||||
## - Receive `FFIThreadRequest`s from foreign threads via `reqQueueBank` (a
|
||||
## mutex-guarded MPSC queue) and dispatch them through the user-registered
|
||||
## handler table.
|
||||
## - Advance `ctx.ffiHeartbeat` each loop iteration so the event thread can
|
||||
## detect a wedged FFI thread.
|
||||
|
||||
proc sendRequestToFFIThread*(
|
||||
ctx: ptr FFIContext, ffiRequest: ptr FFIThreadRequest, timeout = InfiniteDuration
|
||||
ctx: ptr FFIContext, ffiRequest: ptr FFIThreadRequest
|
||||
): Result[void, string] =
|
||||
if ctx.eventQueueStuck.load():
|
||||
deleteRequest(ffiRequest)
|
||||
return err("event queue stuck - library cannot accept new requests")
|
||||
|
||||
if onFFIThread:
|
||||
# Re-entrant dispatch from a handler would self-deadlock on `reqReceivedSignal`.
|
||||
# A handler re-dispatching onto its own FFI thread would enqueue work the
|
||||
# blocked dispatcher can never drain; reject instead of dead-locking.
|
||||
deleteRequest(ffiRequest)
|
||||
return err(
|
||||
"reentrant ffi call: a handler invoked sendRequestToFFIThread on its own context"
|
||||
)
|
||||
|
||||
# Serialise the trySend + fireSync + waitSync — reqChannel is SP and reqReceivedSignal is shared.
|
||||
ctx.lock.acquire()
|
||||
defer:
|
||||
ctx.lock.release()
|
||||
# The lock inside pushRequest covers only the O(1) enqueue; the wake stays
|
||||
# outside it, so concurrent producers don't serialise. Unbounded, so enqueue
|
||||
# can't fail — completion comes via the request's own callback, no accept-ack.
|
||||
#
|
||||
# Wake only when the push found the queue empty: while the consumer drains, a
|
||||
# fireSync() syscall per submit (contended across producers) is what destroys
|
||||
# scaling. A skipped wake can't strand the request — the consumer re-polls 100ms.
|
||||
let shouldWake = ctx.reqQueueBank.pushRequest(ffiRequest)
|
||||
|
||||
let sentOk = ctx.reqChannel.trySend(ffiRequest)
|
||||
if not sentOk:
|
||||
deleteRequest(ffiRequest)
|
||||
return err("Couldn't send a request to the ffi thread")
|
||||
# A failed wake is non-fatal: the request is queued and the poll-drain
|
||||
# dispatches it within a tick anyway. Returning err would double-fire the
|
||||
# caller's callback for a request that still completes.
|
||||
if shouldWake:
|
||||
ctx.reqSignal.fireSync().isOkOr:
|
||||
error "failed to wake FFI thread after enqueue (request still queued)",
|
||||
error = error
|
||||
|
||||
let fireSyncRes = ctx.reqSignal.fireSync()
|
||||
if fireSyncRes.isErr():
|
||||
deleteRequest(ffiRequest)
|
||||
return err("failed fireSync: " & $fireSyncRes.error)
|
||||
|
||||
if fireSyncRes.get() == false:
|
||||
deleteRequest(ffiRequest)
|
||||
return err("Couldn't fireSync in time")
|
||||
|
||||
let res = ctx.reqReceivedSignal.waitSync(timeout)
|
||||
if res.isErr():
|
||||
# FFI thread was signaled and owns the request; don't double-free.
|
||||
return err("Couldn't receive reqReceivedSignal signal")
|
||||
|
||||
# On ok the FFI thread's processRequest deallocShared(req)'s.
|
||||
ok()
|
||||
|
||||
proc processRequest[T](
|
||||
@ -88,6 +82,12 @@ proc ffiNotifyEventEnqueuedHook() {.gcsafe, raises: [].} =
|
||||
if res.isErr():
|
||||
error "failed to fire eventQueueSignal after enqueue", err = res.error
|
||||
|
||||
proc proveAlive(ctx: ptr FFIContext) =
|
||||
## Advance the heartbeat the event thread polls to spot a wedged FFI thread.
|
||||
## Only that the counter keeps moving matters, never its value — so a plain
|
||||
## atomic increment, no read-back.
|
||||
ctx.ffiHeartbeat.atomicInc()
|
||||
|
||||
proc ffiThreadBody[T](ctx: ptr FFIContext[T]) {.thread.} =
|
||||
## FFI thread body that attends library user API requests
|
||||
ffiCurrentEventRegistry = addr ctx[].eventRegistry
|
||||
@ -114,7 +114,7 @@ proc ffiThreadBody[T](ctx: ptr FFIContext[T]) {.thread.} =
|
||||
# Tracked so shutdown can drain them; abandoning a mid-await future leaks the request.
|
||||
var pending: seq[Future[void]] = @[]
|
||||
|
||||
proc reapCompleted() =
|
||||
proc cleanFinishedRequests() =
|
||||
var i = 0
|
||||
while i < pending.len:
|
||||
if not pending[i].finished():
|
||||
@ -122,31 +122,41 @@ proc ffiThreadBody[T](ctx: ptr FFIContext[T]) {.thread.} =
|
||||
continue
|
||||
pending.del(i)
|
||||
|
||||
proc processQueue() =
|
||||
## Process enqueued requests until the queue is empty. A single wake can
|
||||
## stand for many submits, so we drain fully rather than once per wake —
|
||||
## otherwise queued requests would sit until the next wake.
|
||||
while true:
|
||||
var request = ctx.reqQueueBank.mergeQueues()
|
||||
if request.isNil():
|
||||
break
|
||||
while not request.isNil():
|
||||
let nextRequest = request[].next # read before processRequest frees it
|
||||
# Tick per dispatch so a large backlog can't flatline the heartbeat
|
||||
# and trip the event thread's wedged-FFI-thread detection mid-drain.
|
||||
ctx.proveAlive()
|
||||
if ctx.myLib.isNil():
|
||||
# This reference must stay inside the closure: it's what keeps
|
||||
# `ffiReqHandler` in the async env, so `myLib` survives across awaits.
|
||||
ctx.myLib = addr ffiReqHandler
|
||||
|
||||
pending.add processRequest(request, ctx)
|
||||
request = nextRequest
|
||||
|
||||
while ctx.running.load():
|
||||
# Freezes if a sync handler blocks the dispatcher; event thread reads to detect wedged FFI thread.
|
||||
discard ctx.ffiHeartbeat.fetchAdd(1)
|
||||
ctx.proveAlive()
|
||||
|
||||
reapCompleted()
|
||||
cleanFinishedRequests()
|
||||
|
||||
let gotSignal = await ctx.reqSignal.wait().withTimeout(chronos.milliseconds(100))
|
||||
if not gotSignal:
|
||||
continue
|
||||
# Block until a submit signals us, or for at most 100ms if none does.
|
||||
discard await ctx.reqSignal.wait().withTimeout(chronos.milliseconds(100))
|
||||
processQueue()
|
||||
|
||||
var request: ptr FFIThreadRequest
|
||||
if not ctx.reqChannel.tryRecv(request):
|
||||
continue
|
||||
|
||||
if ctx.myLib.isNil():
|
||||
ctx.myLib = addr ffiReqHandler
|
||||
|
||||
pending.add processRequest(request, ctx)
|
||||
|
||||
let fireRes = ctx.reqReceivedSignal.fireSync()
|
||||
if fireRes.isErr():
|
||||
error "could not fireSync back to requester thread", error = fireRes.error
|
||||
|
||||
# Drain so each pending handler's deleteRequest defer runs before exit.
|
||||
reapCompleted()
|
||||
# Drain once more so requests enqueued just before `running` flipped still
|
||||
# dispatch and each pending handler's deleteRequest defer runs before exit.
|
||||
processQueue()
|
||||
cleanFinishedRequests()
|
||||
if pending.len > 0:
|
||||
try:
|
||||
await allFutures(pending)
|
||||
|
||||
@ -26,6 +26,10 @@ type FFIThreadRequest* = object
|
||||
reqId*: cstring ## Per-proc Req type name used to look up the handler.
|
||||
data*: ptr UncheckedArray[byte] ## Owned CBOR-encoded request payload.
|
||||
dataLen*: int
|
||||
next*: ptr FFIThreadRequest
|
||||
## Intrusive ingress-queue link (see `ffi_request_queue.nim`). Touched only
|
||||
## under the queue's lock; the request doubles as its own node, so no
|
||||
## separate node alloc lands on the per-thread ORC MemRegion.
|
||||
|
||||
proc allocBaseRequest(
|
||||
callback: FFICallBack, userData: pointer, reqId: cstring
|
||||
@ -39,6 +43,7 @@ proc allocBaseRequest(
|
||||
ret[].reqId = reqId.alloc()
|
||||
ret[].data = nil
|
||||
ret[].dataLen = 0
|
||||
ret[].next = nil
|
||||
return ret
|
||||
|
||||
proc copySharedPayload(req: ptr FFIThreadRequest, data: ptr byte, dataLen: int) =
|
||||
|
||||
@ -857,12 +857,9 @@ macro ffi*(args: varargs[untyped]): untyped =
|
||||
)
|
||||
|
||||
proc asyncPath(): NimNode =
|
||||
## Emits the C-exported wrapper and registers the request handler.
|
||||
## All `.ffi.` procs dispatch through the FFI thread channel and reply
|
||||
## through the callback when the future resolves — the previous "sync
|
||||
## fast-path" that ran inline on the foreign caller thread was removed
|
||||
## (PR #23 review, items 1–5) because it bypassed `foreignThreadGc`,
|
||||
## `ctx.lock`, and chronos's single-thread invariant.
|
||||
## Emits the C-exported wrapper and registers the handler. Every `.ffi.` proc
|
||||
## dispatches through the FFI thread and replies via its callback, honouring
|
||||
## `foreignThreadGc`, the MPSC ingress hand-off, and chronos's invariant.
|
||||
let helperProc = buildAsyncHelperProc()
|
||||
|
||||
# registerReqFFI lambda: typed params, returns user's typed Result.
|
||||
|
||||
@ -9,7 +9,13 @@ This directory holds Nim micro/stress benchmarks. Neither is part of `nimble tes
|
||||
|
||||
`bench_ffi_submit.nim` motivates [issue #90](https://github.com/logos-messaging/nim-ffi/issues/90): every foreign-thread call serialises the whole `trySend + reqSignal.fireSync + reqReceivedSignal.waitSync` cycle under a single `ctx.lock`. The lock is load-bearing because `reqChannel` is single-slot and the accept handshake waits on a *shared* `reqReceivedSignal`, so producers cannot overlap.
|
||||
|
||||
The bench fans **K producer threads (1 → 8)** at one context, each firing the same per-thread volume of no-op requests. It times the **submit phase only** — from the start gate until every producer returns from its last `sendRequestToFFIThread` — because that is the path the fix parallelises; completion is bounded by the single FFI thread and deliberately excluded. Each thread count runs `FFI_SUBMIT_ITERS` times (default 5) and the **median** submit/sec is reported, so run-to-run noise can't move the verdict.
|
||||
The bench fans **K producer threads** at one context (default sweep `1,2,4,8`), each firing the same per-thread volume of no-op requests. It times the **submit phase only** — from the start gate until every producer returns from its last `sendRequestToFFIThread` — because that is the path the fix parallelises; completion is bounded by the single FFI thread and deliberately excluded. Each thread count runs `FFI_SUBMIT_ITERS` times (default 5) and the **median** submit/sec is reported, so run-to-run noise can't move the verdict.
|
||||
|
||||
The high-contention curve (up to 100 producers) is **opt-in for local runs**, not CI: under a sanitizer on a slow runner, 100 threads can't settle their callbacks within the bench's timeout and would fail on time, not on a bug. Run it on demand with `FFI_SUBMIT_THREADS`:
|
||||
|
||||
```sh
|
||||
FFI_SUBMIT_THREADS="1,8,16,32,64,100" nimble bench_ffi_submit
|
||||
```
|
||||
|
||||
It is also a correctness stress test: the aggregate callback count must match the submit count **exactly** (no drops or double-fires), with zero submit errors and (under asan/lsan/tsan) zero leaks or races.
|
||||
|
||||
@ -21,13 +27,26 @@ FFI_SUBMIT_PER_THREAD=2000 FFI_SUBMIT_ITERS=1 FFI_SCALING_GATE=0 nimble bench_ff
|
||||
NIM_FFI_SAN=tsan FFI_SUBMIT_PER_THREAD=2000 FFI_SCALING_GATE=0 nimble bench_ffi_submit
|
||||
```
|
||||
|
||||
Env knobs: `FFI_SUBMIT_PER_THREAD` (volume per producer, default 20000), `FFI_SUBMIT_ITERS` (median sample count, default 5), `FFI_SCALING_GATE` (default `1`; set `0` to report numbers without failing).
|
||||
Env knobs: `FFI_SUBMIT_PER_THREAD` (volume per producer, default 20000), `FFI_SUBMIT_ITERS` (median sample count, default 5), `FFI_SUBMIT_THREADS` (comma-separated producer counts, default `1,2,4,8`), `FFI_SCALING_GATE` (default `1`; set `0` to report numbers without failing).
|
||||
|
||||
### Scaling gate — red until the lock is replaced
|
||||
### Scaling gate — guards the sharded ingress against regression
|
||||
|
||||
By default the bench **fails** (non-zero exit) unless submit throughput at 8 threads is at least `1.5x` the 1-thread rate. This is a forcing function: it cannot pass while `sendRequestToFFIThread` holds `ctx.lock` across the synchronous `reqReceivedSignal` accept, because that serialises every submit no matter how many producers run.
|
||||
By default the bench **fails** (non-zero exit) unless submit throughput at the top thread count is at least `1.5x` the 1-thread rate. This was a forcing function while `sendRequestToFFIThread` still held `ctx.lock` across the synchronous `reqReceivedSignal` accept (which serialised every submit); it is now a regression guard for the sharded ingress that replaced it.
|
||||
|
||||
Baseline measured 2026-06-24 (16-core Linux, orc, `-d:danger`, median of 5): submit scaling held at **0.98–1.16x** across threads — flat, as the lock dictates. `1.5x` sits above that noise ceiling (so the lock-bound code fails reliably) and well below the `>=2x` that parallel lock-free MPSC ingress yields on any multicore host (so the fix clears it with margin). Once it lands and this turns green, keep the gate as a regression guard.
|
||||
The original lock-bound code measured 2026-06-24 (16-core Linux, orc, `-d:danger`, median of 5): submit scaling held flat at **0.98–1.16x** — adding producers bought nothing. `1.5x` sits above that noise ceiling so the old code fails reliably, and well below what the sharded ingress delivers, so the fix clears it with wide margin.
|
||||
|
||||
The fix replaces the single-slot channel + accept handshake with a **sharded, mutex-guarded MPSC ingress** (`ffi/ffi_request_queue.nim`): independent per-producer queues remove the single shared hotspot, and the wake fires only on a queue's empty→non-empty edge so submits don't each pay a syscall. Measured 2026-06-25 (Apple M5, 10 cores, orc, `-d:danger`, median of 5), submit/sec and scaling vs 1 thread:
|
||||
|
||||
| threads | sharded ingress | vs 1T | lock-free MPSC (Vyukov) | vs 1T |
|
||||
| ---: | ---: | :---: | ---: | :---: |
|
||||
| 1 | 2.14M | 1.00x | 1.18M | 1.00x |
|
||||
| 8 | 17.6M | 8.22x | 0.23M | 0.20x |
|
||||
| 16 | 20.3M | 9.52x | 0.13M | 0.11x |
|
||||
| 32 | 23.4M | 10.96x | 0.115M | 0.10x |
|
||||
| 64 | 24.5M | 11.44x | 0.113M | 0.10x |
|
||||
| 100 | 23.8M | **11.13x** | 0.113M | **0.10x** |
|
||||
|
||||
The sharded ingress tracks the hardware ceiling (~24M/s plateau, saturating the cores) and degrades gracefully past it; the single-hotspot lock-free queue collapses to a contention floor and gets *worse* with more threads. Both are correct at every level (callback count matches submits exactly, no drops/dupes).
|
||||
|
||||
The gate runs in the non-sanitized **Submit Scaling Gate** CI job (`.github/workflows/ci.yml`); the sanitized jobs run the same bench with `FFI_SCALING_GATE=0` for leak/race coverage only, since sanitizer instrumentation makes throughput scaling meaningless.
|
||||
|
||||
|
||||
@ -119,7 +119,19 @@ proc main() =
|
||||
let gateOn = getEnv("FFI_SCALING_GATE", "1") != "0"
|
||||
if perThread < 1 or iters < 1:
|
||||
quit("FFI_SUBMIT_PER_THREAD and FFI_SUBMIT_ITERS must be >= 1")
|
||||
let threadCounts = [1, 2, 4, 8]
|
||||
# Default sweep is light so CI (and the slower asan/tsan jobs) stays fast. Set
|
||||
# FFI_SUBMIT_THREADS for the high-contention curve locally — under a sanitizer
|
||||
# it can outrun `settleTimeout` and fail on timing, not a real bug.
|
||||
# FFI_SUBMIT_THREADS="1,8,16,32,64,100" nimble bench_ffi_submit
|
||||
let threadCounts = block:
|
||||
var cs: seq[int]
|
||||
for part in getEnv("FFI_SUBMIT_THREADS", "1,2,4,8").split(','):
|
||||
let p = part.strip()
|
||||
if p.len > 0:
|
||||
cs.add(parseInt(p))
|
||||
if cs.len < 2:
|
||||
quit("FFI_SUBMIT_THREADS needs >= 2 counts (first = baseline, last = peak)")
|
||||
cs
|
||||
|
||||
echo "── sendRequestToFFIThread submit throughput (median of ",
|
||||
iters, ") ──────"
|
||||
|
||||
@ -487,12 +487,9 @@ suite "Nim-native .ffi. / .ffiCtor. API":
|
||||
check ctorRes.isOk
|
||||
check ctorRes.value.value == 21
|
||||
|
||||
# Regression for PR #23 review items 1–5: a `.ffi.` body without `await`
|
||||
# used to be emitted as an inline-on-foreign-thread fast path, which bypassed
|
||||
# `foreignThreadGc`, `ctx.lock`, and chronos's single-thread invariant. The
|
||||
# sync fast-path was deleted; this test records `getThreadId()` inside a
|
||||
# sync body and asserts the handler runs on the FFI thread, not on the
|
||||
# caller's thread.
|
||||
# A sync `.ffi.` body (no `await`) must run on the FFI thread, not the caller's,
|
||||
# so it goes through `foreignThreadGc`, the MPSC ingress hand-off, and chronos's
|
||||
# single-thread invariant. Records `getThreadId()` in a sync body and asserts it.
|
||||
|
||||
var gRecordedHandlerTid: Atomic[int]
|
||||
|
||||
@ -551,11 +548,9 @@ suite "sync-body .ffi. runs on FFI thread (PR #23 regression)":
|
||||
# And the callback payload (the recorded tid) matches what the handler stored.
|
||||
check cborDecode(callbackBytes(d), int).value == handlerTid
|
||||
|
||||
# Regression for PR #23 review item 6: reentrancy guard on
|
||||
# sendRequestToFFIThread. A handler running on the FFI thread that tries to
|
||||
# dispatch back through sendRequestToFFIThread used to self-deadlock waiting
|
||||
# on `reqReceivedSignal` (which only the FFI thread can fire). The guard now
|
||||
# returns an Err immediately.
|
||||
# Reentrancy guard on sendRequestToFFIThread: a handler running on the FFI thread
|
||||
# that re-dispatches through it would enqueue work onto the very queue its blocked
|
||||
# dispatcher can never drain, so the guard returns an Err immediately.
|
||||
|
||||
var gReentrantNestedRes: Channel[string]
|
||||
gReentrantNestedRes.open()
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user