feat(ffi): sharded MPSC request ingress (alternative to lock-free #98) (#101)

Co-authored-by: Gabriel Cruz <8129788+gmelodie@users.noreply.github.com>
2026-06-30 13:19:31 +00:00 · 2026-06-27 19:08:10 +02:00 · 2026-06-27 19:08:10 +02:00 · 5d49ee6b08
commit 5d49ee6b08
parent 3d7b267ef0
10 changed files with 243 additions and 99 deletions
--- a/examples/timer/rust_bindings/src/api.rs
+++ b/examples/timer/rust_bindings/src/api.rs
@ -129,12 +129,13 @@ pub struct MyTimerCtx {

 // SAFETY: The `ptr` field points to an FFIContext owned by the Nim runtime.
 // Every call through the generated FFI proc goes through
-// `sendRequestToFFIThread` on the Nim side, which serialises every request
-// behind `ctx.lock` and dispatches handlers on a single FFI thread, so the
-// pointer is never accessed concurrently from Rust. The Nim-side reentrancy
-// guard (`onFFIThread` threadvar) prevents handlers from re-entering the
-// dispatcher and self-deadlocking. These invariants make it sound to mark
-// the wrapper as Send + Sync.
+// `sendRequestToFFIThread` on the Nim side, which only enqueues the request
+// onto a mutex-guarded MPSC queue (sound from any number of threads) and
+// wakes the single FFI thread that dispatches every handler. The context is
+// thus never mutated non-atomically from the caller's thread. The Nim-side
+// reentrancy guard (`onFFIThread` threadvar) prevents handlers from
+// re-entering the dispatcher. These invariants make it sound to mark the
+// wrapper as Send + Sync.
 unsafe impl Send for MyTimerCtx {}
 unsafe impl Sync for MyTimerCtx {}

--- a/ffi/codegen/rust.nim
+++ b/ffi/codegen/rust.nim
@ -505,19 +505,18 @@ proc generateApiRs*(
  )
  lines.add("// Every call through the generated FFI proc goes through")
  lines.add(
-    "// `sendRequestToFFIThread` on the Nim side, which serialises every request"
+    "// `sendRequestToFFIThread` on the Nim side, which only enqueues the request"
+  )
+  lines.add("// onto a mutex-guarded MPSC queue (sound from any number of threads) and")
+  lines.add(
+    "// wakes the single FFI thread that dispatches every handler. The context is"
  )
  lines.add(
-    "// behind `ctx.lock` and dispatches handlers on a single FFI thread, so the"
+    "// thus never mutated non-atomically from the caller's thread. The Nim-side"
  )
-  lines.add(
-    "// pointer is never accessed concurrently from Rust. The Nim-side reentrancy"
-  )
-  lines.add("// guard (`onFFIThread` threadvar) prevents handlers from re-entering the")
-  lines.add(
-    "// dispatcher and self-deadlocking. These invariants make it sound to mark"
-  )
-  lines.add("// the wrapper as Send + Sync.")
+  lines.add("// reentrancy guard (`onFFIThread` threadvar) prevents handlers from")
+  lines.add("// re-entering the dispatcher. These invariants make it sound to mark the")
+  lines.add("// wrapper as Send + Sync.")
  lines.add("unsafe impl Send for $1 {}" % [ctxTypeName])
  lines.add("unsafe impl Sync for $1 {}" % [ctxTypeName])
  lines.add("")
--- a/ffi/ffi_context.nim
+++ b/ffi/ffi_context.nim
@ -7,12 +7,13 @@
 {.passc: "-fPIC".}

 import std/[atomics, locks, options, tables]
-import chronicles, chronos, chronos/threadsync, taskpools/channels_spsc_single, results
+import chronicles, chronos, chronos/threadsync, results
 import
  ./ffi_types,
  ./ffi_events,
  ./ffi_handles,
  ./ffi_thread_request,
+  ./ffi_request_queue,
  ./logging,
  ./cbor_serial

@ -22,10 +23,8 @@ type FFIContext*[T] = object
  myLib*: ptr T # main library object (Waku, LibP2P, SDS, …)
  ffiThread: Thread[(ptr FFIContext[T])]
  eventThread: Thread[(ptr FFIContext[T])]
-  lock: Lock
-  reqChannel: ChannelSPSCSingle[ptr FFIThreadRequest]
-  reqSignal: ThreadSignalPtr
-  reqReceivedSignal: ThreadSignalPtr
+  reqQueueBank: RequestQueueBank # mutex-guarded MPSC ingress from foreign threads
+  reqSignal: ThreadSignalPtr # wakes the FFI thread on enqueue
  stopSignal: ThreadSignalPtr
  threadExitSignal: ThreadSignalPtr
    # bounds destroyFFIContext's wait so a blocked loop cannot hang the caller
@ -60,9 +59,10 @@ template closeAndNil(field: untyped) =
    field = nil

 proc deinitContextResources*[T](ctx: ptr FFIContext[T]): Result[void, string] =
-  ## Mirror of `initContextResources`. Threads MUST be joined first;
-  ## fields are nil'd after close so re-init on the same slot is safe.
-  ctx.lock.deinitLock()
+  ## Mirror of `initContextResources`. Threads MUST be joined first (FFI thread
+  ## drained); fields are nil'd after close so re-init on the same slot is safe.
+  ## `deinitRequestQueue` frees any request raced in after the final drain.
+  deinitRequestQueue(ctx[].reqQueueBank)
  deinitEventRegistry(ctx[].eventRegistry)
  deinitHandleRegistry(ctx[].handles)
  deinitEventQueue(ctx[].eventQueue)
@ -74,7 +74,6 @@ proc deinitContextResources*[T](ctx: ptr FFIContext[T]): Result[void, string] =
    discard
  else:
    closeAndNil(ctx.reqSignal)
-    closeAndNil(ctx.reqReceivedSignal)
    closeAndNil(ctx.stopSignal)
    closeAndNil(ctx.threadExitSignal)
    closeAndNil(ctx.eventQueueSignal)
@ -96,12 +95,11 @@ proc initContextResources*[T](ctx: ptr FFIContext[T]): Result[void, string] =
  ## the slot (freeShared or pool.releaseSlot).
  # Nil first so deferred cleanup can't double-close a reused pool slot.
  ctx.reqSignal = nil
-  ctx.reqReceivedSignal = nil
  ctx.stopSignal = nil
  ctx.threadExitSignal = nil
  ctx.eventQueueSignal = nil
  ctx.eventThreadExitSignal = nil
-  ctx.lock.initLock()
+  initRequestQueue(ctx[].reqQueueBank)
  initEventRegistry(ctx[].eventRegistry)
  initHandleRegistry(ctx[].handles)
  initEventQueue(ctx[].eventQueue)
@ -116,7 +114,6 @@ proc initContextResources*[T](ctx: ptr FFIContext[T]): Result[void, string] =
          error = error

  newSignalOrErr(ctx.reqSignal, "reqSignal")
-  newSignalOrErr(ctx.reqReceivedSignal, "reqReceivedSignal")
  newSignalOrErr(ctx.stopSignal, "stopSignal")
  newSignalOrErr(ctx.threadExitSignal, "threadExitSignal")
  newSignalOrErr(ctx.eventQueueSignal, "eventQueueSignal")
--- a/ffi/ffi_request_queue.nim
+++ b/ffi/ffi_request_queue.nim
@ -0,0 +1,109 @@
+## Sharded, mutex-guarded MPSC ingress for `ptr FFIThreadRequest`: foreign
+## threads enqueue without serialising against each other.
+##
+## Why sharded: one shared queue funnels all producers through a single cache
+## line, capping submit throughput. N independent queues (one per producer)
+## remove that hotspot — producers contend only when two pick the same queue.
+##
+## Each queue is an intrusive FIFO under its own `Lock`: race-free under TSAN, and
+## the request is its own node (intrusive `next`), so enqueue never allocates nor
+## touches a Nim GC heap (the cross-thread `MemRegion` hazard).
+##
+## FIFO holds per queue, not globally. Unbounded by design: submit never blocks
+## or rejects; completion comes via each request's callback.
+
+import std/[atomics, locks]
+import ./ffi_thread_request
+
+const
+  RequestQueueCount* = 16
+    ## Independent ingress queues. ≥ the expected concurrent producer count keeps
+    ## queue collisions (hence lock contention) near zero.
+  QueuePadBytes = 192
+    ## Pads each queue well past a cache line (128B on Apple silicon) so adjacent
+    ## queues' hot fields never false-share — false sharing would re-serialise
+    ## exactly what the sharding is meant to spread out.
+
+static:
+  # `myQueueIndex` maps threads to queues with an `and` mask, so the count must
+  # be a power of two — otherwise the distribution silently skews onto a subset.
+  doAssert (RequestQueueCount and (RequestQueueCount - 1)) == 0,
+    "RequestQueueCount must be a power of two"
+
+type
+  RequestQueue = object
+    lock: Lock
+    head: ptr FFIThreadRequest ## consumer pops here (oldest)
+    tail: ptr FFIThreadRequest ## producers on this queue append here (newest)
+    pad: array[QueuePadBytes, byte]
+
+  RequestQueueBank* = object
+    queues: array[RequestQueueCount, RequestQueue]
+
+var gRequestQueue {.threadvar.}: int
+var gRequestQueueAssigned {.threadvar.}: bool
+var gRequestQueueCounter: Atomic[int]
+  ## Hands each producer thread a distinct queue round-robin on first use, so
+  ## queues fill evenly regardless of OS thread-id distribution.
+
+proc myQueueIndex(): int {.raises: [].} =
+  if not gRequestQueueAssigned:
+    gRequestQueue = gRequestQueueCounter.fetchAdd(1)
+    gRequestQueueAssigned = true
+  return gRequestQueue and (RequestQueueCount - 1) # RequestQueueCount is a power of two
+
+proc initRequestQueue*(bank: var RequestQueueBank) {.raises: [].} =
+  for queue in bank.queues.mitems:
+    queue.lock.initLock()
+    queue.head = nil
+    queue.tail = nil
+
+proc deinitRequestQueue*(bank: var RequestQueueBank) {.raises: [].} =
+  ## Both producers and the consumer must have stopped. Frees any request still
+  ## queued on any queue — e.g. one a producer raced in after the FFI thread's
+  ## final drain — so a teardown race leaks nothing instead of dangling them.
+  for queue in bank.queues.mitems:
+    var request = queue.head
+    while not request.isNil():
+      let nextRequest = request[].next
+      deleteRequest(request)
+      request = nextRequest
+    queue.head = nil
+    queue.tail = nil
+    queue.lock.deinitLock()
+
+proc pushRequest*(
+    bank: var RequestQueueBank, request: ptr FFIThreadRequest
+): bool {.raises: [].} =
+  ## Append `request` to this producer thread's queue (takes ownership). Returns
+  ## true only when the queue was empty: the consumer sleeps on an empty queue, so
+  ## that's the one push that must wake it; a missed wake just waits the 100ms poll.
+  request[].next = nil
+  let idx = myQueueIndex()
+  withLock bank.queues[idx].lock:
+    let wasEmpty = bank.queues[idx].tail.isNil()
+    if bank.queues[idx].tail.isNil():
+      bank.queues[idx].head = request
+    else:
+      bank.queues[idx].tail[].next = request
+    bank.queues[idx].tail = request
+    return wasEmpty
+
+proc mergeQueues*(bank: var RequestQueueBank): ptr FFIThreadRequest {.raises: [].} =
+  ## Single-consumer: splice every queue into one chain, resetting them to empty.
+  ## Returns nil when all are empty; the caller then owns the chain and must read
+  ## each request's `next` before dispatching (dispatch frees the request).
+  var head: ptr FFIThreadRequest = nil
+  var tail: ptr FFIThreadRequest = nil
+  for queue in bank.queues.mitems:
+    withLock queue.lock:
+      let h = queue.head
+      if not h.isNil():
+        if head.isNil():
+          head = h
+        else:
+          tail[].next = h
+        tail = queue.tail
+        queue.head = nil
+        queue.tail = nil
+  return head
--- a/ffi/ffi_thread.nim
+++ b/ffi/ffi_thread.nim
@ -4,50 +4,44 @@
 ## and the `onFFIThread` threadvar. Companion to `event_thread.nim`.
 ##
 ## Responsibilities:
-## - Receive `FFIThreadRequest`s from foreign threads via `reqChannel` and
-##   dispatch them through the user-registered handler table.
+## - Receive `FFIThreadRequest`s from foreign threads via `reqQueueBank` (a
+##   mutex-guarded MPSC queue) and dispatch them through the user-registered
+##   handler table.
 ## - Advance `ctx.ffiHeartbeat` each loop iteration so the event thread can
 ##   detect a wedged FFI thread.

 proc sendRequestToFFIThread*(
-    ctx: ptr FFIContext, ffiRequest: ptr FFIThreadRequest, timeout = InfiniteDuration
+    ctx: ptr FFIContext, ffiRequest: ptr FFIThreadRequest
 ): Result[void, string] =
  if ctx.eventQueueStuck.load():
    deleteRequest(ffiRequest)
    return err("event queue stuck - library cannot accept new requests")

  if onFFIThread:
-    # Re-entrant dispatch from a handler would self-deadlock on `reqReceivedSignal`.
+    # A handler re-dispatching onto its own FFI thread would enqueue work the
+    # blocked dispatcher can never drain; reject instead of dead-locking.
    deleteRequest(ffiRequest)
    return err(
      "reentrant ffi call: a handler invoked sendRequestToFFIThread on its own context"
    )

-  # Serialise the trySend + fireSync + waitSync — reqChannel is SP and reqReceivedSignal is shared.
-  ctx.lock.acquire()
-  defer:
-    ctx.lock.release()
+  # The lock inside pushRequest covers only the O(1) enqueue; the wake stays
+  # outside it, so concurrent producers don't serialise. Unbounded, so enqueue
+  # can't fail — completion comes via the request's own callback, no accept-ack.
+  #
+  # Wake only when the push found the queue empty: while the consumer drains, a
+  # fireSync() syscall per submit (contended across producers) is what destroys
+  # scaling. A skipped wake can't strand the request — the consumer re-polls 100ms.
+  let shouldWake = ctx.reqQueueBank.pushRequest(ffiRequest)

-  let sentOk = ctx.reqChannel.trySend(ffiRequest)
-  if not sentOk:
-    deleteRequest(ffiRequest)
-    return err("Couldn't send a request to the ffi thread")
+  # A failed wake is non-fatal: the request is queued and the poll-drain
+  # dispatches it within a tick anyway. Returning err would double-fire the
+  # caller's callback for a request that still completes.
+  if shouldWake:
+    ctx.reqSignal.fireSync().isOkOr:
+      error "failed to wake FFI thread after enqueue (request still queued)",
+        error = error

-  let fireSyncRes = ctx.reqSignal.fireSync()
-  if fireSyncRes.isErr():
-    deleteRequest(ffiRequest)
-    return err("failed fireSync: " & $fireSyncRes.error)
-
-  if fireSyncRes.get() == false:
-    deleteRequest(ffiRequest)
-    return err("Couldn't fireSync in time")
-
-  let res = ctx.reqReceivedSignal.waitSync(timeout)
-  if res.isErr():
-    # FFI thread was signaled and owns the request; don't double-free.
-    return err("Couldn't receive reqReceivedSignal signal")
-
-  # On ok the FFI thread's processRequest deallocShared(req)'s.
  ok()

 proc processRequest[T](
@ -88,6 +82,12 @@ proc ffiNotifyEventEnqueuedHook() {.gcsafe, raises: [].} =
    if res.isErr():
      error "failed to fire eventQueueSignal after enqueue", err = res.error

+proc proveAlive(ctx: ptr FFIContext) =
+  ## Advance the heartbeat the event thread polls to spot a wedged FFI thread.
+  ## Only that the counter keeps moving matters, never its value — so a plain
+  ## atomic increment, no read-back.
+  ctx.ffiHeartbeat.atomicInc()
+
 proc ffiThreadBody[T](ctx: ptr FFIContext[T]) {.thread.} =
  ## FFI thread body that attends library user API requests
  ffiCurrentEventRegistry = addr ctx[].eventRegistry
@ -114,7 +114,7 @@ proc ffiThreadBody[T](ctx: ptr FFIContext[T]) {.thread.} =
    # Tracked so shutdown can drain them; abandoning a mid-await future leaks the request.
    var pending: seq[Future[void]] = @[]

-    proc reapCompleted() =
+    proc cleanFinishedRequests() =
      var i = 0
      while i < pending.len:
        if not pending[i].finished():
@ -122,31 +122,41 @@ proc ffiThreadBody[T](ctx: ptr FFIContext[T]) {.thread.} =
          continue
        pending.del(i)

+    proc processQueue() =
+      ## Process enqueued requests until the queue is empty. A single wake can
+      ## stand for many submits, so we drain fully rather than once per wake —
+      ## otherwise queued requests would sit until the next wake.
+      while true:
+        var request = ctx.reqQueueBank.mergeQueues()
+        if request.isNil():
+          break
+        while not request.isNil():
+          let nextRequest = request[].next # read before processRequest frees it
+          # Tick per dispatch so a large backlog can't flatline the heartbeat
+          # and trip the event thread's wedged-FFI-thread detection mid-drain.
+          ctx.proveAlive()
+          if ctx.myLib.isNil():
+            # This reference must stay inside the closure: it's what keeps
+            # `ffiReqHandler` in the async env, so `myLib` survives across awaits.
+            ctx.myLib = addr ffiReqHandler
+
+          pending.add processRequest(request, ctx)
+          request = nextRequest
+
    while ctx.running.load():
      # Freezes if a sync handler blocks the dispatcher; event thread reads to detect wedged FFI thread.
-      discard ctx.ffiHeartbeat.fetchAdd(1)
+      ctx.proveAlive()

-      reapCompleted()
+      cleanFinishedRequests()

-      let gotSignal = await ctx.reqSignal.wait().withTimeout(chronos.milliseconds(100))
-      if not gotSignal:
-        continue
+      # Block until a submit signals us, or for at most 100ms if none does.
+      discard await ctx.reqSignal.wait().withTimeout(chronos.milliseconds(100))
+      processQueue()

-      var request: ptr FFIThreadRequest
-      if not ctx.reqChannel.tryRecv(request):
-        continue
-
-      if ctx.myLib.isNil():
-        ctx.myLib = addr ffiReqHandler
-
-      pending.add processRequest(request, ctx)
-
-      let fireRes = ctx.reqReceivedSignal.fireSync()
-      if fireRes.isErr():
-        error "could not fireSync back to requester thread", error = fireRes.error
-
-    # Drain so each pending handler's deleteRequest defer runs before exit.
-    reapCompleted()
+    # Drain once more so requests enqueued just before `running` flipped still
+    # dispatch and each pending handler's deleteRequest defer runs before exit.
+    processQueue()
+    cleanFinishedRequests()
    if pending.len > 0:
      try:
        await allFutures(pending)
--- a/ffi/ffi_thread_request.nim
+++ b/ffi/ffi_thread_request.nim
@ -26,6 +26,10 @@ type FFIThreadRequest* = object
  reqId*: cstring ## Per-proc Req type name used to look up the handler.
  data*: ptr UncheckedArray[byte] ## Owned CBOR-encoded request payload.
  dataLen*: int
+  next*: ptr FFIThreadRequest
+    ## Intrusive ingress-queue link (see `ffi_request_queue.nim`). Touched only
+    ## under the queue's lock; the request doubles as its own node, so no
+    ## separate node alloc lands on the per-thread ORC MemRegion.

 proc allocBaseRequest(
    callback: FFICallBack, userData: pointer, reqId: cstring
@ -39,6 +43,7 @@ proc allocBaseRequest(
  ret[].reqId = reqId.alloc()
  ret[].data = nil
  ret[].dataLen = 0
+  ret[].next = nil
  return ret

 proc copySharedPayload(req: ptr FFIThreadRequest, data: ptr byte, dataLen: int) =
--- a/ffi/internal/ffi_macro.nim
+++ b/ffi/internal/ffi_macro.nim
@ -857,12 +857,9 @@ macro ffi*(args: varargs[untyped]): untyped =
    )

  proc asyncPath(): NimNode =
-    ## Emits the C-exported wrapper and registers the request handler.
-    ## All `.ffi.` procs dispatch through the FFI thread channel and reply
-    ## through the callback when the future resolves — the previous "sync
-    ## fast-path" that ran inline on the foreign caller thread was removed
-    ## (PR #23 review, items 1–5) because it bypassed `foreignThreadGc`,
-    ## `ctx.lock`, and chronos's single-thread invariant.
+    ## Emits the C-exported wrapper and registers the handler. Every `.ffi.` proc
+    ## dispatches through the FFI thread and replies via its callback, honouring
+    ## `foreignThreadGc`, the MPSC ingress hand-off, and chronos's invariant.
    let helperProc = buildAsyncHelperProc()

    # registerReqFFI lambda: typed params, returns user's typed Result.
--- a/tests/bench/README.md
+++ b/tests/bench/README.md
@ -9,7 +9,13 @@ This directory holds Nim micro/stress benchmarks. Neither is part of `nimble tes

 `bench_ffi_submit.nim` motivates [issue #90](https://github.com/logos-messaging/nim-ffi/issues/90): every foreign-thread call serialises the whole `trySend + reqSignal.fireSync + reqReceivedSignal.waitSync` cycle under a single `ctx.lock`. The lock is load-bearing because `reqChannel` is single-slot and the accept handshake waits on a *shared* `reqReceivedSignal`, so producers cannot overlap.

-The bench fans **K producer threads (1 → 8)** at one context, each firing the same per-thread volume of no-op requests. It times the **submit phase only** — from the start gate until every producer returns from its last `sendRequestToFFIThread` — because that is the path the fix parallelises; completion is bounded by the single FFI thread and deliberately excluded. Each thread count runs `FFI_SUBMIT_ITERS` times (default 5) and the **median** submit/sec is reported, so run-to-run noise can't move the verdict.
+The bench fans **K producer threads** at one context (default sweep `1,2,4,8`), each firing the same per-thread volume of no-op requests. It times the **submit phase only** — from the start gate until every producer returns from its last `sendRequestToFFIThread` — because that is the path the fix parallelises; completion is bounded by the single FFI thread and deliberately excluded. Each thread count runs `FFI_SUBMIT_ITERS` times (default 5) and the **median** submit/sec is reported, so run-to-run noise can't move the verdict.
+
+The high-contention curve (up to 100 producers) is **opt-in for local runs**, not CI: under a sanitizer on a slow runner, 100 threads can't settle their callbacks within the bench's timeout and would fail on time, not on a bug. Run it on demand with `FFI_SUBMIT_THREADS`:
+
+```sh
+FFI_SUBMIT_THREADS="1,8,16,32,64,100" nimble bench_ffi_submit
+```

 It is also a correctness stress test: the aggregate callback count must match the submit count **exactly** (no drops or double-fires), with zero submit errors and (under asan/lsan/tsan) zero leaks or races.

@ -21,13 +27,26 @@ FFI_SUBMIT_PER_THREAD=2000 FFI_SUBMIT_ITERS=1 FFI_SCALING_GATE=0 nimble bench_ff
 NIM_FFI_SAN=tsan FFI_SUBMIT_PER_THREAD=2000 FFI_SCALING_GATE=0 nimble bench_ffi_submit
 ```

-Env knobs: `FFI_SUBMIT_PER_THREAD` (volume per producer, default 20000), `FFI_SUBMIT_ITERS` (median sample count, default 5), `FFI_SCALING_GATE` (default `1`; set `0` to report numbers without failing).
+Env knobs: `FFI_SUBMIT_PER_THREAD` (volume per producer, default 20000), `FFI_SUBMIT_ITERS` (median sample count, default 5), `FFI_SUBMIT_THREADS` (comma-separated producer counts, default `1,2,4,8`), `FFI_SCALING_GATE` (default `1`; set `0` to report numbers without failing).

-### Scaling gate — red until the lock is replaced
+### Scaling gate — guards the sharded ingress against regression

-By default the bench **fails** (non-zero exit) unless submit throughput at 8 threads is at least `1.5x` the 1-thread rate. This is a forcing function: it cannot pass while `sendRequestToFFIThread` holds `ctx.lock` across the synchronous `reqReceivedSignal` accept, because that serialises every submit no matter how many producers run.
+By default the bench **fails** (non-zero exit) unless submit throughput at the top thread count is at least `1.5x` the 1-thread rate. This was a forcing function while `sendRequestToFFIThread` still held `ctx.lock` across the synchronous `reqReceivedSignal` accept (which serialised every submit); it is now a regression guard for the sharded ingress that replaced it.

-Baseline measured 2026-06-24 (16-core Linux, orc, `-d:danger`, median of 5): submit scaling held at **0.98–1.16x** across threads — flat, as the lock dictates. `1.5x` sits above that noise ceiling (so the lock-bound code fails reliably) and well below the `>=2x` that parallel lock-free MPSC ingress yields on any multicore host (so the fix clears it with margin). Once it lands and this turns green, keep the gate as a regression guard.
+The original lock-bound code measured 2026-06-24 (16-core Linux, orc, `-d:danger`, median of 5): submit scaling held flat at **0.98–1.16x** — adding producers bought nothing. `1.5x` sits above that noise ceiling so the old code fails reliably, and well below what the sharded ingress delivers, so the fix clears it with wide margin.
+
+The fix replaces the single-slot channel + accept handshake with a **sharded, mutex-guarded MPSC ingress** (`ffi/ffi_request_queue.nim`): independent per-producer queues remove the single shared hotspot, and the wake fires only on a queue's empty→non-empty edge so submits don't each pay a syscall. Measured 2026-06-25 (Apple M5, 10 cores, orc, `-d:danger`, median of 5), submit/sec and scaling vs 1 thread:
+
+| threads | sharded ingress | vs 1T | lock-free MPSC (Vyukov) | vs 1T |
+| ---: | ---: | :---: | ---: | :---: |
+| 1   | 2.14M | 1.00x | 1.18M | 1.00x |
+| 8   | 17.6M | 8.22x | 0.23M | 0.20x |
+| 16  | 20.3M | 9.52x | 0.13M | 0.11x |
+| 32  | 23.4M | 10.96x | 0.115M | 0.10x |
+| 64  | 24.5M | 11.44x | 0.113M | 0.10x |
+| 100 | 23.8M | **11.13x** | 0.113M | **0.10x** |
+
+The sharded ingress tracks the hardware ceiling (~24M/s plateau, saturating the cores) and degrades gracefully past it; the single-hotspot lock-free queue collapses to a contention floor and gets *worse* with more threads. Both are correct at every level (callback count matches submits exactly, no drops/dupes).

 The gate runs in the non-sanitized **Submit Scaling Gate** CI job (`.github/workflows/ci.yml`); the sanitized jobs run the same bench with `FFI_SCALING_GATE=0` for leak/race coverage only, since sanitizer instrumentation makes throughput scaling meaningless.

--- a/tests/bench/bench_ffi_submit.nim
+++ b/tests/bench/bench_ffi_submit.nim
@ -119,7 +119,19 @@ proc main() =
  let gateOn = getEnv("FFI_SCALING_GATE", "1") != "0"
  if perThread < 1 or iters < 1:
    quit("FFI_SUBMIT_PER_THREAD and FFI_SUBMIT_ITERS must be >= 1")
-  let threadCounts = [1, 2, 4, 8]
+  # Default sweep is light so CI (and the slower asan/tsan jobs) stays fast. Set
+  # FFI_SUBMIT_THREADS for the high-contention curve locally — under a sanitizer
+  # it can outrun `settleTimeout` and fail on timing, not a real bug.
+  #   FFI_SUBMIT_THREADS="1,8,16,32,64,100" nimble bench_ffi_submit
+  let threadCounts = block:
+    var cs: seq[int]
+    for part in getEnv("FFI_SUBMIT_THREADS", "1,2,4,8").split(','):
+      let p = part.strip()
+      if p.len > 0:
+        cs.add(parseInt(p))
+    if cs.len < 2:
+      quit("FFI_SUBMIT_THREADS needs >= 2 counts (first = baseline, last = peak)")
+    cs

  echo "── sendRequestToFFIThread submit throughput (median of ",
    iters, ") ──────"
--- a/tests/unit/test_ffi_context.nim
+++ b/tests/unit/test_ffi_context.nim
@ -487,12 +487,9 @@ suite "Nim-native .ffi. / .ffiCtor. API":
    check ctorRes.isOk
    check ctorRes.value.value == 21

-# Regression for PR #23 review items 1–5: a `.ffi.` body without `await`
-# used to be emitted as an inline-on-foreign-thread fast path, which bypassed
-# `foreignThreadGc`, `ctx.lock`, and chronos's single-thread invariant. The
-# sync fast-path was deleted; this test records `getThreadId()` inside a
-# sync body and asserts the handler runs on the FFI thread, not on the
-# caller's thread.
+# A sync `.ffi.` body (no `await`) must run on the FFI thread, not the caller's,
+# so it goes through `foreignThreadGc`, the MPSC ingress hand-off, and chronos's
+# single-thread invariant. Records `getThreadId()` in a sync body and asserts it.

 var gRecordedHandlerTid: Atomic[int]

@ -551,11 +548,9 @@ suite "sync-body .ffi. runs on FFI thread (PR #23 regression)":
    # And the callback payload (the recorded tid) matches what the handler stored.
    check cborDecode(callbackBytes(d), int).value == handlerTid

-# Regression for PR #23 review item 6: reentrancy guard on
-# sendRequestToFFIThread. A handler running on the FFI thread that tries to
-# dispatch back through sendRequestToFFIThread used to self-deadlock waiting
-# on `reqReceivedSignal` (which only the FFI thread can fire). The guard now
-# returns an Err immediately.
+# Reentrancy guard on sendRequestToFFIThread: a handler running on the FFI thread
+# that re-dispatches through it would enqueue work onto the very queue its blocked
+# dispatcher can never drain, so the guard returns an Err immediately.

 var gReentrantNestedRes: Channel[string]
 gReentrantNestedRes.open()