nim-ffi/ffi/ffi_context.nim
Ivan FB 0e176bd5eb
fix(ffi): set up foreign-thread GC in entry procs; recycle/event cleanup
- Call initializeLibrary() (setupForeignThreadGc) in the `.ffi.` request
  wrapper and in add/remove_event_listener so a foreign (Go) caller thread
  has an initialised Nim heap before any allocation ($reqTypeName /
  $eventName / registry ops). Without it such a thread segfaults in the
  allocator under GC pressure — the production unwrap SIGSEGV.
- recycleContext resets the event registry/queue + stuck flag on park so a
  reused pool slot starts clean.
- ffiDtor doc/cleanup for the async recycle ABI.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-12 11:28:48 +02:00

274 lines
11 KiB
Nim

## FFIContext type plus lifecycle (init / signal-stop / join / destroy).
##
## The per-thread bodies live in `ffi_thread.nim` and `event_thread.nim`,
## included below so the thread code can access the private FFIContext
## fields without forcing them through a public surface.
# Pass -fPIC to the C compiler for this module's generated code.
{.passc: "-fPIC".}

# Embedded in a foreign host (Go/Rust/...) the host must own OS signal handling;
# Nim installing its own handlers clobbers it (e.g. Go's SIGSEGV -> sigpanic).
# Enforce -d:noSignalHandler; standalone Nim binaries opt out via
# -d:ffiAllowSignalHandler. When neither symbol is defined the build is aborted
# at compile time with the message below.
when not defined(noSignalHandler) and not defined(ffiAllowSignalHandler):
  {.
    error:
      "nim-ffi: missing required compile flag. If this library is embedded in a " &
      "host process (Go/Rust/...), build with -d:noSignalHandler so the host keeps " &
      "ownership of OS signal handlers (it needs SIGSEGV for crash recovery, stack " &
      "growth and preemption). If instead this is a standalone Nim program that owns " &
      "its own process, build with -d:ffiAllowSignalHandler."
  .}
import std/[atomics, locks, options, sequtils, tables]
import chronicles, chronos, chronos/threadsync, taskpools/channels_spsc_single, results
import ./ffi_types, ./ffi_events, ./ffi_thread_request, ./logging, ./cbor_serial
export ffi_events
type CtxLifecycle {.pure.} = enum
  ## State machine guarding a pooled FFI context, held as an Atomic on FFIContext.
  ##
  ## The threads, signals and dispatcher kqueues are created once per slot and
  ## REUSED across acquire/release — chronos never frees a dispatcher's kqueue fd
  ## (design decision; freed only at process exit), so spawning a thread per
  ## context would leak fds unboundedly. Recycling parks the context instead.
  ##
  ## Transitions:
  ## * Active -> RecyclePending: when the destructor is invoked
  ##   (see `requestRecycle`).
  ## * RecyclePending -> Recycling: the FFI loop drains handlers, frees the lib
  ##   and releases the slot.
  ## * Recycling -> Active: the next createFFIContext reuses the slot
  ##   (see `markAsActive`).
  ##
  ## `Active` is the first member, so a zero-initialised context starts Active.
  Active ## accepting and serving requests
  RecyclePending ## recycle requested; FFI thread loop hasn't claimed it yet
  Recycling ## FFI loop draining handlers, then frees lib + returns to pool
type FFIContext*[T] = object
  ## State of one FFI context: the wrapped library object, the two worker
  ## threads (FFI thread and event thread), the signals used to coordinate
  ## them, and the recycle/pool bookkeeping. Non-exported fields are meant to
  ## be touched only by this module and the files it includes.
  myLib*: ptr T # main library object (Waku, LibP2P, SDS, …)
  ffiThread: Thread[(ptr FFIContext[T])]
  eventThread: Thread[(ptr FFIContext[T])]
  lock: Lock
  # `lock` is held by requestRecycle while it checks `lifecycle` and stores the
  # recycle callback/userData.
  reqChannel: ChannelSPSCSingle[ptr FFIThreadRequest]
  reqSignal: ThreadSignalPtr
  # `reqSignal` is fired to wake the FFI thread (signalStop, requestRecycle).
  reqReceivedSignal: ThreadSignalPtr
  # NOTE(review): presumably acknowledges that the FFI thread picked up a
  # request — confirm in ffi_thread.nim.
  stopSignal: ThreadSignalPtr # fired by signalStop
  threadExitSignal: ThreadSignalPtr
  # `threadExitSignal` bounds destroyFFIContext's wait so a blocked loop cannot
  # hang the caller; stopAndJoinThreads waits on it before joining ffiThread.
  eventQueueSignal: ThreadSignalPtr # wakes the event thread on enqueue
  eventThreadExitSignal: ThreadSignalPtr # mirrors threadExitSignal for the event thread
  userData*: pointer
  eventRegistry*: FFIEventRegistry
  eventQueue*: EventQueue
  ffiHeartbeat*: Atomic[int64]
  # `ffiHeartbeat` is advanced each FFI-thread loop; the event thread reads it
  # for liveness. Reset to 0 by initContextResources.
  eventQueueStuck*: Atomic[bool] # sticky overflow flag
  running: Atomic[bool] # To control when the threads are running
  lifecycle: Atomic[CtxLifecycle] # Active / RecyclePending / Recycling
  recycleCallback: FFICallBack
  # `recycleCallback` is the destructor's callback, fired by the recycle handler
  # with the outcome: RET_OK once drained, RET_ERR if it timed out. Set by
  # requestRecycle, together with `recycleUserData`.
  recycleUserData: pointer
  inUse: Atomic[bool]
  # `inUse` tells whether the slot is claimed; createFFIContext claims it, the
  # recycle handler clears it once drained so the owning thread can release
  # without reaching into the pool.
  registeredRequests: ptr Table[cstring, FFIRequestProc]
  # `registeredRequests` is pointed at `ffi_types.registeredRequests` by
  # initContextResources.
var onFFIThread* {.threadvar.}: bool
# Re-entrant dispatch guard for `sendRequestToFFIThread`. Being a threadvar,
# every thread has its own copy of the flag.
# NOTE(review): presumably set to true only on the FFI thread itself — confirm
# in ffi_thread.nim.
# Version string; "n/a" unless overridden at compile time through the
# `strdefine` mechanism (-d:git_version=...).
const git_version* {.strdefine.} = "n/a"
# Timing knobs for the event thread's liveness monitoring of the FFI thread.
# NOTE(review): the exact way these are combined lives in event_thread.nim —
# confirm there before changing any value.
const
  EventThreadTickInterval* = 1.seconds
    # presumably the period of the event thread's wake-up tick — TODO confirm
  FFIHeartbeatStartDelay* = 10.seconds # grace window for library startup
  FFIHeartbeatStaleThreshold* = 1.seconds
    # presumably how old `ffiHeartbeat` may get before it counts as stale —
    # TODO confirm
proc tryClaim*[T](ctx: ptr FFIContext[T]): bool =
  ## Atomically flips `inUse` from false to true.
  ## Yields true when this call claimed the slot, false when the slot was
  ## already in use.
  var wantedPrevious = false
  result = compareExchange(ctx.inUse, wantedPrevious, true)
proc release*[T](ctx: ptr FFIContext[T]) =
  ## Marks the slot as free again by clearing `inUse`.
  store(ctx.inUse, false)
proc isInUse*[T](ctx: ptr FFIContext[T]): bool =
  ## Reports whether the slot is currently claimed (atomic read of `inUse`).
  result = load(ctx.inUse)
proc markAsActive*[T](ctx: ptr FFIContext[T]) =
  ## Puts a reused (recycled) slot back into the `Active` lifecycle state so it
  ## accepts requests again.
  store(ctx.lifecycle, CtxLifecycle.Active)
include ./event_thread
include ./ffi_thread
template closeAndNil(field: untyped) =
  ## Closes `field` when it is set and then resets it to nil. A failing
  ## `close()` is propagated out of the enclosing proc through `?`, in which
  ## case the field keeps its value.
  if not isNil(field):
    ?close(field)
    field = nil
proc deinitContextResources*[T](ctx: ptr FFIContext[T]): Result[void, string] =
  ## Counterpart of `initContextResources`: tears down the lock, the event
  ## registry, the event queue and (outside refc) the thread signals.
  ## Both worker threads MUST already be joined. Each signal is nil'd once
  ## closed, so the same slot can be initialised again afterwards.
  ## Returns the first `close()` error, if any.
  deinitLock(ctx.lock)
  deinitEventRegistry(ctx[].eventRegistry)
  deinitEventQueue(ctx[].eventQueue)

  # Under refc the signals are deliberately left open:
  # ThreadSignalPtr.close() under refc traps in safeUnregisterAndCloseFd
  # → newDispatcher → rawNewObj → signal-handler re-entry (process hangs).
  # See tests/test_ffi_context.nim "destroyFFIContext refc workaround".
  # Fd leak is bounded — destroy runs once per process lifetime.
  when not defined(gcRefc):
    closeAndNil(ctx.reqSignal)
    closeAndNil(ctx.reqReceivedSignal)
    closeAndNil(ctx.stopSignal)
    closeAndNil(ctx.threadExitSignal)
    closeAndNil(ctx.eventQueueSignal)
    closeAndNil(ctx.eventThreadExitSignal)

  ok()
proc cleanUpResources[T](ctx: ptr FFIContext[T]): Result[void, string] =
  ## Tear-down for heap-allocated contexts: deinitialises the resources and
  ## then frees `ctx` itself. The memory is released whatever the deinit
  ## outcome is; that outcome is what gets returned.
  try:
    result = ctx.deinitContextResources()
  finally:
    freeShared(ctx)
template newSignalOrErr(field: untyped, name: string) =
  ## Creates a fresh ThreadSignalPtr and stores it in `field`. When creation
  ## fails, the enclosing proc returns an error mentioning `name`.
  let created = ThreadSignalPtr.new()
  if created.isErr():
    return err("couldn't create ThreadSignalPtr: " & name & ": " & $(created.error))
  field = created.get()
proc initContextResources*[T](ctx: ptr FFIContext[T]): Result[void, string] =
  ## Initialises every per-context resource of `ctx` (lock, event registry,
  ## event queue, heartbeat/stuck flags, thread signals) and starts the FFI
  ## thread and the event thread.
  ##
  ## Returns `ok()` once both threads have been created. On failure returns an
  ## error string; the deferred cleanup closes the partial state but does NOT
  ## free `ctx` — the caller releases the slot (freeShared or
  ## pool.releaseSlot).
  # Nil first so deferred cleanup can't double-close a reused pool slot.
  ctx.reqSignal = nil
  ctx.reqReceivedSignal = nil
  ctx.stopSignal = nil
  ctx.threadExitSignal = nil
  ctx.eventQueueSignal = nil
  ctx.eventThreadExitSignal = nil

  ctx.lock.initLock()
  initEventRegistry(ctx[].eventRegistry)
  initEventQueue(ctx[].eventQueue)
  ctx.ffiHeartbeat.store(0)
  ctx.eventQueueStuck.store(false)

  var success = false
  defer:
    if not success:
      # Deinit only. `cleanUpResources` would additionally `freeShared(ctx)`,
      # which the caller is responsible for: freeing here would double-free a
      # heap context and free pool-owned memory for a pooled slot.
      ctx.deinitContextResources().isOkOr:
        error "failed to clean up resources after createFFIContext failure",
          error = error

  newSignalOrErr(ctx.reqSignal, "reqSignal")
  newSignalOrErr(ctx.reqReceivedSignal, "reqReceivedSignal")
  newSignalOrErr(ctx.stopSignal, "stopSignal")
  newSignalOrErr(ctx.threadExitSignal, "threadExitSignal")
  newSignalOrErr(ctx.eventQueueSignal, "eventQueueSignal")
  newSignalOrErr(ctx.eventThreadExitSignal, "eventThreadExitSignal")

  ctx.registeredRequests = addr ffi_types.registeredRequests
  ctx.lifecycle.store(CtxLifecycle.Active)
  # Must be true before the threads start.
  # NOTE(review): presumably their loops exit when it is false — confirm.
  ctx.running.store(true)

  try:
    createThread(ctx.ffiThread, ffiThreadBody[T], ctx)
  except ValueError, ResourceExhaustedError:
    return err("failed to create the FFI thread: " & getCurrentExceptionMsg())

  try:
    createThread(ctx.eventThread, eventThreadBody[T], ctx)
  except ValueError, ResourceExhaustedError:
    # Join ffiThread before deferred cleanup closes signals it's waiting on.
    ctx.running.store(false)
    let fireRes = ctx.reqSignal.fireSync()
    if fireRes.isErr():
      error "failed to signal ffiThread during event-thread cleanup",
        error = fireRes.error
    joinThread(ctx.ffiThread)
    return err("failed to create the event thread: " & getCurrentExceptionMsg())

  success = true
  ok()
proc fireOrErr(sig: ThreadSignalPtr, name: string): Result[void, string] =
  ## Fires `sig` synchronously. Both a `fireSync` error and a "not fired"
  ## outcome become an error string that mentions `name`.
  let outcome = sig.fireSync()
  if outcome.isErr():
    return err("error signaling: " & name & ": " & $(outcome.error))
  if outcome.get():
    return ok()
  err("failed to signal: " & name & " on time")
proc reachedExitOrTimedOut(sig: ThreadSignalPtr, timeout: Duration): bool =
  ## Bounded, best-effort check made before joining a thread that was told to
  ## stop.
  ##
  ## The answer is false in exactly one situation: `waitSync` completed without
  ## error and reported that the exit signal was NOT seen within `timeout`. The
  ## thread may then be wedged, and the caller should skip the join to avoid
  ## hanging.
  ##
  ## Every other outcome yields true, a `waitSync` error included. `waitSync`
  ## relies on `select()`, which returns EINVAL once a signal fd exceeds
  ## FD_SETSIZE under load. Such an error is NOT evidence of a stuck thread (it
  ## was already signaled to stop and the async event loop that drives its exit
  ## is unaffected), so the caller goes on to the authoritative, fd-free
  ## joinThread instead of spuriously failing teardown and leaking the pool
  ## slot.
  let outcome = sig.waitSync(timeout)
  # `get()` is evaluated only when the wait itself succeeded.
  outcome.isErr() or outcome.get()
proc signalStop*[T](ctx: ptr FFIContext[T]): Result[void, string] =
  ## Asks both worker threads to stop: clears `running`, then fires
  ## `reqSignal`, `stopSignal` and `eventQueueSignal`. Failing to fire either
  ## of the first two aborts with an error.
  # Skip onNotResponding on error: it takes reg.lock, which a back-pressuring
  # listener may hold — would deepen the stuck state into a deadlock.
  store(ctx.running, false)
  ?fireOrErr(ctx.reqSignal, "reqSignal")
  ?fireOrErr(ctx.stopSignal, "stopSignal")

  # Non-fatal: event thread sees running==false on the next tick anyway.
  let queueWake = fireOrErr(ctx.eventQueueSignal, "eventQueueSignal")
  if queueWake.isErr():
    error "failed to signal eventQueueSignal in signalStop", error = queueWake.error
  ok()
## Bound on how long `stopAndJoinThreads` (and therefore clearContext) waits
## for a worker thread's exit signal before leaking ctx rather than hanging
## the caller. The bound is applied separately to the FFI thread and to the
## event thread.
const ThreadExitTimeout* = 1500.milliseconds
proc stopAndJoinThreads*[T](ctx: ptr FFIContext[T]): Result[void, string] =
  ## Signals both worker threads to stop and joins them, FFI thread first.
  ## If a thread's exit signal is not observed in time the proc returns err
  ## right away and performs no further joins, leaving the threads live.
  ## Resource cleanup stays with the caller. onNotResponding is not invoked
  ## here, for the same reason as in signalStop.
  let stopRes = ctx.signalStop()
  if stopRes.isErr():
    return err("signalStop failed: " & $(stopRes.error))

  let ffiExited = reachedExitOrTimedOut(ctx.threadExitSignal, ThreadExitTimeout)
  if not ffiExited:
    return err("FFI thread did not exit in time (leaking ctx to avoid hang)")
  joinThread(ctx.ffiThread)

  let eventExited =
    reachedExitOrTimedOut(ctx.eventThreadExitSignal, ThreadExitTimeout)
  if not eventExited:
    return err("event thread did not exit in time (leaking ctx to avoid hang)")
  joinThread(ctx.eventThread)

  ok()
proc clearContext[T](ctx: ptr FFIContext[T]): Result[void, string] =
  ## Shuts down a heap-allocated FFI context: stops and joins its threads,
  ## then deinitialises and frees it. If the threads cannot be joined, `ctx`
  ## is left untouched (not freed) and the error is returned.
  let joined = ctx.stopAndJoinThreads()
  if joined.isErr():
    return err("clearContext: " & $(joined.error))

  let cleaned = ctx.cleanUpResources()
  if cleaned.isErr():
    return err("cleanUpResources failed: " & $(cleaned.error))

  ok()
proc requestRecycle*[T](
    ctx: ptr FFIContext[T], callback: FFICallBack, userData: pointer
): Result[void, string] =
  ## Kicks off the recycle of `ctx` while keeping its worker threads alive, so
  ## that the next createFFIContext can reuse the same threads, signals and
  ## kqueue fds. Does not block.
  ##
  ## The FFI thread loop is the one doing the work: it drains the in-flight
  ## handlers, frees the lib, clears the per-context state, releases the slot
  ## and finally fires `callback` (RET_OK drained, RET_ERR stuck).
  ##
  ## Fails when the context is not `Active`, or when the FFI thread could not
  ## be signaled.
  var accepted = false
  # The lifecycle check and the callback registration happen under the lock.
  withLock ctx.lock:
    if ctx.lifecycle.load() == CtxLifecycle.Active:
      ctx.recycleCallback = callback
      ctx.recycleUserData = userData
      ctx.lifecycle.store(CtxLifecycle.RecyclePending)
      accepted = true
  if not accepted:
    return err("requestRecycle: context is not Active (already recycling)")

  # Wake the FFI thread so its loop notices the pending recycle.
  let wake = ctx.reqSignal.fireSync()
  if wake.isErr():
    return err("requestRecycle: failed to signal the FFI thread: " & $(wake.error))
  if not wake.get():
    return err("requestRecycle: failed to signal the FFI thread in time")
  ok()