## FFIContext type plus lifecycle (init / signal-stop / join / destroy). ## ## The per-thread bodies live in `ffi_thread.nim` and `event_thread.nim`, ## included below so the thread code can access the private FFIContext ## fields without forcing them through a public surface. {.passc: "-fPIC".} # Embedded in a foreign host (Go/Rust/...) the host must own OS signal handling; # Nim installing its own handlers clobbers it (e.g. Go's SIGSEGV -> sigpanic). # Enforce -d:noSignalHandler; standalone Nim binaries opt out via -d:ffiAllowSignalHandler. when not defined(noSignalHandler) and not defined(ffiAllowSignalHandler): {. error: "nim-ffi: missing required compile flag. If this library is embedded in a " & "host process (Go/Rust/...), build with -d:noSignalHandler so the host keeps " & "ownership of OS signal handlers (it needs SIGSEGV for crash recovery, stack " & "growth and preemption). If instead this is a standalone Nim program that owns " & "its own process, build with -d:ffiAllowSignalHandler." .} import std/[atomics, locks, options, sequtils, tables] import chronicles, chronos, chronos/threadsync, taskpools/channels_spsc_single, results import ./ffi_types, ./ffi_events, ./ffi_thread_request, ./logging, ./cbor_serial export ffi_events type CtxLifecycle {.pure.} = enum ## State machine guarding a pooled FFI context, held as an Atomic on FFIContext. ## The threads, signals and dispatcher kqueues are created once per slot and ## REUSED across acquire/release — chronos never frees a dispatcher's kqueue fd ## (design decision; freed only at process exit), so spawning a thread per ## context would leak fds unboundedly. Recycling parks the context instead. ## Transitions: ## Active -> RecyclePending when the destructor is invoked ## RecyclePending -> Recycling FFI loop drains handlers, frees lib, releases slot ## Recycling -> Active next createFFIContext reuses the slot (markAsActive) Active ## accepting and serving requests RecyclePending ## recycle requested; FFI thread loop hasn't claimed it yet Recycling ## FFI loop draining handlers, then frees lib + returns to pool type FFIContext*[T] = object myLib*: ptr T # main library object (Waku, LibP2P, SDS, …) ffiThread: Thread[(ptr FFIContext[T])] eventThread: Thread[(ptr FFIContext[T])] lock: Lock reqChannel: ChannelSPSCSingle[ptr FFIThreadRequest] reqSignal: ThreadSignalPtr reqReceivedSignal: ThreadSignalPtr stopSignal: ThreadSignalPtr threadExitSignal: ThreadSignalPtr # bounds destroyFFIContext's wait so a blocked loop cannot hang the caller eventQueueSignal: ThreadSignalPtr # wakes the event thread on enqueue eventThreadExitSignal: ThreadSignalPtr # mirrors threadExitSignal for the event thread userData*: pointer eventRegistry*: FFIEventRegistry eventQueue*: EventQueue ffiHeartbeat*: Atomic[int64] # advanced each FFI-thread loop; event thread reads for liveness eventQueueStuck*: Atomic[bool] # sticky overflow flag running: Atomic[bool] # To control when the threads are running lifecycle: Atomic[CtxLifecycle] # Active / RecyclePending / Recycling recycleCallback: FFICallBack # destructor's callback, fired by the recycle handler with the outcome: # RET_OK once drained, RET_ERR if it timed out. Set by requestRecycle. recycleUserData: pointer inUse: Atomic[bool] # whether the slot is claimed; createFFIContext claims it, the recycle # handler clears it once drained so the owning thread can release without # reaching into the pool. registeredRequests: ptr Table[cstring, FFIRequestProc] var onFFIThread* {.threadvar.}: bool # Re-entrant dispatch guard for `sendRequestToFFIThread`. const git_version* {.strdefine.} = "n/a" const EventThreadTickInterval* = 1.seconds FFIHeartbeatStartDelay* = 10.seconds # grace window for library startup FFIHeartbeatStaleThreshold* = 1.seconds proc tryClaim*[T](ctx: ptr FFIContext[T]): bool = ## Returns true if the slot was free and is now claimed, false if already in use. var expected = false ctx.inUse.compareExchange(expected, true) proc release*[T](ctx: ptr FFIContext[T]) = ctx.inUse.store(false) proc isInUse*[T](ctx: ptr FFIContext[T]): bool = ctx.inUse.load() proc markAsActive*[T](ctx: ptr FFIContext[T]) = ## Re-arms a reused (recycled) slot to accept requests again. ctx.lifecycle.store(CtxLifecycle.Active) include ./event_thread include ./ffi_thread template closeAndNil(field: untyped) = if not field.isNil(): ?field.close() field = nil proc deinitContextResources*[T](ctx: ptr FFIContext[T]): Result[void, string] = ## Mirror of `initContextResources`. Threads MUST be joined first; ## fields are nil'd after close so re-init on the same slot is safe. ctx.lock.deinitLock() deinitEventRegistry(ctx[].eventRegistry) deinitEventQueue(ctx[].eventQueue) when defined(gcRefc): # ThreadSignalPtr.close() under refc traps in safeUnregisterAndCloseFd # → newDispatcher → rawNewObj → signal-handler re-entry (process hangs). # See tests/test_ffi_context.nim "destroyFFIContext refc workaround". # Fd leak is bounded — destroy runs once per process lifetime. discard else: closeAndNil(ctx.reqSignal) closeAndNil(ctx.reqReceivedSignal) closeAndNil(ctx.stopSignal) closeAndNil(ctx.threadExitSignal) closeAndNil(ctx.eventQueueSignal) closeAndNil(ctx.eventThreadExitSignal) ok() proc cleanUpResources[T](ctx: ptr FFIContext[T]): Result[void, string] = ## Deinit + free for heap-allocated contexts. defer: freeShared(ctx) ctx.deinitContextResources() template newSignalOrErr(field: untyped, name: string) = field = ThreadSignalPtr.new().valueOr: return err("couldn't create ThreadSignalPtr: " & name & ": " & $error) proc initContextResources*[T](ctx: ptr FFIContext[T]): Result[void, string] = ## On failure, the deferred cleanup closes partial state; caller releases ## the slot (freeShared or pool.releaseSlot). # Nil first so deferred cleanup can't double-close a reused pool slot. ctx.reqSignal = nil ctx.reqReceivedSignal = nil ctx.stopSignal = nil ctx.threadExitSignal = nil ctx.eventQueueSignal = nil ctx.eventThreadExitSignal = nil ctx.lock.initLock() initEventRegistry(ctx[].eventRegistry) initEventQueue(ctx[].eventQueue) ctx.ffiHeartbeat.store(0) ctx.eventQueueStuck.store(false) var success = false defer: if not success: ctx.cleanUpResources().isOkOr: error "failed to clean up resources after createFFIContext failure", error = error newSignalOrErr(ctx.reqSignal, "reqSignal") newSignalOrErr(ctx.reqReceivedSignal, "reqReceivedSignal") newSignalOrErr(ctx.stopSignal, "stopSignal") newSignalOrErr(ctx.threadExitSignal, "threadExitSignal") newSignalOrErr(ctx.eventQueueSignal, "eventQueueSignal") newSignalOrErr(ctx.eventThreadExitSignal, "eventThreadExitSignal") ctx.registeredRequests = addr ffi_types.registeredRequests ctx.lifecycle.store(CtxLifecycle.Active) ctx.running.store(true) try: createThread(ctx.ffiThread, ffiThreadBody[T], ctx) except ValueError, ResourceExhaustedError: return err("failed to create the FFI thread: " & getCurrentExceptionMsg()) try: createThread(ctx.eventThread, eventThreadBody[T], ctx) except ValueError, ResourceExhaustedError: # Join ffiThread before deferred cleanup closes signals it's waiting on. ctx.running.store(false) let fireRes = ctx.reqSignal.fireSync() if fireRes.isErr(): error "failed to signal ffiThread during event-thread cleanup", error = fireRes.error joinThread(ctx.ffiThread) return err("failed to create the event thread: " & getCurrentExceptionMsg()) success = true ok() proc fireOrErr(sig: ThreadSignalPtr, name: string): Result[void, string] = let fired = sig.fireSync().valueOr: return err("error signaling: " & name & ": " & $error) if not fired: return err("failed to signal: " & name & " on time") ok() proc reachedExitOrTimedOut(sig: ThreadSignalPtr, timeout: Duration): bool = ## Best-effort bounded pre-check before joining a stopping thread. ## Returns false ONLY on a genuine timeout (the exit signal was not observed ## within `timeout`, so the thread may be wedged and the caller should skip ## the join to avoid hanging). Returns true otherwise — including when ## `waitSync` itself errors: it uses `select()`, which returns EINVAL once a ## signal fd exceeds FD_SETSIZE under load. That error is NOT evidence the ## thread is stuck (it was already signaled to stop and the async event loop ## that drives its exit is unaffected), so we proceed to the authoritative, ## fd-free joinThread rather than spuriously failing teardown and leaking the ## pool slot. let waited = sig.waitSync(timeout) if waited.isOk() and not waited.get(): return false # genuine timeout true proc signalStop*[T](ctx: ptr FFIContext[T]): Result[void, string] = # Skip onNotResponding on error: it takes reg.lock, which a back-pressuring # listener may hold — would deepen the stuck state into a deadlock. ctx.running.store(false) ?ctx.reqSignal.fireOrErr("reqSignal") ?ctx.stopSignal.fireOrErr("stopSignal") # Non-fatal: event thread sees running==false on the next tick anyway. ctx.eventQueueSignal.fireOrErr("eventQueueSignal").isOkOr: error "failed to signal eventQueueSignal in signalStop", error = error ok() ## Bound on how long clearContext waits for the FFI thread to exit before ## leaking ctx rather than hanging the caller. const ThreadExitTimeout* = 1500.milliseconds proc stopAndJoinThreads*[T](ctx: ptr FFIContext[T]): Result[void, string] = ## On timeout, returns err and skips remaining joins (leaves threads live). ## Caller owns resource cleanup. Skips onNotResponding (same reason as signalStop). ctx.signalStop().isOkOr: return err("signalStop failed: " & $error) if not ctx.threadExitSignal.reachedExitOrTimedOut(ThreadExitTimeout): return err("FFI thread did not exit in time (leaking ctx to avoid hang)") joinThread(ctx.ffiThread) if not ctx.eventThreadExitSignal.reachedExitOrTimedOut(ThreadExitTimeout): return err("event thread did not exit in time (leaking ctx to avoid hang)") joinThread(ctx.eventThread) ok() proc clearContext[T](ctx: ptr FFIContext[T]): Result[void, string] = ## Stops a heap-allocated FFI context. ctx.stopAndJoinThreads().isOkOr: return err("clearContext: " & $error) ctx.cleanUpResources().isOkOr: return err("cleanUpResources failed: " & $error) ok() proc requestRecycle*[T]( ctx: ptr FFIContext[T], callback: FFICallBack, userData: pointer ): Result[void, string] = ## Starts the context's recycle WITHOUT stopping its worker threads, so the ## next createFFIContext reuses the same threads, signals and kqueue fds. ## The FFI thread loop drains the in-flight handlers, frees the lib, clears the ## per-context state and releases the slot, then fires `callback` ## (RET_OK drained, RET_ERR stuck). Non-blocking. ctx.lock.acquire() if ctx.lifecycle.load() != CtxLifecycle.Active: ctx.lock.release() return err("requestRecycle: context is not Active (already recycling)") ctx.recycleCallback = callback ctx.recycleUserData = userData ctx.lifecycle.store(CtxLifecycle.RecyclePending) ctx.lock.release() let fired = ctx.reqSignal.fireSync().valueOr: return err("requestRecycle: failed to signal the FFI thread: " & $error) if not fired: return err("requestRecycle: failed to signal the FFI thread in time") ok()