mirror of
https://github.com/logos-messaging/nim-ffi.git
synced 2026-06-21 00:40:16 +00:00
Merge b156b84ff3fe1df89b1c4e2dc034672ceded88b7 into 021f46904171401338c9c49648c06e55f8f59aeb
This commit is contained in:
commit
30f833e355
4
.github/workflows/ci.yml
vendored
4
.github/workflows/ci.yml
vendored
@ -2,9 +2,9 @@ name: CI
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [master, main]
|
||||
branches: [master, main, 'release/*']
|
||||
pull_request:
|
||||
branches: [master, main]
|
||||
branches: [master, main, 'release/*']
|
||||
|
||||
jobs:
|
||||
# Single source of truth for Nim / Nimble versions used by every job and
|
||||
|
||||
2
.github/workflows/test.yml
vendored
2
.github/workflows/test.yml
vendored
@ -83,4 +83,4 @@ jobs:
|
||||
if [ "$RUNNER_OS" == "Windows" ]; then
|
||||
export PATH="$GITHUB_WORKSPACE/.nim_runtime/bin:$HOME/.nimble/bin:$PATH"
|
||||
fi
|
||||
nim c -r --mm:${{ matrix.mm }} -d:chronicles_log_level=WARN tests/unit/${{ inputs.test }}.nim
|
||||
nim c -r --mm:${{ matrix.mm }} -d:chronicles_log_level=WARN -d:ffiAllowSignalHandler tests/unit/${{ inputs.test }}.nim
|
||||
|
||||
37
CHANGELOG.md
37
CHANGELOG.md
@ -4,32 +4,12 @@ All notable changes to this project are documented in this file.
|
||||
|
||||
## [Unreleased]
|
||||
|
||||
### Changed
|
||||
- User event callbacks now run on a dedicated event thread fed by a
|
||||
bounded SPSC queue (default capacity 1024), so a slow listener can no
|
||||
longer block the FFI thread or concurrent `add_event_listener` /
|
||||
`remove_event_listener` calls
|
||||
([#6](https://github.com/logos-messaging/nim-ffi/issues/6)).
|
||||
- Replaced the dedicated watchdog thread with a heartbeat check that
|
||||
runs on the event thread. The FFI thread advances an atomic heartbeat
|
||||
each loop iteration; if it stalls for more than 1s past the start-up
|
||||
grace window, the event thread emits the `not_responding` event.
|
||||
|
||||
### Added
|
||||
- Queue-overflow handling: when the bounded event queue is full, the
|
||||
library sets a sticky "stuck" flag, logs an error, fires
|
||||
`not_responding` from the event thread, and rejects subsequent
|
||||
`sendRequestToFFIThread` calls with `event queue stuck - library
|
||||
cannot accept new requests`.
|
||||
|
||||
## [0.2.0] - 2026-06-04
|
||||
|
||||
Major release introducing the CBOR-based wire format, CBOR-backed FFI events
|
||||
with a multi-listener registry, multi-language binding generation (C++, Rust,
|
||||
CDDL), CI hardening with sanitizers, and several robustness fixes around
|
||||
context lifetime and memory safety.
|
||||
|
||||
### Added
|
||||
- **CBOR serialization** as the FFI wire format, replacing the previous
|
||||
JSON/string-based `serial.nim`
|
||||
([#23](https://github.com/logos-messaging/nim-ffi/pull/23)).
|
||||
@ -64,6 +44,21 @@ context lifetime and memory safety.
|
||||
([#41](https://github.com/logos-messaging/nim-ffi/pull/41)).
|
||||
|
||||
### Changed
|
||||
- Enforce `-d:noSignalHandler` at compile time: the build now fails with an
|
||||
actionable message unless the consumer sets `-d:noSignalHandler` (libraries
|
||||
embedded in a foreign host such as Go/Rust) or `-d:ffiAllowSignalHandler`
|
||||
(standalone Nim binaries). Prevents the Nim runtime from installing its own
|
||||
OS signal handlers and clobbering the host's — the cause of a real status-go
|
||||
regression.
|
||||
- User event callbacks now run on a dedicated event thread fed by a
|
||||
bounded SPSC queue (default capacity 1024), so a slow listener can no
|
||||
longer block the FFI thread or concurrent `add_event_listener` /
|
||||
`remove_event_listener` calls
|
||||
([#6](https://github.com/logos-messaging/nim-ffi/issues/6)).
|
||||
- Replaced the dedicated watchdog thread with a heartbeat check that
|
||||
runs on the event thread. The FFI thread advances an atomic heartbeat
|
||||
each loop iteration; if it stalls for more than 1s past the start-up
|
||||
grace window, the event thread emits the `not_responding` event.
|
||||
- Removed the redundant `ffiType` macro; the `ffi` macro is now the single
|
||||
authoring entry point
|
||||
([#22](https://github.com/logos-messaging/nim-ffi/pull/22)).
|
||||
@ -91,7 +86,7 @@ context lifetime and memory safety.
|
||||
and `rust_client`, the latter with a Tokio async variant) (#15).
|
||||
- JSON/string-based FFI (de)serialization via `ffi/serial.nim`
|
||||
(`ffiSerialize`/`ffiDeserialize`), with `tests/test_serial.nim` coverage.
|
||||
(CBOR replaced this layer later, in 0.2.0.)
|
||||
(CBOR replaced this layer later.)
|
||||
- FFI context pool (`ffi/ffi_context_pool.nim`) using a fixed array of contexts.
|
||||
- Test suite expansion: `test_alloc.nim`, `test_ctx_validation.nim`,
|
||||
`test_ffi_context.nim`, `test_gc_compat.nim`.
|
||||
|
||||
@ -58,6 +58,7 @@ add_custom_command(
|
||||
COMMAND "${NIM_EXECUTABLE}" c
|
||||
--mm:orc
|
||||
-d:chronicles_log_level=WARN
|
||||
-d:noSignalHandler
|
||||
--app:lib
|
||||
--noMain
|
||||
"--nimMainPrefix:libecho"
|
||||
|
||||
@ -406,7 +406,7 @@ typedef void (*FFICallback)(int ret, const char* msg, size_t len, void* user_dat
|
||||
void* echo_create(const uint8_t* req_cbor, size_t req_cbor_len, FFICallback callback, void* user_data);
|
||||
int echo_shout(void* ctx, FFICallback callback, void* user_data, const uint8_t* req_cbor, size_t req_cbor_len);
|
||||
int echo_version(void* ctx, FFICallback callback, void* user_data, const uint8_t* req_cbor, size_t req_cbor_len);
|
||||
int echo_destroy(void* ctx);
|
||||
int echo_destroy(void* ctx, FFICallback callback, void* user_data);
|
||||
uint64_t echo_add_event_listener(void* ctx, const char* event_name, FFICallback callback, void* user_data);
|
||||
int echo_remove_event_listener(void* ctx, uint64_t listener_id);
|
||||
} // extern "C"
|
||||
@ -516,7 +516,14 @@ public:
|
||||
// context.
|
||||
~EchoCtx() {
|
||||
if (ptr_) {
|
||||
echo_destroy(ptr_);
|
||||
// `echo_destroy` is non-blocking at the C ABI: it parks the
|
||||
// context for reuse and reports the outcome via the callback. Block
|
||||
// here until that callback fires so the pool slot is fully drained
|
||||
// and parked before this object goes away — otherwise a rapid
|
||||
// create/destroy churn could outrun the recycle and exhaust the pool.
|
||||
(void)ffi_call_([this](FFICallback cb, void* ud) {
|
||||
return echo_destroy(ptr_, cb, ud);
|
||||
}, timeout_);
|
||||
ptr_ = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
@ -9,7 +9,7 @@ requires "nim >= 2.2.4"
|
||||
requires "chronos"
|
||||
requires "chronicles"
|
||||
requires "taskpools"
|
||||
requires "https://github.com/logos-messaging/nim-ffi >= 0.2.0"
|
||||
requires "https://github.com/logos-messaging/nim-ffi >= 0.1.4"
|
||||
|
||||
const nimFlags = "--mm:orc -d:chronicles_log_level=WARN"
|
||||
|
||||
|
||||
@ -58,6 +58,7 @@ add_custom_command(
|
||||
COMMAND "${NIM_EXECUTABLE}" c
|
||||
--mm:orc
|
||||
-d:chronicles_log_level=WARN
|
||||
-d:noSignalHandler
|
||||
--app:lib
|
||||
--noMain
|
||||
"--nimMainPrefix:libmy_timer"
|
||||
|
||||
@ -706,7 +706,7 @@ int my_timer_echo(void* ctx, FFICallback callback, void* user_data, const uint8_
|
||||
int my_timer_version(void* ctx, FFICallback callback, void* user_data, const uint8_t* req_cbor, size_t req_cbor_len);
|
||||
int my_timer_complex(void* ctx, FFICallback callback, void* user_data, const uint8_t* req_cbor, size_t req_cbor_len);
|
||||
int my_timer_schedule(void* ctx, FFICallback callback, void* user_data, const uint8_t* req_cbor, size_t req_cbor_len);
|
||||
int my_timer_destroy(void* ctx);
|
||||
int my_timer_destroy(void* ctx, FFICallback callback, void* user_data);
|
||||
uint64_t my_timer_add_event_listener(void* ctx, const char* event_name, FFICallback callback, void* user_data);
|
||||
int my_timer_remove_event_listener(void* ctx, uint64_t listener_id);
|
||||
} // extern "C"
|
||||
@ -816,7 +816,14 @@ public:
|
||||
// context.
|
||||
~MyTimerCtx() {
|
||||
if (ptr_) {
|
||||
my_timer_destroy(ptr_);
|
||||
// `my_timer_destroy` is non-blocking at the C ABI: it parks the
|
||||
// context for reuse and reports the outcome via the callback. Block
|
||||
// here until that callback fires so the pool slot is fully drained
|
||||
// and parked before this object goes away — otherwise a rapid
|
||||
// create/destroy churn could outrun the recycle and exhaust the pool.
|
||||
(void)ffi_call_([this](FFICallback cb, void* ud) {
|
||||
return my_timer_destroy(ptr_, cb, ud);
|
||||
}, timeout_);
|
||||
ptr_ = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
@ -32,6 +32,7 @@ fn main() {
|
||||
cmd.arg("c")
|
||||
.arg("--mm:orc")
|
||||
.arg("-d:chronicles_log_level=WARN")
|
||||
.arg("-d:noSignalHandler")
|
||||
.arg("--app:lib")
|
||||
.arg("--noMain")
|
||||
.arg(format!("--nimMainPrefix:libmy_timer"))
|
||||
|
||||
@ -141,7 +141,9 @@ unsafe impl Sync for MyTimerCtx {}
|
||||
impl Drop for MyTimerCtx {
|
||||
fn drop(&mut self) {
|
||||
if !self.ptr.is_null() {
|
||||
unsafe { ffi::my_timer_destroy(self.ptr); }
|
||||
let _ = ffi_call_sync(self.timeout, |cb, ud| unsafe {
|
||||
ffi::my_timer_destroy(self.ptr, cb, ud)
|
||||
});
|
||||
self.ptr = std::ptr::null_mut();
|
||||
}
|
||||
}
|
||||
|
||||
@ -14,7 +14,7 @@ extern "C" {
|
||||
pub fn my_timer_version(ctx: *mut c_void, callback: FFICallback, user_data: *mut c_void, req_cbor: *const u8, req_cbor_len: usize) -> c_int;
|
||||
pub fn my_timer_complex(ctx: *mut c_void, callback: FFICallback, user_data: *mut c_void, req_cbor: *const u8, req_cbor_len: usize) -> c_int;
|
||||
pub fn my_timer_schedule(ctx: *mut c_void, callback: FFICallback, user_data: *mut c_void, req_cbor: *const u8, req_cbor_len: usize) -> c_int;
|
||||
pub fn my_timer_destroy(ctx: *mut c_void) -> c_int;
|
||||
pub fn my_timer_destroy(ctx: *mut c_void, callback: FFICallback, user_data: *mut c_void) -> c_int;
|
||||
pub fn my_timer_add_event_listener(ctx: *mut c_void, event_name: *const c_char, callback: FFICallback, user_data: *mut c_void) -> u64;
|
||||
pub fn my_timer_remove_event_listener(ctx: *mut c_void, listener_id: u64) -> c_int;
|
||||
}
|
||||
|
||||
@ -8,7 +8,7 @@ requires "nim >= 2.2.4"
|
||||
requires "chronos"
|
||||
requires "chronicles"
|
||||
requires "taskpools"
|
||||
requires "https://github.com/logos-messaging/nim-ffi >= 0.2.0"
|
||||
requires "https://github.com/logos-messaging/nim-ffi >= 0.1.4"
|
||||
|
||||
const nimFlags = "--mm:orc -d:chronicles_log_level=WARN"
|
||||
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
# ffi.nimble
|
||||
|
||||
version = "0.2.0"
|
||||
version = "0.1.4"
|
||||
author = "Institute of Free Technology"
|
||||
description = "FFI framework with custom header generation"
|
||||
license = "MIT or Apache License 2.0"
|
||||
@ -13,8 +13,8 @@ requires "chronicles"
|
||||
requires "taskpools"
|
||||
requires "cbor_serialization == 0.3.0"
|
||||
|
||||
const nimFlagsOrc = "--mm:orc -d:chronicles_log_level=WARN"
|
||||
const nimFlagsRefc = "--mm:refc -d:chronicles_log_level=WARN"
|
||||
const nimFlagsOrc = "--mm:orc -d:chronicles_log_level=WARN -d:ffiAllowSignalHandler"
|
||||
const nimFlagsRefc = "--mm:refc -d:chronicles_log_level=WARN -d:ffiAllowSignalHandler"
|
||||
|
||||
import std/[algorithm, os, strutils]
|
||||
|
||||
|
||||
@ -339,7 +339,9 @@ proc generateCppHeader*(
|
||||
[p.procName]
|
||||
)
|
||||
of FFIKind.DTOR:
|
||||
lines.add("int $1(void* ctx);" % [p.procName])
|
||||
lines.add(
|
||||
"int $1(void* ctx, FFICallback callback, void* user_data);" % [p.procName]
|
||||
)
|
||||
# `declareLibrary` always exports the listener-registration ABI. Declare
|
||||
# it here so the typed event-handler wiring below can call into it.
|
||||
lines.add(
|
||||
@ -570,8 +572,8 @@ proc generateCppHeader*(
|
||||
lines.add(" std::chrono::milliseconds timeout_;")
|
||||
if events.len > 0:
|
||||
# One owning entry per live listener, keyed by id. Destroyed after
|
||||
# the destructor body runs `<lib>_destroy(ptr_)`, by which point the
|
||||
# FFI side has joined its threads so no callback is mid-flight.
|
||||
# the destructor blocks on `<lib>_destroy`'s recycle callback, by which
|
||||
# point the FFI side has drained/parked the slot so no callback is mid-flight.
|
||||
lines.add(
|
||||
" std::unordered_map<std::uint64_t, std::unique_ptr<ListenerBase>> listeners_;"
|
||||
)
|
||||
|
||||
@ -128,6 +128,7 @@ fn main() {
|
||||
cmd.arg("c")
|
||||
.arg("--mm:orc")
|
||||
.arg("-d:chronicles_log_level=WARN")
|
||||
.arg("-d:noSignalHandler")
|
||||
.arg("--app:lib")
|
||||
.arg("--noMain")
|
||||
.arg(format!("--nimMainPrefix:lib$2"))
|
||||
@ -206,6 +207,8 @@ proc generateFFIRs*(procs: seq[FFIProcMeta]): string =
|
||||
lines.add(" pub fn $1($2) -> *mut c_void;" % [p.procName, params.join(", ")])
|
||||
of FFIKind.DTOR:
|
||||
params.add("ctx: *mut c_void")
|
||||
params.add("callback: FFICallback")
|
||||
params.add("user_data: *mut c_void")
|
||||
lines.add(" pub fn $1($2) -> c_int;" % [p.procName, params.join(", ")])
|
||||
|
||||
# Listener-registration ABI — emitted on the Nim side by `declareLibrary`,
|
||||
@ -530,7 +533,14 @@ proc generateApiRs*(
|
||||
lines.add("impl Drop for $1 {" % [ctxTypeName])
|
||||
lines.add(" fn drop(&mut self) {")
|
||||
lines.add(" if !self.ptr.is_null() {")
|
||||
lines.add(" unsafe { ffi::$1(self.ptr); }" % [dtorProcName])
|
||||
# `<lib>_destroy` is non-blocking at the C ABI: it parks the context for
|
||||
# reuse and reports the outcome via the callback. Block until that callback
|
||||
# fires so the pool slot is fully drained and parked before this handle goes
|
||||
# away — otherwise rapid create/destroy churn could outrun the recycle and
|
||||
# exhaust the pool. The recycle outcome is best-effort on drop, so discard it.
|
||||
lines.add(" let _ = ffi_call_sync(self.timeout, |cb, ud| unsafe {")
|
||||
lines.add(" ffi::$1(self.ptr, cb, ud)" % [dtorProcName])
|
||||
lines.add(" });")
|
||||
lines.add(" self.ptr = std::ptr::null_mut();")
|
||||
lines.add(" }")
|
||||
# `listeners` is dropped automatically after this body returns. By
|
||||
|
||||
@ -58,6 +58,7 @@ add_custom_command(
|
||||
COMMAND "${NIM_EXECUTABLE}" c
|
||||
--mm:orc
|
||||
-d:chronicles_log_level=WARN
|
||||
-d:noSignalHandler
|
||||
--app:lib
|
||||
--noMain
|
||||
"--nimMainPrefix:lib{{LIB}}"
|
||||
|
||||
@ -9,7 +9,14 @@
|
||||
// context.
|
||||
~{{CTX}}() {
|
||||
if (ptr_) {
|
||||
{{LIB}}_destroy(ptr_);
|
||||
// `{{LIB}}_destroy` is non-blocking at the C ABI: it parks the
|
||||
// context for reuse and reports the outcome via the callback. Block
|
||||
// here until that callback fires so the pool slot is fully drained
|
||||
// and parked before this object goes away — otherwise a rapid
|
||||
// create/destroy churn could outrun the recycle and exhaust the pool.
|
||||
(void)ffi_call_([this](FFICallback cb, void* ud) {
|
||||
return {{LIB}}_destroy(ptr_, cb, ud);
|
||||
}, timeout_);
|
||||
ptr_ = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
@ -6,7 +6,20 @@
|
||||
|
||||
{.passc: "-fPIC".}
|
||||
|
||||
import std/[atomics, locks, options, tables]
|
||||
# Embedded in a foreign host (Go/Rust/...) the host must own OS signal handling;
|
||||
# Nim installing its own handlers clobbers it (e.g. Go's SIGSEGV -> sigpanic).
|
||||
# Enforce -d:noSignalHandler; standalone Nim binaries opt out via -d:ffiAllowSignalHandler.
|
||||
when not defined(noSignalHandler) and not defined(ffiAllowSignalHandler):
|
||||
{.
|
||||
error:
|
||||
"nim-ffi: missing required compile flag. If this library is embedded in a " &
|
||||
"host process (Go/Rust/...), build with -d:noSignalHandler so the host keeps " &
|
||||
"ownership of OS signal handlers (it needs SIGSEGV for crash recovery, stack " &
|
||||
"growth and preemption). If instead this is a standalone Nim program that owns " &
|
||||
"its own process, build with -d:ffiAllowSignalHandler."
|
||||
.}
|
||||
|
||||
import std/[atomics, locks, options, sequtils, tables]
|
||||
import chronicles, chronos, chronos/threadsync, taskpools/channels_spsc_single, results
|
||||
import
|
||||
./ffi_types,
|
||||
@ -18,6 +31,20 @@ import
|
||||
|
||||
export ffi_events, ffi_handles
|
||||
|
||||
type CtxLifecycle {.pure.} = enum
|
||||
## State machine guarding a pooled FFI context, held as an Atomic on FFIContext.
|
||||
## The threads, signals and dispatcher kqueues are created once per slot and
|
||||
## REUSED across acquire/release — chronos never frees a dispatcher's kqueue fd
|
||||
## (design decision; freed only at process exit), so spawning a thread per
|
||||
## context would leak fds unboundedly. Recycling parks the context instead.
|
||||
## Transitions:
|
||||
## Active -> RecyclePending when the destructor is invoked
|
||||
## RecyclePending -> Recycling FFI loop drains handlers, frees lib, releases slot
|
||||
## Recycling -> Active next createFFIContext reuses the slot (markAsActive)
|
||||
Active ## accepting and serving requests
|
||||
RecyclePending ## recycle requested; FFI thread loop hasn't claimed it yet
|
||||
Recycling ## FFI loop draining handlers, then frees lib + returns to pool
|
||||
|
||||
type FFIContext*[T] = object
|
||||
myLib*: ptr T # main library object (Waku, LibP2P, SDS, …)
|
||||
ffiThread: Thread[(ptr FFIContext[T])]
|
||||
@ -39,6 +66,15 @@ type FFIContext*[T] = object
|
||||
# advanced each FFI-thread loop; event thread reads for liveness
|
||||
eventQueueStuck*: Atomic[bool] # sticky overflow flag
|
||||
running: Atomic[bool] # To control when the threads are running
|
||||
lifecycle: Atomic[CtxLifecycle] # Active / RecyclePending / Recycling
|
||||
recycleCallback: FFICallBack
|
||||
# destructor's callback, fired by the recycle handler with the outcome:
|
||||
# RET_OK once drained, RET_ERR if it timed out. Set by requestRecycle.
|
||||
recycleUserData: pointer
|
||||
inUse: Atomic[bool]
|
||||
# whether the slot is claimed; createFFIContext claims it, the recycle
|
||||
# handler clears it once drained so the owning thread can release without
|
||||
# reaching into the pool.
|
||||
registeredRequests: ptr Table[cstring, FFIRequestProc]
|
||||
|
||||
var onFFIThread* {.threadvar.}: bool
|
||||
@ -51,6 +87,21 @@ const
|
||||
FFIHeartbeatStartDelay* = 10.seconds # grace window for library startup
|
||||
FFIHeartbeatStaleThreshold* = 1.seconds
|
||||
|
||||
proc tryClaim*[T](ctx: ptr FFIContext[T]): bool =
|
||||
## Returns true if the slot was free and is now claimed, false if already in use.
|
||||
var expected = false
|
||||
ctx.inUse.compareExchange(expected, true)
|
||||
|
||||
proc release*[T](ctx: ptr FFIContext[T]) =
|
||||
ctx.inUse.store(false)
|
||||
|
||||
proc isInUse*[T](ctx: ptr FFIContext[T]): bool =
|
||||
ctx.inUse.load()
|
||||
|
||||
proc markAsActive*[T](ctx: ptr FFIContext[T]) =
|
||||
## Re-arms a reused (recycled) slot to accept requests again.
|
||||
ctx.lifecycle.store(CtxLifecycle.Active)
|
||||
|
||||
include ./event_thread
|
||||
include ./ffi_thread
|
||||
|
||||
@ -124,6 +175,7 @@ proc initContextResources*[T](ctx: ptr FFIContext[T]): Result[void, string] =
|
||||
|
||||
ctx.registeredRequests = addr ffi_types.registeredRequests
|
||||
|
||||
ctx.lifecycle.store(CtxLifecycle.Active)
|
||||
ctx.running.store(true)
|
||||
|
||||
try:
|
||||
@ -153,14 +205,21 @@ proc fireOrErr(sig: ThreadSignalPtr, name: string): Result[void, string] =
|
||||
return err("failed to signal: " & name & " on time")
|
||||
ok()
|
||||
|
||||
proc waitExitOrErr(
|
||||
sig: ThreadSignalPtr, name: string, timeout: Duration
|
||||
): Result[void, string] =
|
||||
let exited = sig.waitSync(timeout).valueOr:
|
||||
return err("error waiting for exit: " & name & ": " & $error)
|
||||
if not exited:
|
||||
return err("did not exit in time: " & name & " (leaking ctx to avoid hang)")
|
||||
ok()
|
||||
proc reachedExitOrTimedOut(sig: ThreadSignalPtr, timeout: Duration): bool =
|
||||
## Best-effort bounded pre-check before joining a stopping thread.
|
||||
## Returns false ONLY on a genuine timeout (the exit signal was not observed
|
||||
## within `timeout`, so the thread may be wedged and the caller should skip
|
||||
## the join to avoid hanging). Returns true otherwise — including when
|
||||
## `waitSync` itself errors: it uses `select()`, which returns EINVAL once a
|
||||
## signal fd exceeds FD_SETSIZE under load. That error is NOT evidence the
|
||||
## thread is stuck (it was already signaled to stop and the async event loop
|
||||
## that drives its exit is unaffected), so we proceed to the authoritative,
|
||||
## fd-free joinThread rather than spuriously failing teardown and leaking the
|
||||
## pool slot.
|
||||
let waited = sig.waitSync(timeout)
|
||||
if waited.isOk() and not waited.get():
|
||||
return false # genuine timeout
|
||||
true
|
||||
|
||||
proc signalStop*[T](ctx: ptr FFIContext[T]): Result[void, string] =
|
||||
# Skip onNotResponding on error: it takes reg.lock, which a back-pressuring
|
||||
@ -183,9 +242,11 @@ proc stopAndJoinThreads*[T](ctx: ptr FFIContext[T]): Result[void, string] =
|
||||
ctx.signalStop().isOkOr:
|
||||
return err("signalStop failed: " & $error)
|
||||
|
||||
?ctx.threadExitSignal.waitExitOrErr("FFI thread", ThreadExitTimeout)
|
||||
if not ctx.threadExitSignal.reachedExitOrTimedOut(ThreadExitTimeout):
|
||||
return err("FFI thread did not exit in time (leaking ctx to avoid hang)")
|
||||
joinThread(ctx.ffiThread)
|
||||
?ctx.eventThreadExitSignal.waitExitOrErr("event thread", ThreadExitTimeout)
|
||||
if not ctx.eventThreadExitSignal.reachedExitOrTimedOut(ThreadExitTimeout):
|
||||
return err("event thread did not exit in time (leaking ctx to avoid hang)")
|
||||
joinThread(ctx.eventThread)
|
||||
ok()
|
||||
|
||||
@ -196,3 +257,26 @@ proc clearContext[T](ctx: ptr FFIContext[T]): Result[void, string] =
|
||||
ctx.cleanUpResources().isOkOr:
|
||||
return err("cleanUpResources failed: " & $error)
|
||||
ok()
|
||||
|
||||
proc requestRecycle*[T](
|
||||
ctx: ptr FFIContext[T], callback: FFICallBack, userData: pointer
|
||||
): Result[void, string] =
|
||||
## Starts the context's recycle WITHOUT stopping its worker threads, so the
|
||||
## next createFFIContext reuses the same threads, signals and kqueue fds.
|
||||
## The FFI thread loop drains the in-flight handlers, frees the lib, clears the
|
||||
## per-context state and releases the slot, then fires `callback`
|
||||
## (RET_OK drained, RET_ERR stuck). Non-blocking.
|
||||
ctx.lock.acquire()
|
||||
if ctx.lifecycle.load() != CtxLifecycle.Active:
|
||||
ctx.lock.release()
|
||||
return err("requestRecycle: context is not Active (already recycling)")
|
||||
ctx.recycleCallback = callback
|
||||
ctx.recycleUserData = userData
|
||||
ctx.lifecycle.store(CtxLifecycle.RecyclePending)
|
||||
ctx.lock.release()
|
||||
|
||||
let fired = ctx.reqSignal.fireSync().valueOr:
|
||||
return err("requestRecycle: failed to signal the FFI thread: " & $error)
|
||||
if not fired:
|
||||
return err("requestRecycle: failed to signal the FFI thread in time")
|
||||
ok()
|
||||
|
||||
@ -1,56 +1,76 @@
|
||||
import std/atomics
|
||||
import results
|
||||
import ./ffi_context
|
||||
import ./ffi_context, ./ffi_types
|
||||
|
||||
const MaxFFIContexts* = 32
|
||||
# Only affects upfront pool memory; fds/threads consumed per acquired slot.
|
||||
## Maximum number of concurrently live FFI contexts when using FFIContextPool.
|
||||
## Each slot's threads, signals and dispatcher kqueue fds are created once and
|
||||
## reused, so this also caps the process's steady-state fd usage.
|
||||
|
||||
type FFIContextPool*[T] = object
|
||||
## Fixed pool. Bounds ThreadSignalPtr fds at MaxFFIContexts * 2.
|
||||
slots: array[MaxFFIContexts, FFIContext[T]]
|
||||
inUse: array[MaxFFIContexts, Atomic[bool]]
|
||||
|
||||
proc acquireSlot[T](pool: var FFIContextPool[T]): Result[ptr FFIContext[T], string] =
|
||||
for i in 0 ..< MaxFFIContexts:
|
||||
var expected = false
|
||||
if pool.inUse[i].compareExchange(expected, true):
|
||||
return ok(pool.slots[i].addr)
|
||||
err("FFI context pool exhausted (max " & $MaxFFIContexts & " contexts)")
|
||||
|
||||
proc releaseSlot[T](pool: var FFIContextPool[T], ctx: ptr FFIContext[T]) =
|
||||
for i in 0 ..< MaxFFIContexts:
|
||||
if pool.slots[i].addr == ctx:
|
||||
pool.inUse[i].store(false)
|
||||
return
|
||||
contexts: array[MaxFFIContexts, FFIContext[T]]
|
||||
initialized: array[MaxFFIContexts, Atomic[bool]]
|
||||
## Whether the slot's worker has been built. Once true it stays true for the
|
||||
## process lifetime — destroy recycles the context rather than tearing it down.
|
||||
|
||||
proc createFFIContext*[T](
|
||||
pool: var FFIContextPool[T]
|
||||
): Result[ptr FFIContext[T], string] =
|
||||
let ctx = pool.acquireSlot().valueOr:
|
||||
return err("createFFIContext: acquireSlot failed: " & $error)
|
||||
initContextResources(ctx).isOkOr:
|
||||
pool.releaseSlot(ctx)
|
||||
return err("createFFIContext: initContextResources failed: " & $error)
|
||||
ok(ctx)
|
||||
## Acquires a context from the fixed pool. The worker (threads + signals +
|
||||
## dispatcher kqueues) is built once on first use and REUSED on every later
|
||||
## acquisition — chronos never frees a dispatcher's kqueue fd, so a
|
||||
## thread-per-context model would leak fds unboundedly.
|
||||
for i in 0 ..< MaxFFIContexts:
|
||||
let ctx = pool.contexts[i].addr
|
||||
if not ctx.tryClaim():
|
||||
continue
|
||||
if pool.initialized[i].load():
|
||||
## Reused slot: a prior destroy drained and parked it; worker still alive.
|
||||
ctx.markAsActive()
|
||||
return ok(ctx)
|
||||
initContextResources(ctx).isOkOr:
|
||||
ctx.release()
|
||||
return err("createFFIContext: initContextResources failed: " & $error)
|
||||
pool.initialized[i].store(true)
|
||||
return ok(ctx)
|
||||
return err("FFI context pool exhausted (max " & $MaxFFIContexts & " contexts)")
|
||||
|
||||
proc releaseFFIContext*[T](
|
||||
ctx: ptr FFIContext[T], callback: FFICallBack, userData: pointer
|
||||
): Result[void, string] =
|
||||
## Recycle/park the context for reuse without stopping its worker threads.
|
||||
## `callback` fires once the FFI thread has drained and parked it.
|
||||
ctx.requestRecycle(callback, userData)
|
||||
|
||||
proc destroyFFIContext*[T](
|
||||
pool: var FFIContextPool[T], ctx: ptr FFIContext[T]
|
||||
): Result[void, string] =
|
||||
## On thread-exit timeout the slot is leaked — closing live-thread resources is unsafe.
|
||||
## Full teardown: stops/joins the worker threads and returns the context to the
|
||||
## pool, marking it uninitialised so a later createFFIContext rebuilds it. Only
|
||||
## for process/pool shutdown — normal destruction uses releaseFFIContext.
|
||||
ctx.stopAndJoinThreads().isOkOr:
|
||||
return err("destroyFFIContext(pool): " & $error)
|
||||
# Required: next acquisition would otherwise re-init a live lock (UB).
|
||||
# Close the ThreadSignalPtr fds and deinit the lock + event registry/queue
|
||||
# BEFORE the slot is marked for rebuild. createFFIContext's rebuild path reruns
|
||||
# initContextResources (initLock / initEventRegistry / initEventQueue + fresh
|
||||
# signals); skipping deinit here would re-init still-live locks (UB) and orphan
|
||||
# the old fds — the very leak this teardown exists to prevent.
|
||||
let deinitRes = ctx.deinitContextResources()
|
||||
pool.releaseSlot(ctx)
|
||||
for i in 0 ..< MaxFFIContexts:
|
||||
if pool.contexts[i].addr == ctx:
|
||||
pool.initialized[i].store(false)
|
||||
break
|
||||
ctx.release()
|
||||
deinitRes.isOkOr:
|
||||
return err("destroyFFIContext(pool): " & $error)
|
||||
ok()
|
||||
return ok()
|
||||
|
||||
proc isValidCtx*[T](pool: var FFIContextPool[T], ctx: pointer): bool =
|
||||
## Rejects nil / offset-invalid / dangling pointers at the API boundary.
|
||||
## True only if ctx points to one of the pool's contexts that is currently
|
||||
## claimed (in use). Rejects nil / dangling / parked contexts at the API boundary.
|
||||
if ctx.isNil():
|
||||
return false
|
||||
for i in 0 ..< MaxFFIContexts:
|
||||
if cast[pointer](pool.slots[i].addr) == ctx:
|
||||
return pool.inUse[i].load()
|
||||
false
|
||||
if cast[pointer](pool.contexts[i].addr) == ctx:
|
||||
return cast[ptr FFIContext[T]](ctx).isInUse()
|
||||
return false
|
||||
|
||||
@ -28,6 +28,13 @@ proc sendRequestToFFIThread*(
|
||||
defer:
|
||||
ctx.lock.release()
|
||||
|
||||
# Reject once recycling has been requested: the slot is being drained/parked
|
||||
# and its handler table/lib are about to be torn down. requestRecycle flips
|
||||
# this under the same lock, so the check is race-free.
|
||||
if ctx.lifecycle.load() != CtxLifecycle.Active:
|
||||
deleteRequest(ffiRequest)
|
||||
return err("FFI context is not accepting requests (being recycled)")
|
||||
|
||||
let sentOk = ctx.reqChannel.trySend(ffiRequest)
|
||||
if not sentOk:
|
||||
deleteRequest(ffiRequest)
|
||||
@ -42,10 +49,15 @@ proc sendRequestToFFIThread*(
|
||||
deleteRequest(ffiRequest)
|
||||
return err("Couldn't fireSync in time")
|
||||
|
||||
# The request is now owned by the FFI thread: it has been queued and the FFI
|
||||
# thread signaled, so it WILL be received, processed, and answered through the
|
||||
# callback via handleRes. The waitSync below only confirms prompt receipt; its
|
||||
# failure (e.g. EINVAL/EINTR under load or signal teardown) must NOT be
|
||||
# surfaced as an error, or the caller would fire the callback a SECOND time
|
||||
# while handleRes also fires it — a double callback onto a freed response.
|
||||
let res = ctx.reqReceivedSignal.waitSync(timeout)
|
||||
if res.isErr():
|
||||
# FFI thread was signaled and owns the request; don't double-free.
|
||||
return err("Couldn't receive reqReceivedSignal signal")
|
||||
return ok()
|
||||
|
||||
# On ok the FFI thread's processRequest deallocShared(req)'s.
|
||||
ok()
|
||||
@ -79,6 +91,75 @@ proc processRequest[T](
|
||||
except Exception as e:
|
||||
error "Unexpected exception in handleRes", error = e.msg
|
||||
|
||||
proc freeLib[T](ctx: ptr FFIContext[T]) {.gcsafe.} =
|
||||
## Frees the createShared'd library object built by the ctor. Logical resource
|
||||
## cleanup is the destructor body's job; this only releases the Nim storage.
|
||||
if ctx.myLib.isNil():
|
||||
return
|
||||
when not defined(gcRefc):
|
||||
{.cast(gcsafe).}:
|
||||
`=destroy`(ctx.myLib[])
|
||||
else:
|
||||
discard
|
||||
freeShared(ctx.myLib)
|
||||
ctx.myLib = nil
|
||||
|
||||
var RecycleTimeout* = 1500.milliseconds
|
||||
## Upper bound the recycle handler waits for in-flight handlers before it
|
||||
## cancels them and reports the ctx as stuck. Returns as soon as they finish,
|
||||
## so this only bounds a *stuck* handler. A `var` so tests can shorten it.
|
||||
|
||||
proc recycleContext[T](
|
||||
ctx: ptr FFIContext[T], ongoingProcessReq: ptr seq[Future[void]]
|
||||
) {.async.} =
|
||||
## Runs on the FFI thread. Drains in-flight handlers, frees the lib, clears the
|
||||
## per-context event state, releases the slot for reuse, and fires the recycle
|
||||
## callback. The worker threads (and their kqueue fds) stay alive for reuse.
|
||||
ongoingProcessReq[].keepItIf(not it.finished())
|
||||
|
||||
## 1. Let in-flight handlers finish on their own, bounded by RecycleTimeout.
|
||||
var naturallyDrained = ongoingProcessReq[].len == 0
|
||||
if not naturallyDrained:
|
||||
naturallyDrained = await allFutures(ongoingProcessReq[]).withTimeout(RecycleTimeout)
|
||||
|
||||
## 2. If any are wedged, cancel them and give the cancellations a bounded moment.
|
||||
var safeToRecycle = naturallyDrained
|
||||
if not naturallyDrained:
|
||||
for fut in ongoingProcessReq[]:
|
||||
if not fut.finished():
|
||||
fut.cancelSoon()
|
||||
safeToRecycle = await allFutures(ongoingProcessReq[]).withTimeout(RecycleTimeout)
|
||||
|
||||
let cb = ctx.recycleCallback
|
||||
let ud = ctx.recycleUserData
|
||||
ctx.recycleCallback = nil
|
||||
|
||||
if safeToRecycle:
|
||||
freeLib(ctx)
|
||||
# Reset per-context event state so a reused slot starts clean: drop listeners
|
||||
# and drain/free any queued events so they aren't delivered to the next owner.
|
||||
removeAllEventListeners(ctx[].eventRegistry)
|
||||
while true:
|
||||
let opt = ctx.eventQueue.tryDequeueEvent()
|
||||
if opt.isNone():
|
||||
break
|
||||
let qe = opt.get()
|
||||
freeEventBuffers(qe.name, qe.data)
|
||||
ctx.eventQueueStuck.store(false)
|
||||
ongoingProcessReq[].setLen(0)
|
||||
ctx.release()
|
||||
|
||||
if not cb.isNil():
|
||||
foreignThreadGc:
|
||||
let msg =
|
||||
if naturallyDrained:
|
||||
""
|
||||
else:
|
||||
"recycle: in-flight requests did not finish in time"
|
||||
let cmsg = msg.cstring
|
||||
let retCode = if naturallyDrained: RET_OK else: RET_ERR
|
||||
cb(retCode, unsafeAddr cmsg[0], cast[csize_t](msg.len), ud)
|
||||
|
||||
var ffiEventQueueSignalPtr {.threadvar.}: ThreadSignalPtr
|
||||
# Stashed so the hook has no closure env.
|
||||
|
||||
@ -123,6 +204,14 @@ proc ffiThreadBody[T](ctx: ptr FFIContext[T]) {.thread.} =
|
||||
pending.del(i)
|
||||
|
||||
while ctx.running.load():
|
||||
# A destructor requested recycle: drain + free + park this slot for reuse,
|
||||
# keeping the threads (and their kqueue fds) alive. Claim it atomically so
|
||||
# only this loop runs the recycle.
|
||||
var expectedRecycle = CtxLifecycle.RecyclePending
|
||||
if ctx.lifecycle.compareExchange(expectedRecycle, CtxLifecycle.Recycling):
|
||||
await recycleContext(ctx, addr pending)
|
||||
continue
|
||||
|
||||
# Freezes if a sync handler blocks the dispatcher; event thread reads to detect wedged FFI thread.
|
||||
discard ctx.ffiHeartbeat.fetchAdd(1)
|
||||
|
||||
|
||||
@ -147,6 +147,12 @@ macro declareLibrary*(libraryName: static[string], libType: untyped): untyped =
|
||||
let addName = libraryName & "_add_event_listener"
|
||||
let addErr = "error: invalid context in " & addName
|
||||
let addBody = quote:
|
||||
# Sets up the calling (foreign) thread's GC before any Nim allocation
|
||||
# below ($eventName / registry table+seq ops). Request entry procs do the
|
||||
# same; without it a foreign thread with no Nim heap segfaults in the
|
||||
# allocator under GC pressure.
|
||||
when declared(initializeLibrary):
|
||||
initializeLibrary()
|
||||
var ret: uint64 = 0
|
||||
if isNil(ctx):
|
||||
echo `addErr`
|
||||
@ -181,6 +187,10 @@ macro declareLibrary*(libraryName: static[string], libType: untyped): untyped =
|
||||
let removeName = libraryName & "_remove_event_listener"
|
||||
let removeErr = "error: invalid context in " & removeName
|
||||
let removeBody = quote:
|
||||
# See add_event_listener: removeEventListener mutates the registry's
|
||||
# GC-allocated table/seqs, so the calling foreign thread needs a GC.
|
||||
when declared(initializeLibrary):
|
||||
initializeLibrary()
|
||||
var ret: cint = 1
|
||||
if isNil(ctx):
|
||||
echo `removeErr`
|
||||
|
||||
@ -835,6 +835,14 @@ macro ffi*(prc: untyped): untyped =
|
||||
|
||||
let ffiBody = newStmtList()
|
||||
|
||||
# Set up the calling (foreign) thread's GC before any Nim allocation in
|
||||
# this wrapper (e.g. `$reqTypeName` below). Without it a foreign thread
|
||||
# with no Nim heap segfaults in the allocator under GC pressure. `ffiRaw`
|
||||
# already does this; the `.ffi.` path must too.
|
||||
ffiBody.add quote do:
|
||||
when declared(initializeLibrary):
|
||||
initializeLibrary()
|
||||
|
||||
ffiBody.add quote do:
|
||||
if callback.isNil:
|
||||
return RET_MISSING_CALLBACK
|
||||
@ -1336,23 +1344,25 @@ macro ffiCtor*(prc: untyped): untyped =
|
||||
return stmts
|
||||
|
||||
macro ffiDtor*(prc: untyped): untyped =
|
||||
## Defines a C-exported destructor that tears down the FFIContext after the
|
||||
## body runs.
|
||||
## Defines a C-exported destructor that RECYCLES the FFIContext after the
|
||||
## body runs — it parks the context for reuse instead of tearing down its
|
||||
## worker threads, because chronos never frees a dispatcher's kqueue fd, so a
|
||||
## thread-per-context teardown would leak fds unboundedly.
|
||||
##
|
||||
## The annotated proc must have exactly one parameter of the library type.
|
||||
## The body contains any library-level cleanup to run before context teardown.
|
||||
## The body contains any library-level cleanup to run before recycle.
|
||||
##
|
||||
## Example:
|
||||
## proc waku_destroy*(w: Waku) {.ffiDtor.} =
|
||||
## w.cleanup()
|
||||
##
|
||||
## The generated C-exported proc has the signature:
|
||||
## int waku_destroy(void* ctx)
|
||||
## int waku_destroy(void* ctx, FfiCallback callback, void* userData)
|
||||
##
|
||||
## It extracts the library value from ctx, runs the body, then calls
|
||||
## destroyFFIContext to tear down the FFI thread and free the context.
|
||||
## Returns RET_OK on success, RET_ERR on failure (null/invalid ctx, or
|
||||
## destroyFFIContext failure).
|
||||
## NON-BLOCKING: it runs the body, requests recycle, and returns RET_OK once
|
||||
## accepted; the real outcome (RET_OK drained / RET_ERR stuck) arrives via
|
||||
## `callback`. Returns RET_ERR synchronously only for a null/invalid ctx or a
|
||||
## rejected recycle request.
|
||||
|
||||
let procName = prc[0]
|
||||
let formalParams = prc[3]
|
||||
@ -1379,7 +1389,7 @@ macro ffiDtor*(prc: untyped): untyped =
|
||||
if procName.kind == nnkPostfix:
|
||||
cExportProcName = procName[1]
|
||||
|
||||
let destroyResIdent = genSym(nskLet, "destroyRes")
|
||||
let releaseResIdent = genSym(nskLet, "releaseRes")
|
||||
|
||||
let ffiBody = newStmtList()
|
||||
|
||||
@ -1389,6 +1399,9 @@ macro ffiDtor*(prc: untyped): untyped =
|
||||
|
||||
ffiBody.add quote do:
|
||||
if ctx.isNil or cast[ptr FFIContext[`libTypeName`]](ctx)[].myLib.isNil:
|
||||
if not callback.isNil:
|
||||
let errStr = "destroy: invalid context"
|
||||
callback(RET_ERR, unsafeAddr errStr[0], cast[csize_t](errStr.len), userData)
|
||||
return RET_ERR
|
||||
|
||||
ffiBody.add quote do:
|
||||
@ -1402,19 +1415,28 @@ macro ffiDtor*(prc: untyped): untyped =
|
||||
if not isNoop:
|
||||
ffiBody.add(bodyNode)
|
||||
|
||||
let poolIdent = ident($libTypeName & "FFIPool")
|
||||
ffiBody.add quote do:
|
||||
let `destroyResIdent` =
|
||||
`poolIdent`.destroyFFIContext(cast[ptr FFIContext[`libTypeName`]](ctx))
|
||||
if `destroyResIdent`.isErr():
|
||||
# Recycle (park) the context for reuse instead of tearing down its threads.
|
||||
# The callback fires from the recycle handler once drained.
|
||||
let `releaseResIdent` =
|
||||
releaseFFIContext(cast[ptr FFIContext[`libTypeName`]](ctx), callback, userData)
|
||||
if `releaseResIdent`.isErr():
|
||||
if not callback.isNil:
|
||||
let errStr = "destroy: " & `releaseResIdent`.error
|
||||
callback(RET_ERR, unsafeAddr errStr[0], cast[csize_t](errStr.len), userData)
|
||||
return RET_ERR
|
||||
|
||||
ffiBody.add quote do:
|
||||
return RET_OK
|
||||
|
||||
let poolIdent = ident($libTypeName & "FFIPool")
|
||||
|
||||
let ffiProc = newProc(
|
||||
name = postfix(cExportProcName, "*"),
|
||||
params = @[ident("cint"), newIdentDefs(ident("ctx"), ident("pointer"))],
|
||||
params = @[
|
||||
ident("cint"),
|
||||
newIdentDefs(ident("ctx"), ident("pointer")),
|
||||
newIdentDefs(ident("callback"), ident("FFICallBack")),
|
||||
newIdentDefs(ident("userData"), ident("pointer")),
|
||||
],
|
||||
body = ffiBody,
|
||||
pragmas = newTree(
|
||||
nnkPragma,
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user