mirror of
https://github.com/logos-messaging/nim-ffi.git
synced 2026-06-22 09:20:13 +00:00
fix(pool): recycle contexts to stop per-cycle fd leak (#74)
This commit is contained in:
parent
c2ea4f3f98
commit
24cf796fcd
4
.github/workflows/ci.yml
vendored
4
.github/workflows/ci.yml
vendored
@ -2,9 +2,9 @@ name: CI
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [master, main]
|
||||
branches: [master, main, 'release/*']
|
||||
pull_request:
|
||||
branches: [master, main]
|
||||
branches: [master, main, 'release/*']
|
||||
|
||||
jobs:
|
||||
# Single source of truth for Nim / Nimble versions used by every job and
|
||||
|
||||
@ -6,11 +6,25 @@
|
||||
|
||||
{.passc: "-fPIC".}
|
||||
|
||||
import std/[atomics, locks, options, tables]
|
||||
import std/[atomics, locks, options, tables, sequtils]
|
||||
import chronicles, chronos, chronos/threadsync, taskpools/channels_spsc_single, results
|
||||
import ./ffi_types, ./ffi_events, ./ffi_thread_request, ./logging, ./cbor_serial
|
||||
import ./ffi_types, ./ffi_thread_request, ./internal/ffi_macro, ./logging
|
||||
|
||||
export ffi_events
|
||||
type FFICallbackState* = object
|
||||
## Holds the C event callback and its associated user-data pointer.
|
||||
## Embedded in FFIContext and referenced from the FFI thread via a thread-local.
|
||||
callback*: pointer
|
||||
userData*: pointer
|
||||
|
||||
type CtxLifecycle {.pure.} = enum
|
||||
## State machine guarding a pooled FFI context, held as an Atomic on FFIContext.
|
||||
## Transitions:
|
||||
## Active -> RecyclePending when ffiDtor is invoked
|
||||
## RecyclePending -> Recycling The process completed the in-flight processes and is ready for lib cleanup and release
|
||||
## Recycling -> Active When the FFI thread is ready again to attend to requests
|
||||
Active ## accepting and serving requests
|
||||
RecyclePending ## recycle requested; FFI thread loop hasn't claimed it yet
|
||||
Recycling ## FFI loop draining handlers, then frees lib + returns to pool
|
||||
|
||||
type FFIContext*[T] = object
|
||||
myLib*: ptr T # main library object (Waku, LibP2P, SDS, …)
|
||||
@ -32,6 +46,15 @@ type FFIContext*[T] = object
|
||||
# advanced each FFI-thread loop; event thread reads for liveness
|
||||
eventQueueStuck*: Atomic[bool] # sticky overflow flag
|
||||
running: Atomic[bool] # To control when the threads are running
|
||||
lifecycle: Atomic[CtxLifecycle]
|
||||
recycleCallback: FFICallBack
|
||||
# The destructor's callback, fired by the recycle handler with the outcome:
|
||||
# RET_OK once drained, RET_ERR if it timed out. Set by requestRecycle.
|
||||
recycleUserData: pointer
|
||||
inUse: Atomic[bool]
|
||||
# Whether the context is claimed. createFFIContext claims it (false -> true); the
|
||||
# recycle handler clears it once drained. On the context so the owning thread can
|
||||
# release it without reaching into the pool.
|
||||
registeredRequests: ptr Table[cstring, FFIRequestProc]
|
||||
|
||||
var onFFIThread* {.threadvar.}: bool
|
||||
@ -55,6 +78,285 @@ template closeAndNil(field: untyped) =
|
||||
proc deinitContextResources*[T](ctx: ptr FFIContext[T]): Result[void, string] =
|
||||
## Mirror of `initContextResources`. Threads MUST be joined first;
|
||||
## fields are nil'd after close so re-init on the same slot is safe.
|
||||
template callEventCallback*(ctx: ptr FFIContext, eventName: string, body: untyped) =
|
||||
if isNil(ctx[].callbackState.callback):
|
||||
chronicles.error eventName & " - eventCallback is nil"
|
||||
return
|
||||
|
||||
foreignThreadGc:
|
||||
try:
|
||||
let event = body
|
||||
cast[FFICallBack](ctx[].callbackState.callback)(
|
||||
RET_OK,
|
||||
unsafeAddr event[0],
|
||||
cast[csize_t](len(event)),
|
||||
ctx[].callbackState.userData,
|
||||
)
|
||||
except Exception, CatchableError:
|
||||
let msg =
|
||||
"Exception " & eventName & " when calling 'eventCallBack': " &
|
||||
getCurrentExceptionMsg()
|
||||
cast[FFICallBack](ctx[].callbackState.callback)(
|
||||
RET_ERR,
|
||||
unsafeAddr msg[0],
|
||||
cast[csize_t](len(msg)),
|
||||
ctx[].callbackState.userData,
|
||||
)
|
||||
|
||||
template dispatchFfiEvent*(eventName: string, body: untyped) =
|
||||
## Dispatches an FFI event to the callback registered via `{libName}_set_event_callback`.
|
||||
## `body` is evaluated lazily — only when a callback is registered.
|
||||
## Valid only on the FFI thread (i.e., inside {.ffi.} proc bodies and their async closures).
|
||||
let ffiState = ffiCurrentCallbackState
|
||||
if isNil(ffiState) or isNil(ffiState[].callback):
|
||||
chronicles.error eventName & " - event callback not set"
|
||||
return
|
||||
foreignThreadGc:
|
||||
try:
|
||||
let event = body
|
||||
cast[FFICallBack](ffiState[].callback)(
|
||||
RET_OK, unsafeAddr event[0], cast[csize_t](len(event)), ffiState[].userData
|
||||
)
|
||||
except Exception, CatchableError:
|
||||
let msg = "Exception dispatching " & eventName & ": " & getCurrentExceptionMsg()
|
||||
cast[FFICallBack](ffiState[].callback)(
|
||||
RET_ERR, unsafeAddr msg[0], cast[csize_t](len(msg)), ffiState[].userData
|
||||
)
|
||||
|
||||
proc sendRequestToFFIThread*(
|
||||
ctx: ptr FFIContext, ffiRequest: ptr FFIThreadRequest, timeout = InfiniteDuration
|
||||
): Result[void, string] =
|
||||
ctx.lock.acquire()
|
||||
defer:
|
||||
ctx.lock.release()
|
||||
|
||||
if ctx.lifecycle.load() != CtxLifecycle.Active:
|
||||
deleteRequest(ffiRequest)
|
||||
return err("FFI context is not accepting requests (being recycled)")
|
||||
|
||||
## Sending the request to the FFI thread
|
||||
let sentOk = ctx.reqChannel.trySend(ffiRequest)
|
||||
if not sentOk:
|
||||
deleteRequest(ffiRequest)
|
||||
return err("Couldn't send a request to the ffi thread")
|
||||
|
||||
let fireSyncRes = ctx.reqSignal.fireSync()
|
||||
if fireSyncRes.isErr():
|
||||
deleteRequest(ffiRequest)
|
||||
return err("failed fireSync: " & $fireSyncRes.error)
|
||||
|
||||
if fireSyncRes.get() == false:
|
||||
deleteRequest(ffiRequest)
|
||||
return err("Couldn't fireSync in time")
|
||||
|
||||
## wait until the FFI working thread properly received the request
|
||||
let res = ctx.reqReceivedSignal.waitSync(timeout)
|
||||
if res.isErr():
|
||||
return err("Couldn't receive reqReceivedSignal signal")
|
||||
|
||||
return ok()
|
||||
|
||||
type Foo = object
|
||||
registerReqFFI(WatchdogReq, foo: ptr Foo):
|
||||
proc(): Future[Result[string, string]] {.async.} =
|
||||
return ok("FFI thread is not blocked")
|
||||
|
||||
type JsonNotRespondingEvent = object
|
||||
eventType: string
|
||||
|
||||
proc init(T: type JsonNotRespondingEvent): T =
|
||||
return JsonNotRespondingEvent(eventType: "not_responding")
|
||||
|
||||
proc `$`(event: JsonNotRespondingEvent): string =
|
||||
$(%*event)
|
||||
|
||||
proc onNotResponding*(ctx: ptr FFIContext) =
|
||||
callEventCallback(ctx, "onNotResponding"):
|
||||
$JsonNotRespondingEvent.init()
|
||||
|
||||
proc watchdogThreadBody(ctx: ptr FFIContext) {.thread.} =
|
||||
## Watchdog thread that monitors the FFI thread and notifies the library user if it hangs.
|
||||
## This thread never blocks.
|
||||
|
||||
let watchdogRun = proc(ctx: ptr FFIContext) {.async.} =
|
||||
const WatchdogStartDelay = 10.seconds
|
||||
const WatchdogTimeinterval = 1.seconds
|
||||
const WatchdogTimeout = 20.seconds
|
||||
|
||||
# Give time for the node to be created and up before sending watchdog requests
|
||||
let initialStop = await ctx.stopSignal.wait().withTimeout(WatchdogStartDelay)
|
||||
if initialStop or ctx.running.load == false:
|
||||
return
|
||||
|
||||
while true:
|
||||
let intervalStop = await ctx.stopSignal.wait().withTimeout(WatchdogTimeinterval)
|
||||
|
||||
if intervalStop or ctx.running.load == false:
|
||||
debug "Watchdog thread exiting because FFIContext is not running"
|
||||
break
|
||||
|
||||
if ctx.lifecycle.load() != CtxLifecycle.Active:
|
||||
continue
|
||||
|
||||
let callback = proc(
|
||||
callerRet: cint, msg: ptr cchar, len: csize_t, userData: pointer
|
||||
) {.cdecl, gcsafe, raises: [].} =
|
||||
discard ## Don't do anything. Just respecting the callback signature.
|
||||
const nilUserData = nil
|
||||
|
||||
trace "Sending watchdog request to FFI thread"
|
||||
|
||||
try:
|
||||
sendRequestToFFIThread(
|
||||
ctx, WatchdogReq.ffiNewReq(callback, nilUserData), WatchdogTimeout
|
||||
).isOkOr:
|
||||
error "Failed to send watchdog request to FFI thread", error = $error
|
||||
onNotResponding(ctx)
|
||||
except Exception as exc:
|
||||
error "Exception sending watchdog request", exc = exc.msg
|
||||
onNotResponding(ctx)
|
||||
|
||||
waitFor watchdogRun(ctx)
|
||||
|
||||
proc processRequest[T](
|
||||
request: ptr FFIThreadRequest, ctx: ptr FFIContext[T]
|
||||
) {.async.} =
|
||||
## Invoked within the FFI thread to process a request coming from the FFI API consumer thread.
|
||||
|
||||
let reqId = $request[].reqId
|
||||
## The reqId determines which proc will handle the request.
|
||||
## The registeredRequests represents a table defined at compile time.
|
||||
## Then, registeredRequests == Table[reqId, proc-handling-the-request-asynchronously]
|
||||
|
||||
let reqIdCs = reqId.cstring
|
||||
# keep `reqId` alive and avoid the implicit string→cstring warning.
|
||||
|
||||
let retFut =
|
||||
if not ctx[].registeredRequests[].contains(reqIdCs):
|
||||
## That shouldn't happen because only registered requests should be sent to the FFI thread.
|
||||
nilProcess(request[].reqId)
|
||||
else:
|
||||
ctx[].registeredRequests[][reqIdCs](request[].reqContent, ctx)
|
||||
|
||||
let res =
|
||||
try:
|
||||
await retFut
|
||||
except CancelledError as exc:
|
||||
Result[string, string].err("Request cancelled during destroy: " & exc.msg)
|
||||
except AsyncError as exc:
|
||||
Result[string, string].err(
|
||||
"Async error in processRequest for " & reqId & ": " & exc.msg
|
||||
)
|
||||
|
||||
## handleRes may raise (OOM, GC setup) even though it is rare.
|
||||
try:
|
||||
handleRes(res, request)
|
||||
except Exception as exc:
|
||||
error "Unexpected exception in handleRes", error = exc.msg
|
||||
|
||||
proc freeLib[T](ctx: ptr FFIContext[T]) {.gcsafe.} =
|
||||
if ctx.myLib.isNil():
|
||||
return
|
||||
|
||||
when not defined(gcRefc):
|
||||
{.cast(gcsafe).}:
|
||||
`=destroy`(ctx.myLib[])
|
||||
else:
|
||||
discard
|
||||
freeShared(ctx.myLib)
|
||||
ctx.myLib = nil
|
||||
|
||||
var RecycleTimeout* = 1500.milliseconds
|
||||
## Upper bound the recycle handler waits for in-flight handlers before it
|
||||
## cancels them and reports the ctx as stuck. The drain returns as soon as they
|
||||
## finish, so this only bounds a *stuck* handler. A `var` so tests can shorten it.
|
||||
|
||||
proc recycleContext[T](
|
||||
ctx: ptr FFIContext[T], ongoingProcessReq: ptr seq[Future[void]]
|
||||
) {.async.} =
|
||||
## Drain the in-flight handlers, free the lib object, release the context for reuse,
|
||||
## and fire the callback with the outcome. Never blocks the caller.
|
||||
|
||||
ongoingProcessReq[].keepItIf(not it.finished())
|
||||
|
||||
## 1. Let the in-flight handlers finish on their own, bounded by RecycleTimeout.
|
||||
var naturallyDrained = ongoingProcessReq[].len == 0
|
||||
if not naturallyDrained:
|
||||
naturallyDrained = await allFutures(ongoingProcessReq[]).withTimeout(RecycleTimeout)
|
||||
|
||||
## 2. If any are wedged, cancel them and give the cancellations a bounded moment
|
||||
## to unwind, so the context can be reclaimed rather than leaked.
|
||||
var safeToRecycle = naturallyDrained
|
||||
if not naturallyDrained:
|
||||
for fut in ongoingProcessReq[]:
|
||||
if not fut.finished():
|
||||
fut.cancelSoon()
|
||||
safeToRecycle = await allFutures(ongoingProcessReq[]).withTimeout(RecycleTimeout)
|
||||
|
||||
let cb = ctx.recycleCallback
|
||||
let ud = ctx.recycleUserData
|
||||
ctx.recycleCallback = nil
|
||||
|
||||
if safeToRecycle:
|
||||
freeLib(ctx)
|
||||
ctx.callbackState = default(FFICallbackState)
|
||||
ongoingProcessReq[].setLen(0)
|
||||
ctx.release()
|
||||
|
||||
if not cb.isNil():
|
||||
foreignThreadGc:
|
||||
let msg =
|
||||
if naturallyDrained:
|
||||
""
|
||||
else:
|
||||
"recycle: in-flight requests did not finish in time"
|
||||
let cmsg = msg.cstring
|
||||
let retCode = if naturallyDrained: RET_OK else: RET_ERR
|
||||
cb(retCode, unsafeAddr cmsg[0], cast[csize_t](msg.len), ud)
|
||||
|
||||
proc ffiThreadBody[T](ctx: ptr FFIContext[T]) {.thread.} =
|
||||
## FFI thread body that attends library user API requests
|
||||
ffiCurrentCallbackState = addr ctx[].callbackState
|
||||
|
||||
logging.setupLog(logging.LogLevel.DEBUG, logging.LogFormat.TEXT)
|
||||
|
||||
defer:
|
||||
let fireRes = ctx.threadExitSignal.fireSync()
|
||||
if fireRes.isErr():
|
||||
error "failed to fire threadExitSignal on FFI thread exit", err = fireRes.error
|
||||
|
||||
let ffiRun = proc(ctx: ptr FFIContext[T]) {.async.} =
|
||||
|
||||
var ongoingProcessReq: seq[Future[void]]
|
||||
|
||||
while ctx.running.load():
|
||||
var expected = CtxLifecycle.RecyclePending
|
||||
if ctx.lifecycle.compareExchange(expected, CtxLifecycle.Recycling):
|
||||
await recycleContext(ctx, addr ongoingProcessReq)
|
||||
continue
|
||||
|
||||
let gotSignal = await ctx.reqSignal.wait().withTimeout(100.milliseconds)
|
||||
if not gotSignal:
|
||||
continue
|
||||
|
||||
## Wait for a request from the ffi consumer thread
|
||||
var request: ptr FFIThreadRequest
|
||||
if not ctx.reqChannel.tryRecv(request):
|
||||
continue
|
||||
|
||||
ongoingProcessReq.keepItIf(not it.finished())
|
||||
ongoingProcessReq.add(processRequest(request, ctx))
|
||||
|
||||
let fireRes = ctx.reqReceivedSignal.fireSync()
|
||||
if fireRes.isErr():
|
||||
error "could not fireSync back to requester thread", error = fireRes.error
|
||||
|
||||
waitFor ffiRun(ctx)
|
||||
|
||||
proc cleanUpResources[T](ctx: ptr FFIContext[T]): Result[void, string] =
|
||||
## Full cleanup for heap-allocated contexts: closes all resources and frees memory.
|
||||
defer:
|
||||
freeShared(ctx)
|
||||
ctx.lock.deinitLock()
|
||||
deinitEventRegistry(ctx[].eventRegistry)
|
||||
deinitEventQueue(ctx[].eventQueue)
|
||||
@ -115,6 +417,7 @@ proc initContextResources*[T](ctx: ptr FFIContext[T]): Result[void, string] =
|
||||
|
||||
ctx.registeredRequests = addr ffi_types.registeredRequests
|
||||
|
||||
ctx.lifecycle.store(CtxLifecycle.Active)
|
||||
ctx.running.store(true)
|
||||
|
||||
try:
|
||||
@ -180,10 +483,41 @@ proc stopAndJoinThreads*[T](ctx: ptr FFIContext[T]): Result[void, string] =
|
||||
joinThread(ctx.eventThread)
|
||||
ok()
|
||||
|
||||
proc clearContext[T](ctx: ptr FFIContext[T]): Result[void, string] =
|
||||
## Stops a heap-allocated FFI context.
|
||||
ctx.stopAndJoinThreads().isOkOr:
|
||||
return err("clearContext: " & $error)
|
||||
ctx.cleanUpResources().isOkOr:
|
||||
return err("cleanUpResources failed: " & $error)
|
||||
ok()
|
||||
proc requestRecycle*[T](
|
||||
ctx: ptr FFIContext[T], callback: FFICallBack, userData: pointer
|
||||
): Result[void, string] =
|
||||
## Starts ctx recycle process without stopping its worker, so the next
|
||||
## createFFIContext reuses the same threads and fds.
|
||||
##
|
||||
## During recycling, the FFI thread drains the handlers, frees the lib and releases
|
||||
## the context, then fires `callback` (RET_OK drained, RET_ERR stuck).
|
||||
|
||||
ctx.lock.acquire()
|
||||
if ctx.lifecycle.load() != CtxLifecycle.Active:
|
||||
ctx.lock.release()
|
||||
return err("requestRecycle: context is not Active (already recycling)")
|
||||
|
||||
ctx.recycleCallback = callback
|
||||
ctx.recycleUserData = userData
|
||||
ctx.lifecycle.store(CtxLifecycle.RecyclePending)
|
||||
ctx.lock.release()
|
||||
|
||||
let fired = ctx.reqSignal.fireSync().valueOr:
|
||||
return err("requestRecycle: failed to signal the FFI thread: " & $error)
|
||||
if not fired:
|
||||
return err("requestRecycle: failed to signal the FFI thread in time")
|
||||
return ok()
|
||||
|
||||
proc markAsActive*[T](ctx: ptr FFIContext[T]) =
|
||||
ctx.lifecycle.store(CtxLifecycle.Active)
|
||||
|
||||
proc tryClaim*[T](ctx: ptr FFIContext[T]): bool =
|
||||
## Returns true if acquired the contex, false if it was already claimed.
|
||||
var expected = false
|
||||
ctx.inUse.compareExchange(expected, true)
|
||||
|
||||
proc release*[T](ctx: ptr FFIContext[T]) =
|
||||
ctx.inUse.store(false)
|
||||
|
||||
proc isInUse*[T](ctx: ptr FFIContext[T]): bool =
|
||||
ctx.inUse.load()
|
||||
|
||||
@ -1,56 +1,60 @@
|
||||
import std/atomics
|
||||
import results
|
||||
import ./ffi_context
|
||||
import ./ffi_context, ./ffi_types
|
||||
|
||||
const MaxFFIContexts* = 32
|
||||
# Only affects upfront pool memory; fds/threads consumed per acquired slot.
|
||||
## Maximum number of concurrently live FFI contexts when using FFIContextPool.
|
||||
|
||||
type FFIContextPool*[T] = object
|
||||
## Fixed pool. Bounds ThreadSignalPtr fds at MaxFFIContexts * 2.
|
||||
slots: array[MaxFFIContexts, FFIContext[T]]
|
||||
inUse: array[MaxFFIContexts, Atomic[bool]]
|
||||
|
||||
proc acquireSlot[T](pool: var FFIContextPool[T]): Result[ptr FFIContext[T], string] =
|
||||
for i in 0 ..< MaxFFIContexts:
|
||||
var expected = false
|
||||
if pool.inUse[i].compareExchange(expected, true):
|
||||
return ok(pool.slots[i].addr)
|
||||
err("FFI context pool exhausted (max " & $MaxFFIContexts & " contexts)")
|
||||
|
||||
proc releaseSlot[T](pool: var FFIContextPool[T], ctx: ptr FFIContext[T]) =
|
||||
for i in 0 ..< MaxFFIContexts:
|
||||
if pool.slots[i].addr == ctx:
|
||||
pool.inUse[i].store(false)
|
||||
return
|
||||
contexts: array[MaxFFIContexts, FFIContext[T]]
|
||||
initialized: array[MaxFFIContexts, Atomic[bool]]
|
||||
|
||||
proc createFFIContext*[T](
|
||||
pool: var FFIContextPool[T]
|
||||
): Result[ptr FFIContext[T], string] =
|
||||
let ctx = pool.acquireSlot().valueOr:
|
||||
return err("createFFIContext: acquireSlot failed: " & $error)
|
||||
initContextResources(ctx).isOkOr:
|
||||
pool.releaseSlot(ctx)
|
||||
return err("createFFIContext: initContextResources failed: " & $error)
|
||||
ok(ctx)
|
||||
## Acquires a context from the fixed pool. The context's worker is built once on
|
||||
## first use and reused on every later acquisition.
|
||||
|
||||
for i in 0 ..< MaxFFIContexts:
|
||||
let ctx = pool.contexts[i].addr
|
||||
if not ctx.tryClaim():
|
||||
continue
|
||||
if pool.initialized[i].load():
|
||||
## Reused context: a prior destroy drained and released it, worker still alive.
|
||||
ctx.markAsActive()
|
||||
return ok(ctx)
|
||||
initContextResources(ctx).isOkOr:
|
||||
ctx.release()
|
||||
return err("createFFIContext: initContextResources failed: " & $error)
|
||||
pool.initialized[i].store(true)
|
||||
return ok(ctx)
|
||||
return err("FFI context pool exhausted (max " & $MaxFFIContexts & " contexts)")
|
||||
|
||||
proc releaseFFIContext*[T](
|
||||
ctx: ptr FFIContext[T], callback: FFICallBack, userData: pointer
|
||||
): Result[void, string] =
|
||||
return ctx.requestRecycle(callback, userData)
|
||||
|
||||
proc destroyFFIContext*[T](
|
||||
pool: var FFIContextPool[T], ctx: ptr FFIContext[T]
|
||||
): Result[void, string] =
|
||||
## On thread-exit timeout the slot is leaked — closing live-thread resources is unsafe.
|
||||
## Full teardown: stops/joins the worker threads and returns the context to the
|
||||
## pool, marking it uninitialised so a later createFFIContext rebuilds it.
|
||||
ctx.stopAndJoinThreads().isOkOr:
|
||||
return err("destroyFFIContext(pool): " & $error)
|
||||
# Required: next acquisition would otherwise re-init a live lock (UB).
|
||||
let deinitRes = ctx.deinitContextResources()
|
||||
pool.releaseSlot(ctx)
|
||||
deinitRes.isOkOr:
|
||||
return err("destroyFFIContext(pool): " & $error)
|
||||
ok()
|
||||
for i in 0 ..< MaxFFIContexts:
|
||||
if pool.contexts[i].addr == ctx:
|
||||
pool.initialized[i].store(false)
|
||||
break
|
||||
ctx.release()
|
||||
return ok()
|
||||
|
||||
proc isValidCtx*[T](pool: var FFIContextPool[T], ctx: pointer): bool =
|
||||
## Rejects nil / offset-invalid / dangling pointers at the API boundary.
|
||||
## Returns true only if ctx points to one of the pool's contexts that is
|
||||
## currently in use.
|
||||
if ctx.isNil():
|
||||
return false
|
||||
for i in 0 ..< MaxFFIContexts:
|
||||
if cast[pointer](pool.slots[i].addr) == ctx:
|
||||
return pool.inUse[i].load()
|
||||
false
|
||||
if cast[pointer](pool.contexts[i].addr) == ctx:
|
||||
return cast[ptr FFIContext[T]](ctx).isInUse()
|
||||
return false
|
||||
|
||||
@ -1253,16 +1253,15 @@ macro ffiDtor*(prc: untyped): untyped =
|
||||
## The body contains any library-level cleanup to run before context teardown.
|
||||
##
|
||||
## Example:
|
||||
## proc waku_destroy*(w: Waku) {.ffiDtor.} =
|
||||
## w.cleanup()
|
||||
## proc mylibobj_destroy*(obj: MyLibObj) {.ffiDtor.} =
|
||||
## obj.cleanup()
|
||||
##
|
||||
## The generated C-exported proc has the signature:
|
||||
## int waku_destroy(void* ctx)
|
||||
## cint mylibobj_destroy(void* ctx, FfiCallback callback, void* userData)
|
||||
##
|
||||
## It extracts the library value from ctx, runs the body, then calls
|
||||
## destroyFFIContext to tear down the FFI thread and free the context.
|
||||
## Returns RET_OK on success, RET_ERR on failure (null/invalid ctx, or
|
||||
## destroyFFIContext failure).
|
||||
## Recycle the context for reuse to keep fd usage bounded.
|
||||
## NON-BLOCKING: returns RET_OK once accepted;
|
||||
## the real outcome arrives via `callback`.
|
||||
|
||||
let procName = prc[0]
|
||||
let formalParams = prc[3]
|
||||
@ -1271,8 +1270,8 @@ macro ffiDtor*(prc: untyped): untyped =
|
||||
if formalParams.len < 2:
|
||||
error("ffiDtor: proc must have exactly one parameter (w: LibType)")
|
||||
|
||||
let libParamName = formalParams[1][0]
|
||||
let libTypeName = formalParams[1][1]
|
||||
let libParamName = formalParams[1][0] # e.g. w
|
||||
let libTypeName = formalParams[1][1] # e.g. MyLibObj
|
||||
|
||||
let procNameStr = block:
|
||||
let raw = $procName
|
||||
@ -1289,7 +1288,7 @@ macro ffiDtor*(prc: untyped): untyped =
|
||||
if procName.kind == nnkPostfix:
|
||||
cExportProcName = procName[1]
|
||||
|
||||
let destroyResIdent = genSym(nskLet, "destroyRes")
|
||||
let releaseResIdent = genSym(nskLet, "destroyRes")
|
||||
|
||||
let ffiBody = newStmtList()
|
||||
|
||||
@ -1314,12 +1313,14 @@ macro ffiDtor*(prc: untyped): untyped =
|
||||
|
||||
let poolIdent = ident($libTypeName & "FFIPool")
|
||||
ffiBody.add quote do:
|
||||
let `destroyResIdent` =
|
||||
`poolIdent`.destroyFFIContext(cast[ptr FFIContext[`libTypeName`]](ctx))
|
||||
if `destroyResIdent`.isErr():
|
||||
let `releaseResIdent` = releaseFFIContext(
|
||||
cast[ptr FFIContext[`libTypeName`]](ctx), callback, userData
|
||||
)
|
||||
if `releaseResIdent`.isErr():
|
||||
if not callback.isNil():
|
||||
let errStr = "release failed: " & $`releaseResIdent`.error
|
||||
callback(RET_ERR, unsafeAddr errStr[0], cast[csize_t](errStr.len), userData)
|
||||
return RET_ERR
|
||||
|
||||
ffiBody.add quote do:
|
||||
return RET_OK
|
||||
|
||||
let ffiProc = newProc(
|
||||
|
||||
892
tests/test_ffi_context.nim
Normal file
892
tests/test_ffi_context.nim
Normal file
@ -0,0 +1,892 @@
|
||||
import std/[locks, strutils, os, osproc, sequtils]
|
||||
import unittest2
|
||||
import results
|
||||
import ../ffi
|
||||
|
||||
type TestLib = object
|
||||
|
||||
## Per-request callback state. The test thread blocks on `cond` until the
|
||||
## FFI thread signals it — no polling, no CPU waste.
|
||||
type CallbackData = object
|
||||
lock: Lock
|
||||
cond: Cond
|
||||
called: bool
|
||||
retCode: cint
|
||||
msg: array[512, char]
|
||||
msgLen: int
|
||||
|
||||
proc initCallbackData(d: var CallbackData) =
|
||||
d.lock.initLock()
|
||||
d.cond.initCond()
|
||||
|
||||
proc deinitCallbackData(d: var CallbackData) =
|
||||
d.cond.deinitCond()
|
||||
d.lock.deinitLock()
|
||||
|
||||
proc testCallback(
|
||||
retCode: cint, msg: ptr cchar, len: csize_t, userData: pointer
|
||||
) {.cdecl, gcsafe, raises: [].} =
|
||||
let d = cast[ptr CallbackData](userData)
|
||||
acquire(d[].lock)
|
||||
d[].retCode = retCode
|
||||
let n = min(int(len), d[].msg.high)
|
||||
if n > 0 and not msg.isNil:
|
||||
copyMem(addr d[].msg[0], msg, n)
|
||||
d[].msg[n] = '\0'
|
||||
d[].msgLen = n
|
||||
d[].called = true
|
||||
signal(d[].cond)
|
||||
release(d[].lock)
|
||||
|
||||
proc waitCallback(d: var CallbackData) =
|
||||
acquire(d.lock)
|
||||
while not d.called:
|
||||
wait(d.cond, d.lock)
|
||||
release(d.lock)
|
||||
|
||||
proc callbackMsg(d: var CallbackData): string =
|
||||
result = newString(d.msgLen)
|
||||
if d.msgLen > 0:
|
||||
copyMem(addr result[0], addr d.msg[0], d.msgLen)
|
||||
|
||||
registerReqFFI(PingRequest, lib: ptr TestLib):
|
||||
proc(message: cstring): Future[Result[string, string]] {.async.} =
|
||||
return ok("pong:" & $message)
|
||||
|
||||
registerReqFFI(FailRequest, lib: ptr TestLib):
|
||||
proc(): Future[Result[string, string]] {.async.} =
|
||||
return err("intentional failure")
|
||||
|
||||
registerReqFFI(EmptyOkRequest, lib: ptr TestLib):
|
||||
proc(): Future[Result[string, string]] {.async.} =
|
||||
return ok("")
|
||||
|
||||
registerReqFFI(SlowRequest, lib: ptr TestLib):
|
||||
proc(): Future[Result[string, string]] {.async.} =
|
||||
await sleepAsync(500.milliseconds)
|
||||
return ok("slow-done")
|
||||
|
||||
# Coordination channel: the FFI handler signals the test thread the instant
|
||||
# it is about to block the event loop, so the test can call destroyFFIContext
|
||||
# while the event loop is truly frozen.
|
||||
var gSyncBlockStarted: Channel[bool]
|
||||
gSyncBlockStarted.open()
|
||||
|
||||
registerReqFFI(SyncBlockingRequest, lib: ptr TestLib):
|
||||
proc(): Future[Result[string, string]] {.async.} =
|
||||
# Yield first so that reqReceivedSignal fires and sendRequestToFFIThread
|
||||
# returns on the calling thread before we start the synchronous block.
|
||||
await sleepAsync(0.milliseconds)
|
||||
# Signal the test thread: the event loop is about to be frozen.
|
||||
# Channel.send is annotated as raising under refc, so wrap.
|
||||
try:
|
||||
gSyncBlockStarted.send(true)
|
||||
except Exception as exc:
|
||||
return err("gSyncBlockStarted.send raised: " & exc.msg)
|
||||
# Simulates a request that blocks the event-loop thread synchronously
|
||||
# (e.g. w.stop() -> switch.stop() -> connManager.close() with blocking I/O).
|
||||
# Unlike sleepAsync, os.sleep holds the OS thread and prevents Chronos from
|
||||
# processing any callbacks -- including the reqSignal fired by destroyFFIContext.
|
||||
os.sleep(5_000)
|
||||
return ok("sync-blocking-done")
|
||||
|
||||
# Approximates the heavy ref-object workload that libwaku/libp2p performs on
|
||||
# the FFI thread. The exact cell count is large enough to force several refc
|
||||
# GC cycles; under refc this stresses the heap state that, when later combined
|
||||
# with a chronos Selector allocation on the main thread (via close()), used to
|
||||
# trip the rawNewObj → signal-handler infinite recursion.
|
||||
type RefCell = ref object
|
||||
next: RefCell
|
||||
payload: array[64, byte]
|
||||
|
||||
registerReqFFI(HeavyRefAllocRequest, lib: ptr TestLib):
|
||||
proc(): Future[Result[string, string]] {.async.} =
|
||||
var head: RefCell
|
||||
for i in 0 ..< 50_000:
|
||||
let n = RefCell(next: head)
|
||||
head = n
|
||||
if i mod 1000 == 0:
|
||||
await sleepAsync(0.milliseconds)
|
||||
# Break the chain iteratively before releasing head.
|
||||
# ORC's =destroy for RefCell recurses through .next, so a 50k-node chain
|
||||
# would produce ~50k nested =destroy calls and overflow the stack.
|
||||
# Walking the list and unlinking each node first keeps destruction O(n)
|
||||
# iterative instead of O(n) recursive.
|
||||
var node = head
|
||||
head = nil
|
||||
while not node.isNil():
|
||||
let nxt = node.next
|
||||
node.next = nil # unlink before the refcount of `node` can drop to zero
|
||||
node = nxt
|
||||
await sleepAsync(10.milliseconds)
|
||||
return ok("heavy-done")
|
||||
|
||||
suite "FFIContextPool":
|
||||
test "create and destroy via pool succeeds":
|
||||
var pool: FFIContextPool[TestLib]
|
||||
let ctx = pool.createFFIContext().valueOr:
|
||||
assert false, "createFFIContext(pool) failed: " & $error
|
||||
return
|
||||
check pool.destroyFFIContext(ctx).isOk()
|
||||
|
||||
test "context is reused after destroy":
|
||||
var pool: FFIContextPool[TestLib]
|
||||
let ctx1 = pool.createFFIContext().valueOr:
|
||||
assert false, "createFFIContext(pool) failed: " & $error
|
||||
return
|
||||
check pool.destroyFFIContext(ctx1).isOk()
|
||||
# After destroying, the same context must be available again
|
||||
let ctx2 = pool.createFFIContext().valueOr:
|
||||
assert false, "createFFIContext(pool) failed after context release: " & $error
|
||||
return
|
||||
check pool.destroyFFIContext(ctx2).isOk()
|
||||
check ctx1 == ctx2 # same context reused
|
||||
|
||||
test "pool exhaustion returns error":
|
||||
var pool: FFIContextPool[TestLib]
|
||||
var ctxs: array[MaxFFIContexts, ptr FFIContext[TestLib]]
|
||||
for i in 0 ..< MaxFFIContexts:
|
||||
ctxs[i] = pool.createFFIContext().valueOr:
|
||||
for j in 0 ..< i:
|
||||
discard pool.destroyFFIContext(ctxs[j])
|
||||
assert false, "createFFIContext(pool) failed at context " & $i & ": " & $error
|
||||
return
|
||||
# Pool is now full — next create must fail
|
||||
check pool.createFFIContext().isErr()
|
||||
for i in 0 ..< MaxFFIContexts:
|
||||
discard pool.destroyFFIContext(ctxs[i])
|
||||
|
||||
test "requests are processed via pool context":
|
||||
var pool: FFIContextPool[TestLib]
|
||||
var d: CallbackData
|
||||
initCallbackData(d)
|
||||
defer:
|
||||
deinitCallbackData(d)
|
||||
|
||||
let ctx = pool.createFFIContext().valueOr:
|
||||
assert false, "createFFIContext(pool) failed: " & $error
|
||||
return
|
||||
defer:
|
||||
discard pool.destroyFFIContext(ctx)
|
||||
|
||||
check sendRequestToFFIThread(
|
||||
ctx, PingRequest.ffiNewReq(testCallback, addr d, "pool".cstring)
|
||||
)
|
||||
.isOk()
|
||||
waitCallback(d)
|
||||
check d.retCode == RET_OK
|
||||
check callbackMsg(d) == "pong:pool"
|
||||
|
||||
suite "createFFIContext / destroyFFIContext":
|
||||
test "create and destroy succeeds":
|
||||
var pool: FFIContextPool[TestLib]
|
||||
let ctx = pool.createFFIContext().valueOr:
|
||||
checkpoint "createFFIContext failed: " & $error
|
||||
check false
|
||||
return
|
||||
check pool.destroyFFIContext(ctx).isOk()
|
||||
|
||||
test "double destroy is safe via running flag":
|
||||
var pool: FFIContextPool[TestLib]
|
||||
let ctx = pool.createFFIContext().valueOr:
|
||||
check false
|
||||
return
|
||||
check pool.destroyFFIContext(ctx).isOk()
|
||||
|
||||
suite "destroyFFIContext does not hang":
|
||||
test "destroy while a slow async request is still in-flight":
|
||||
## Reproduces the race where destroyFFIContext was called while a long-
|
||||
## running async request (e.g. stop_node / w.stop()) was still executing.
|
||||
## The destroy must return well within 2 seconds; before the fix it would
|
||||
## block forever on joinThread(ffiThread).
|
||||
var pool: FFIContextPool[TestLib]
|
||||
let ctx = pool.createFFIContext().valueOr:
|
||||
check false
|
||||
return
|
||||
|
||||
var d: CallbackData
|
||||
initCallbackData(d)
|
||||
defer: deinitCallbackData(d)
|
||||
|
||||
# sendRequestToFFIThread returns as soon as the FFI thread ACKs receipt;
|
||||
# the 500 ms work continues asynchronously on the FFI thread.
|
||||
check sendRequestToFFIThread(
|
||||
ctx, SlowRequest.ffiNewReq(testCallback, addr d)
|
||||
).isOk()
|
||||
|
||||
# Destroy immediately while SlowRequest is still running.
|
||||
let t0 = Moment.now()
|
||||
check pool.destroyFFIContext(ctx).isOk()
|
||||
check (Moment.now() - t0) < 2.seconds
|
||||
|
||||
suite "destroyFFIContext does not hang when event loop is blocked":
|
||||
test "destroy while sync-blocking request is in-flight":
|
||||
## Reproduces the hang seen in logosdelivery_example.c:
|
||||
## logosdelivery_stop_node(...) -- triggers w.stop() on the FFI thread
|
||||
## sleep(1)
|
||||
## logosdelivery_destroy(...) -- hangs forever
|
||||
##
|
||||
## Root cause: w.stop() (and similar tear-down calls) can execute a
|
||||
## synchronous blocking section that holds the OS thread, preventing
|
||||
## the Chronos event loop from processing the reqSignal fired by
|
||||
## destroyFFIContext. The result is joinThread(ffiThread) never returns.
|
||||
##
|
||||
## With the fix, destroyFFIContext must complete well within the 5 s that
|
||||
## SyncBlockingRequest holds the event loop.
|
||||
var pool: FFIContextPool[TestLib]
|
||||
let ctx = pool.createFFIContext().valueOr:
|
||||
check false
|
||||
return
|
||||
|
||||
# CallbackData and ctx are kept alive past destroyFFIContext: the leaked
|
||||
# FFI thread is still inside os.sleep(5_000) and will eventually wake,
|
||||
# run handleRes, fire testCallback, and exit normally. We wait for that
|
||||
# to happen at the end of the test so the leaked thread cannot race with
|
||||
# subsequent tests' createFFIContext on Linux/Windows. Heap allocation
|
||||
# ensures the late callback's userData is still valid when it fires.
|
||||
let d = createShared(CallbackData)
|
||||
initCallbackData(d[])
|
||||
|
||||
check sendRequestToFFIThread(
|
||||
ctx, SyncBlockingRequest.ffiNewReq(testCallback, d)
|
||||
).isOk()
|
||||
|
||||
# Block until the FFI handler has signalled that os.sleep is about to start.
|
||||
# This guarantees destroyFFIContext is called while the event loop is frozen.
|
||||
discard gSyncBlockStarted.recv()
|
||||
|
||||
# Destroy must return promptly even though the event loop is frozen for 5s.
|
||||
# It deliberately returns err and leaks ctx in this scenario rather than
|
||||
# hanging on joinThread.
|
||||
let t0 = Moment.now()
|
||||
check pool.destroyFFIContext(ctx).isErr()
|
||||
check (Moment.now() - t0) < 3.seconds
|
||||
|
||||
# Drain the leaked thread before the test scope ends.
|
||||
# 1. waitCallback blocks until os.sleep(5_000) returns and handleRes
|
||||
# invokes testCallback (~3.5s after destroy returned), which proves
|
||||
# the leaked thread has reached the end of processRequest.
|
||||
# 2. Yield briefly so the thread can finish iterating its while loop,
|
||||
# fire threadExitSignal in its defer, and return. Without this, on
|
||||
# Linux/Windows the still-live thread can race with the next test's
|
||||
# createFFIContext under --mm:orc and segfault.
|
||||
# ctx.cleanUpResources is intentionally NOT called: destroyFFIContext
|
||||
# skipped it for a reason, and the signal fds are reclaimed by the OS
|
||||
# at process exit.
|
||||
waitCallback(d[])
|
||||
os.sleep(200)
|
||||
deinitCallbackData(d[])
|
||||
freeShared(d)
|
||||
|
||||
suite "destroyFFIContext refc workaround":
|
||||
## Documents the refc-specific workaround in cleanUpResources.
|
||||
##
|
||||
## Background: when the FFI thread does heavy ref-object work (the workload
|
||||
## that triggered the libwaku hang in production), the refc GC heap reaches
|
||||
## a state where the very first chronos Selector allocation on the *main*
|
||||
## thread — which happens lazily inside ThreadSignalPtr.close() through
|
||||
## getThreadDispatcher() — traps in rawNewObj. The refc signal handler
|
||||
## itself re-enters the same allocator and the process never returns.
|
||||
## Captured stack:
|
||||
## close → safeUnregisterAndCloseFd → getThreadDispatcher →
|
||||
## newDispatcher → Selector.new → newObj (gc.nim:488) → rawNewObj →
|
||||
## _sigtramp → signalHandler → newObjNoInit → addNewObjToZCT (loop)
|
||||
##
|
||||
## The workaround in cleanUpResources is `when defined(gcRefc): discard`,
|
||||
## i.e. skip the close() calls under refc only. orc is unaffected and
|
||||
## still cleans up the signal fds normally.
|
||||
##
|
||||
## NOTE: this test is documentation more than regression: a synthetic
|
||||
## ref-allocation workload of ~50k cells does NOT corrupt the refc heap
|
||||
## the way the real libwaku/libp2p teardown does, so this test passes
|
||||
## even when the workaround is disabled. Reproducing the actual hang
|
||||
## requires the full libwaku workload (logosdelivery_example.c).
|
||||
## Verification of the workaround was done end-to-end against that
|
||||
## example: with `--mm:refc` and close() enabled it hangs forever in
|
||||
## the captured stack above; with `when defined(gcRefc): discard` it
|
||||
## returns immediately. Under `--mm:orc` it returns immediately either
|
||||
## way.
|
||||
test "destroy after heavy ref-allocation workload returns promptly":
|
||||
var pool: FFIContextPool[TestLib]
|
||||
let ctx = pool.createFFIContext().valueOr:
|
||||
check false
|
||||
return
|
||||
|
||||
var d: CallbackData
|
||||
initCallbackData(d)
|
||||
defer: deinitCallbackData(d)
|
||||
|
||||
check sendRequestToFFIThread(
|
||||
ctx, HeavyRefAllocRequest.ffiNewReq(testCallback, addr d)
|
||||
).isOk()
|
||||
waitCallback(d)
|
||||
check d.retCode == RET_OK
|
||||
|
||||
let t0 = Moment.now()
|
||||
check pool.destroyFFIContext(ctx).isOk()
|
||||
check (Moment.now() - t0) < 3.seconds
|
||||
|
||||
suite "sendRequestToFFIThread":
|
||||
test "successful request triggers RET_OK callback":
|
||||
var d: CallbackData
|
||||
initCallbackData(d)
|
||||
defer:
|
||||
deinitCallbackData(d)
|
||||
|
||||
var pool: FFIContextPool[TestLib]
|
||||
let ctx = pool.createFFIContext().valueOr:
|
||||
check false
|
||||
return
|
||||
defer:
|
||||
discard pool.destroyFFIContext(ctx)
|
||||
|
||||
check sendRequestToFFIThread(
|
||||
ctx, PingRequest.ffiNewReq(testCallback, addr d, "hello".cstring)
|
||||
)
|
||||
.isOk()
|
||||
waitCallback(d)
|
||||
check d.retCode == RET_OK
|
||||
check callbackMsg(d) == "pong:hello"
|
||||
|
||||
test "failing request triggers RET_ERR callback":
|
||||
var d: CallbackData
|
||||
initCallbackData(d)
|
||||
defer:
|
||||
deinitCallbackData(d)
|
||||
|
||||
var pool: FFIContextPool[TestLib]
|
||||
let ctx = pool.createFFIContext().valueOr:
|
||||
check false
|
||||
return
|
||||
defer:
|
||||
discard pool.destroyFFIContext(ctx)
|
||||
|
||||
check sendRequestToFFIThread(ctx, FailRequest.ffiNewReq(testCallback, addr d)).isOk()
|
||||
waitCallback(d)
|
||||
check d.retCode == RET_ERR
|
||||
|
||||
test "empty ok response delivers empty message":
|
||||
var d: CallbackData
|
||||
initCallbackData(d)
|
||||
defer:
|
||||
deinitCallbackData(d)
|
||||
|
||||
var pool: FFIContextPool[TestLib]
|
||||
let ctx = pool.createFFIContext().valueOr:
|
||||
check false
|
||||
return
|
||||
defer:
|
||||
discard pool.destroyFFIContext(ctx)
|
||||
|
||||
check sendRequestToFFIThread(ctx, EmptyOkRequest.ffiNewReq(testCallback, addr d))
|
||||
.isOk()
|
||||
waitCallback(d)
|
||||
check d.retCode == RET_OK
|
||||
check d.msgLen == 0
|
||||
|
||||
test "sequential requests are all processed":
|
||||
var pool: FFIContextPool[TestLib]
|
||||
let ctx = pool.createFFIContext().valueOr:
|
||||
check false
|
||||
return
|
||||
defer:
|
||||
discard pool.destroyFFIContext(ctx)
|
||||
|
||||
for i in 1 .. 5:
|
||||
var d: CallbackData
|
||||
initCallbackData(d)
|
||||
let msg = "msg" & $i
|
||||
check sendRequestToFFIThread(
|
||||
ctx, PingRequest.ffiNewReq(testCallback, addr d, msg.cstring)
|
||||
)
|
||||
.isOk()
|
||||
waitCallback(d)
|
||||
deinitCallbackData(d)
|
||||
check d.retCode == RET_OK
|
||||
check callbackMsg(d) == "pong:" & msg
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# ffiCtor macro integration test
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
type SimpleLib = object
|
||||
value: int
|
||||
|
||||
ffiType:
|
||||
type SimpleConfig = object
|
||||
initialValue: int
|
||||
|
||||
proc testlib_create*(
|
||||
config: SimpleConfig
|
||||
): Future[Result[SimpleLib, string]] {.ffiCtor.} =
|
||||
return ok(SimpleLib(value: config.initialValue))
|
||||
|
||||
# Records the value of the library object the destructor body saw, so a test can
|
||||
# confirm the user cleanup body ran with the right lib state before teardown.
|
||||
var gDestroyedValue {.threadvar.}: int
|
||||
proc testlib_destroy*(lib: SimpleLib) {.ffiDtor.} =
|
||||
gDestroyedValue = lib.value
|
||||
|
||||
suite "ffiCtor macro":
|
||||
test "creates context and returns pointer via callback":
|
||||
var d: CallbackData
|
||||
initCallbackData(d)
|
||||
defer: deinitCallbackData(d)
|
||||
|
||||
let configJson = ffiSerialize(SimpleConfig(initialValue: 42))
|
||||
let ret = testlib_create(configJson.cstring, testCallback, addr d)
|
||||
|
||||
check not ret.isNil()
|
||||
|
||||
waitCallback(d)
|
||||
|
||||
check d.retCode == RET_OK
|
||||
|
||||
# The callback message is the ctx address as a decimal string
|
||||
let addrStr = callbackMsg(d)
|
||||
check addrStr.len > 0
|
||||
|
||||
let ctxAddr = cast[uint](parseBiggestUInt(addrStr))
|
||||
check ctxAddr != 0
|
||||
let ctx = cast[ptr FFIContext[SimpleLib]](ctxAddr)
|
||||
|
||||
# Verify the library was properly initialized
|
||||
check not ctx[].myLib.isNil
|
||||
check ctx[].myLib[].value == 42
|
||||
|
||||
check SimpleLibFFIPool.destroyFFIContext(ctx).isOk()
|
||||
|
||||
proc createSimpleLib(initialValue: int): ptr FFIContext[SimpleLib] =
|
||||
## Helper: run the generated async ctor and return the live ctx.
|
||||
var d: CallbackData
|
||||
initCallbackData(d)
|
||||
defer:
|
||||
deinitCallbackData(d)
|
||||
let ret =
|
||||
testlib_create(ffiSerialize(SimpleConfig(initialValue: initialValue)).cstring,
|
||||
testCallback, addr d)
|
||||
doAssert not ret.isNil()
|
||||
waitCallback(d)
|
||||
doAssert d.retCode == RET_OK
|
||||
return cast[ptr FFIContext[SimpleLib]](cast[uint](parseBiggestUInt(callbackMsg(d))))
|
||||
|
||||
suite "ffiDtor macro (async destroy + reuse)":
|
||||
test "destroy fires RET_OK after teardown, frees myLib, and frees the context":
|
||||
let ctx = createSimpleLib(5)
|
||||
check not ctx[].myLib.isNil
|
||||
check ctx[].myLib[].value == 5
|
||||
|
||||
var dD: CallbackData
|
||||
initCallbackData(dD)
|
||||
defer:
|
||||
deinitCallbackData(dD)
|
||||
|
||||
# Async destroy: the C return is just "accepted"; the real outcome arrives
|
||||
# via the callback once the FFI thread has finished tearing the lib down.
|
||||
check testlib_destroy(cast[pointer](ctx), testCallback, addr dD) == RET_OK
|
||||
waitCallback(dD)
|
||||
check dD.retCode == RET_OK
|
||||
check gDestroyedValue == 5 # the user cleanup body saw the live lib
|
||||
check ctx[].myLib.isNil() # freed on the FFI thread
|
||||
|
||||
# The context was freed from the FFI thread, so a fresh create reclaims it.
|
||||
let ctx2 = createSimpleLib(9)
|
||||
check ctx2 == ctx # same context, reused worker + fds
|
||||
check ctx2[].myLib[].value == 9
|
||||
check SimpleLibFFIPool.destroyFFIContext(ctx2).isOk()
|
||||
|
||||
test "destroy waits for an in-flight request before reporting RET_OK":
|
||||
let ctx = createSimpleLib(1)
|
||||
|
||||
# Dispatch a 500 ms handler and do NOT wait — it is in flight at destroy time.
|
||||
var slow: CallbackData
|
||||
initCallbackData(slow)
|
||||
defer:
|
||||
deinitCallbackData(slow)
|
||||
check sendRequestToFFIThread(ctx, SlowRequest.ffiNewReq(testCallback, addr slow)).isOk()
|
||||
|
||||
var dD: CallbackData
|
||||
initCallbackData(dD)
|
||||
defer:
|
||||
deinitCallbackData(dD)
|
||||
check testlib_destroy(cast[pointer](ctx), testCallback, addr dD) == RET_OK
|
||||
waitCallback(dD)
|
||||
check dD.retCode == RET_OK
|
||||
# Drained: the in-flight handler ran to completion before destroy reported OK.
|
||||
check slow.called
|
||||
check callbackMsg(slow) == "slow-done"
|
||||
|
||||
let ctx2 = createSimpleLib(2)
|
||||
check ctx2 == ctx
|
||||
check SimpleLibFFIPool.destroyFFIContext(ctx2).isOk()
|
||||
|
||||
test "requests are rejected once a destroy closes the gate":
|
||||
let ctx = createSimpleLib(3)
|
||||
|
||||
var dD: CallbackData
|
||||
initCallbackData(dD)
|
||||
defer:
|
||||
deinitCallbackData(dD)
|
||||
check testlib_destroy(cast[pointer](ctx), testCallback, addr dD) == RET_OK
|
||||
waitCallback(dD)
|
||||
check dD.retCode == RET_OK
|
||||
|
||||
# Gate stays closed until the context is reacquired: a late request must not
|
||||
# dispatch onto a context about to be (or already) reused.
|
||||
var d: CallbackData
|
||||
initCallbackData(d)
|
||||
defer:
|
||||
deinitCallbackData(d)
|
||||
check sendRequestToFFIThread(
|
||||
ctx, SlowRequest.ffiNewReq(testCallback, addr d)
|
||||
).isErr()
|
||||
|
||||
let ctx2 = createSimpleLib(4)
|
||||
check ctx2 == ctx
|
||||
check SimpleLibFFIPool.destroyFFIContext(ctx2).isOk()
|
||||
|
||||
test "a stuck context is reported as RET_ERR rather than hanging":
|
||||
let ctx = createSimpleLib(8)
|
||||
|
||||
let savedTimeout = RecycleTimeout
|
||||
RecycleTimeout = 150.milliseconds
|
||||
defer:
|
||||
RecycleTimeout = savedTimeout
|
||||
|
||||
# In-flight handler outlasts the (shortened) drain timeout.
|
||||
var slow: CallbackData
|
||||
initCallbackData(slow)
|
||||
defer:
|
||||
deinitCallbackData(slow)
|
||||
check sendRequestToFFIThread(ctx, SlowRequest.ffiNewReq(testCallback, addr slow)).isOk()
|
||||
|
||||
var dD: CallbackData
|
||||
initCallbackData(dD)
|
||||
defer:
|
||||
deinitCallbackData(dD)
|
||||
check testlib_destroy(cast[pointer](ctx), testCallback, addr dD) == RET_OK
|
||||
waitCallback(dD)
|
||||
check dD.retCode == RET_ERR # drain timed out -> ctx reported stuck
|
||||
|
||||
# The stuck context is leaked (not reused); the handler still finishes on its
|
||||
# own. Wait for it, then fully tear the leaked context down.
|
||||
waitCallback(slow)
|
||||
check SimpleLibFFIPool.destroyFFIContext(ctx).isOk()
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Simplified .ffi. macro integration test
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
ffiType:
|
||||
type SendConfig = object
|
||||
message: string
|
||||
|
||||
proc testlib_send*(
|
||||
lib: SimpleLib, cfg: SendConfig
|
||||
): Future[Result[string, string]] {.ffi.} =
|
||||
return ok("echo:" & cfg.message & ":" & $lib.value)
|
||||
|
||||
suite "simplified .ffi. macro":
|
||||
test "sends request and gets serialized response via callback":
|
||||
# First create a context using ffiCtor
|
||||
var ctorD: CallbackData
|
||||
initCallbackData(ctorD)
|
||||
defer: deinitCallbackData(ctorD)
|
||||
|
||||
let configJson = ffiSerialize(SimpleConfig(initialValue: 7))
|
||||
let ctorRet = testlib_create(configJson.cstring, testCallback, addr ctorD)
|
||||
check not ctorRet.isNil()
|
||||
|
||||
waitCallback(ctorD)
|
||||
check ctorD.retCode == RET_OK
|
||||
|
||||
let addrStr = callbackMsg(ctorD)
|
||||
check addrStr.len > 0
|
||||
|
||||
let ctxAddr = cast[uint](parseBiggestUInt(addrStr))
|
||||
check ctxAddr != 0
|
||||
let ctx = cast[ptr FFIContext[SimpleLib]](ctxAddr)
|
||||
defer: check SimpleLibFFIPool.destroyFFIContext(ctx).isOk()
|
||||
|
||||
# Now call the .ffi. proc
|
||||
var d: CallbackData
|
||||
initCallbackData(d)
|
||||
defer: deinitCallbackData(d)
|
||||
|
||||
let cfgJson = ffiSerialize(SendConfig(message: "hello"))
|
||||
let ret = testlib_send(ctx, testCallback, addr d, cfgJson.cstring)
|
||||
check ret == RET_OK
|
||||
|
||||
waitCallback(d)
|
||||
check d.retCode == RET_OK
|
||||
|
||||
let receivedMsg = callbackMsg(d)
|
||||
let decoded = ffiDeserialize(receivedMsg.cstring, string).valueOr:
|
||||
check false
|
||||
""
|
||||
check decoded == "echo:hello:7"
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# async/sync detection in .ffi. macro integration test
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Sync proc (no await in body) — macro detects this and bypasses thread machinery
|
||||
proc testlib_version*(
|
||||
lib: SimpleLib
|
||||
): Future[Result[string, string]] {.ffi.} =
|
||||
return ok("v" & $lib.value)
|
||||
|
||||
suite "async/sync detection in .ffi.":
|
||||
test "sync proc invokes callback without thread hop":
|
||||
# Create a context using ffiCtor
|
||||
var ctorD: CallbackData
|
||||
initCallbackData(ctorD)
|
||||
defer: deinitCallbackData(ctorD)
|
||||
|
||||
let configJson = ffiSerialize(SimpleConfig(initialValue: 3))
|
||||
let ctorRet = testlib_create(configJson.cstring, testCallback, addr ctorD)
|
||||
check not ctorRet.isNil()
|
||||
|
||||
waitCallback(ctorD)
|
||||
check ctorD.retCode == RET_OK
|
||||
|
||||
let addrStr = callbackMsg(ctorD)
|
||||
check addrStr.len > 0
|
||||
|
||||
let ctxAddr = cast[uint](parseBiggestUInt(addrStr))
|
||||
check ctxAddr != 0
|
||||
let ctx = cast[ptr FFIContext[SimpleLib]](ctxAddr)
|
||||
defer: check SimpleLibFFIPool.destroyFFIContext(ctx).isOk()
|
||||
|
||||
var d2: CallbackData
|
||||
initCallbackData(d2)
|
||||
defer: deinitCallbackData(d2)
|
||||
|
||||
# Call sync proc — callback should fire before the proc returns (no thread hop)
|
||||
let ret = testlib_version(ctx, testCallback, addr d2)
|
||||
# No sleep needed: sync path fires callback inline before returning
|
||||
check ret == RET_OK
|
||||
check d2.called # fires synchronously — no waitCallback needed
|
||||
check d2.retCode == RET_OK
|
||||
let receivedMsg = callbackMsg(d2)
|
||||
let decoded = ffiDeserialize(receivedMsg.cstring, string).valueOr:
|
||||
check false
|
||||
""
|
||||
check decoded == "v3"
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# ptr T return type in .ffi. macro integration test
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
type Handle = object
|
||||
data: string
|
||||
|
||||
ffiType:
|
||||
type NameParam = object
|
||||
name: string
|
||||
|
||||
proc testlib_alloc_handle*(
|
||||
lib: SimpleLib, np: NameParam
|
||||
): Future[Result[ptr Handle, string]] {.ffi.} =
|
||||
let h = createShared(Handle)
|
||||
h[] = Handle(data: np.name & ":" & $lib.value)
|
||||
return ok(h)
|
||||
|
||||
proc testlib_read_handle*(
|
||||
lib: SimpleLib, handle: pointer
|
||||
): Future[Result[string, string]] {.ffi.} =
|
||||
let h = cast[ptr Handle](handle)
|
||||
return ok(h[].data)
|
||||
|
||||
proc testlib_free_handle*(
|
||||
lib: SimpleLib, handle: pointer
|
||||
): Future[Result[string, string]] {.ffi.} =
|
||||
let h = cast[ptr Handle](handle)
|
||||
deallocShared(h)
|
||||
return ok("freed")
|
||||
|
||||
suite "ptr return type in .ffi.":
|
||||
test "returns a heap-allocated handle and reads it back":
|
||||
# Create context via ffiCtor
|
||||
var ctorD: CallbackData
|
||||
initCallbackData(ctorD)
|
||||
defer: deinitCallbackData(ctorD)
|
||||
|
||||
let configJson = ffiSerialize(SimpleConfig(initialValue: 5))
|
||||
let ctorRet = testlib_create(configJson.cstring, testCallback, addr ctorD)
|
||||
check not ctorRet.isNil()
|
||||
|
||||
waitCallback(ctorD)
|
||||
check ctorD.retCode == RET_OK
|
||||
|
||||
let ctxAddrStr = callbackMsg(ctorD)
|
||||
check ctxAddrStr.len > 0
|
||||
let ctxAddr = cast[uint](parseBiggestUInt(ctxAddrStr))
|
||||
check ctxAddr != 0
|
||||
let ctx = cast[ptr FFIContext[SimpleLib]](ctxAddr)
|
||||
defer: check SimpleLibFFIPool.destroyFFIContext(ctx).isOk()
|
||||
|
||||
# Alloc a handle
|
||||
var allocD: CallbackData
|
||||
initCallbackData(allocD)
|
||||
defer: deinitCallbackData(allocD)
|
||||
|
||||
let npJson = ffiSerialize(NameParam(name: "test"))
|
||||
let allocRet = testlib_alloc_handle(ctx, testCallback, addr allocD, npJson.cstring)
|
||||
check allocRet == RET_OK
|
||||
|
||||
waitCallback(allocD)
|
||||
check allocD.retCode == RET_OK
|
||||
|
||||
let handleAddrStr = callbackMsg(allocD)
|
||||
check handleAddrStr.len > 0
|
||||
let handleAddr = parseBiggestUInt(handleAddrStr)
|
||||
check handleAddr != 0
|
||||
|
||||
# Read the handle back
|
||||
var readD: CallbackData
|
||||
initCallbackData(readD)
|
||||
defer: deinitCallbackData(readD)
|
||||
|
||||
let handleJson = ffiSerialize(cast[pointer](handleAddr))
|
||||
let readRet = testlib_read_handle(ctx, testCallback, addr readD, handleJson.cstring)
|
||||
check readRet == RET_OK
|
||||
|
||||
waitCallback(readD)
|
||||
check readD.retCode == RET_OK
|
||||
|
||||
let readMsg = callbackMsg(readD)
|
||||
let decodedStr = ffiDeserialize(readMsg.cstring, string).valueOr:
|
||||
check false
|
||||
""
|
||||
check decodedStr == "test:5"
|
||||
|
||||
# Free the handle
|
||||
var freeD: CallbackData
|
||||
initCallbackData(freeD)
|
||||
defer: deinitCallbackData(freeD)
|
||||
|
||||
let freeRet = testlib_free_handle(ctx, testCallback, addr freeD, handleJson.cstring)
|
||||
check freeRet == RET_OK
|
||||
|
||||
waitCallback(freeD)
|
||||
check freeD.retCode == RET_OK
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# releaseFFIContext: park & reuse (fd-leak regression)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
proc countOpenFds(): int =
|
||||
## Number of open fds for this process, or -1 if not determinable on this
|
||||
## platform. On Linux we count /proc/self/fd; elsewhere we shell out to lsof
|
||||
## (skipped if lsof is unavailable, e.g. Windows).
|
||||
when defined(linux):
|
||||
var n = 0
|
||||
for _ in walkDir("/proc/self/fd"):
|
||||
inc n
|
||||
return n
|
||||
else:
|
||||
if findExe("lsof").len == 0:
|
||||
return -1
|
||||
try:
|
||||
let output =
|
||||
execProcess("lsof", args = ["-p", $getCurrentProcessId()], options = {poUsePath})
|
||||
return output.splitLines().countIt(it.len > 0)
|
||||
except CatchableError:
|
||||
return -1
|
||||
|
||||
proc releaseAndWait[T](ctx: ptr FFIContext[T]): cint =
|
||||
## Test helper mirroring how a C consumer destroys a context: kick off the
|
||||
## (non-blocking) teardown and block on the callback, returning its retCode.
|
||||
## RET_OK means the lib's in-flight tasks finished and the context was parked.
|
||||
var d: CallbackData
|
||||
initCallbackData(d)
|
||||
defer:
|
||||
deinitCallbackData(d)
|
||||
if ctx.releaseFFIContext(testCallback, addr d).isErr():
|
||||
return RET_ERR
|
||||
waitCallback(d)
|
||||
return d.retCode
|
||||
|
||||
suite "releaseFFIContext (park & reuse)":
|
||||
test "park returns the context and reuses the same live worker":
|
||||
var pool: FFIContextPool[TestLib]
|
||||
let ctx1 = pool.createFFIContext().valueOr:
|
||||
check false
|
||||
return
|
||||
check ctx1.releaseAndWait() == RET_OK
|
||||
|
||||
# Reacquire: must be the same context, with its worker still running.
|
||||
let ctx2 = pool.createFFIContext().valueOr:
|
||||
check false
|
||||
return
|
||||
check ctx1 == ctx2
|
||||
|
||||
var d: CallbackData
|
||||
initCallbackData(d)
|
||||
defer:
|
||||
deinitCallbackData(d)
|
||||
check sendRequestToFFIThread(
|
||||
ctx2, PingRequest.ffiNewReq(testCallback, addr d, "reuse".cstring)
|
||||
).isOk()
|
||||
waitCallback(d)
|
||||
check d.retCode == RET_OK
|
||||
check callbackMsg(d) == "pong:reuse" # reused worker still processes requests
|
||||
|
||||
check pool.destroyFFIContext(ctx2).isOk()
|
||||
|
||||
test "park drops the stale event callback and library pointer":
|
||||
var pool: FFIContextPool[TestLib]
|
||||
let ctx = pool.createFFIContext().valueOr:
|
||||
check false
|
||||
return
|
||||
ctx.callbackState.callback = cast[pointer](testCallback)
|
||||
ctx.callbackState.userData = cast[pointer](0xDEAD)
|
||||
|
||||
check ctx.releaseAndWait() == RET_OK
|
||||
check ctx.callbackState.callback.isNil() # a watchdog tick can't call a freed cb
|
||||
check ctx.callbackState.userData.isNil()
|
||||
check ctx.myLib.isNil()
|
||||
|
||||
check pool.destroyFFIContext(ctx).isOk()
|
||||
|
||||
test "fd usage stays bounded across many park/reuse cycles":
|
||||
if countOpenFds() < 0:
|
||||
skip() # no fd-counting facility on this platform
|
||||
else:
|
||||
var pool: FFIContextPool[TestLib]
|
||||
|
||||
# Warm up: the first create builds the context's worker (its fds are allocated
|
||||
# once here); parking keeps them open for reuse.
|
||||
block:
|
||||
let ctx = pool.createFFIContext().valueOr:
|
||||
check false
|
||||
return
|
||||
check ctx.releaseAndWait() == RET_OK
|
||||
|
||||
let baseline = countOpenFds()
|
||||
|
||||
for _ in 0 ..< 20:
|
||||
let ctx = pool.createFFIContext().valueOr:
|
||||
check false
|
||||
return
|
||||
var d: CallbackData
|
||||
initCallbackData(d)
|
||||
check sendRequestToFFIThread(
|
||||
ctx, PingRequest.ffiNewReq(testCallback, addr d, "x".cstring)
|
||||
).isOk()
|
||||
waitCallback(d)
|
||||
deinitCallbackData(d)
|
||||
check ctx.releaseAndWait() == RET_OK
|
||||
|
||||
let afterCycles = countOpenFds()
|
||||
# Reuse must not grow fds. Before the fix each cycle leaked ~10 fds (4
|
||||
# ThreadSignalPtr socketpairs + 2 dispatcher kqueues); the small slack
|
||||
# only tolerates unrelated runtime fd noise, not a per-cycle leak.
|
||||
check afterCycles <= baseline + 5
|
||||
|
||||
# Tear the (still parked) context's worker down so the test leaves no threads.
|
||||
let last = pool.createFFIContext().valueOr:
|
||||
check false
|
||||
return
|
||||
check pool.destroyFFIContext(last).isOk()
|
||||
Loading…
x
Reference in New Issue
Block a user