fix: foreign-host (Go) concurrency crashes (#83)

This commit is contained in:
Ivan FB 2026-06-18 21:44:06 +02:00 committed by GitHub
parent e86b136e79
commit 0a6bf6c49f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 54 additions and 13 deletions

View File

@ -75,7 +75,7 @@ jobs:
if [ "$RUNNER_OS" == "Windows" ]; then
export PATH="$GITHUB_WORKSPACE/.nim_runtime/bin:$HOME/.nimble/bin:$PATH"
fi
nim c -r --mm:${{ matrix.mm }} -d:chronicles_log_level=WARN tests/test_alloc.nim
nim c -r --mm:${{ matrix.mm }} -d:chronicles_log_level=WARN -d:ffiAllowSignalHandler tests/test_alloc.nim
- name: Run FFI context tests (${{ matrix.mm }})
shell: bash
@ -83,7 +83,7 @@ jobs:
if [ "$RUNNER_OS" == "Windows" ]; then
export PATH="$GITHUB_WORKSPACE/.nim_runtime/bin:$HOME/.nimble/bin:$PATH"
fi
nim c -r --mm:${{ matrix.mm }} -d:chronicles_log_level=WARN tests/test_ffi_context.nim
nim c -r --mm:${{ matrix.mm }} -d:chronicles_log_level=WARN -d:ffiAllowSignalHandler tests/test_ffi_context.nim
- name: Run GC compatibility tests (${{ matrix.mm }})
shell: bash
@ -91,4 +91,4 @@ jobs:
if [ "$RUNNER_OS" == "Windows" ]; then
export PATH="$GITHUB_WORKSPACE/.nim_runtime/bin:$HOME/.nimble/bin:$PATH"
fi
nim c -r --mm:${{ matrix.mm }} -d:chronicles_log_level=WARN tests/test_gc_compat.nim
nim c -r --mm:${{ matrix.mm }} -d:chronicles_log_level=WARN -d:ffiAllowSignalHandler tests/test_gc_compat.nim

View File

@ -38,6 +38,11 @@ type CtxLifecycle {.pure.} = enum
type FFIContext*[T] = object
myLib*: ptr T
# main library object (e.g., Waku, LibP2P, SDS, the one to be exposed as a library)
myLibRefd: bool
# refc only: whether we hold a GC_ref on myLib[]. myLib lives in non-GC
# shared memory, so a GC-managed lib object stored there is invisible to
# refc's cycle collector and gets reclaimed mid-use under sustained load.
# Pinned in processRequest, released in freeLib. Unused under orc/arc.
ffiThread: Thread[(ptr FFIContext[T])]
# represents the main FFI thread in charge of attending API consumer actions
watchdogThread: Thread[(ptr FFIContext[T])]
@ -243,6 +248,15 @@ proc processRequest[T](
"Async error in processRequest for " & reqId & ": " & exc.msg
)
# Pin the lib object as a GC root once a handler has installed it. Under refc
# myLib lives in non-GC shared memory, invisible to the cycle collector, so it
# gets reclaimed mid-operation under sustained load. orc tracks it precisely.
when defined(gcRefc):
when T is ref:
if not ctx.myLibRefd and not ctx.myLib.isNil():
GC_ref(ctx.myLib[])
ctx.myLibRefd = true
## handleRes may raise (OOM, GC setup) even though it is rare.
try:
handleRes(res, request)
@ -254,10 +268,16 @@ proc freeLib[T](ctx: ptr FFIContext[T]) {.gcsafe.} =
return
when not defined(gcRefc):
{.cast(gcsafe).}:
`=destroy`(ctx.myLib[])
try:
{.cast(gcsafe).}:
`=destroy`(ctx.myLib[])
except Exception:
discard
else:
discard
when T is ref:
if ctx.myLibRefd:
GC_unref(ctx.myLib[])
ctx.myLibRefd = false
freeShared(ctx.myLib)
ctx.myLib = nil

View File

@ -53,10 +53,13 @@ macro declareLibraryBase*(libraryName: static[string]): untyped =
)
res.add(procDef)
# Create: var initialized: Atomic[bool]
# Create: var initialized, initDone: Atomic[bool]
# `initialized` elects the single thread that runs NimMain; `initDone` signals
# that NimMain has finished so concurrent first-time callers can safely proceed.
let atomicType = nnkBracketExpr.newTree(ident("Atomic"), ident("bool"))
let varStmt = nnkVarSection.newTree(
nnkIdentDefs.newTree(ident("initialized"), atomicType, newEmptyNode())
nnkIdentDefs.newTree(ident("initialized"), atomicType, newEmptyNode()),
nnkIdentDefs.newTree(ident("initDone"), atomicType, newEmptyNode()),
)
res.add(varStmt)
@ -80,6 +83,13 @@ macro declareLibraryBase*(libraryName: static[string]): untyped =
## Being `<yourprefix>` the value given in the optional
## compilation flag --nimMainPrefix:yourprefix
`nimMainName`()
initDone.store(true)
else:
## Another thread won the election and is running (or has run) NimMain.
## Block until it finishes: proceeding now would race a half-initialized
## Nim runtime/GC and corrupt the heap on concurrent first-time calls.
while not initDone.load():
discard
when declared(setupForeignThreadGc):
setupForeignThreadGc()
when declared(nimGC_setStackBottom):

View File

@ -234,12 +234,18 @@ proc buildFfiNewReqProc(reqTypeName, body: NimNode): NimNode =
`reqObjIdent`[].`fieldNameIdent` = `fieldNameIdent`
)
# FFIThreadRequest.init using fnv1aHash32
# The request id is the type name. Use the compile-time string literal (same
# value as `$T`, and as the registry key in addNewRequestToRegistry) rather
# than `$T`, which allocates a GC string on the *calling* thread — under a
# foreign host (Go) those are transient/concurrent OS threads where running
# Nim's GC corrupts memory. `cstring(<literal>)` points at static data.
let reqNameLit = newLit(
if reqTypeName.kind == nnkPostfix: $reqTypeName[1] else: $reqTypeName
)
newBody.add(
quote do:
let typeStr = $T
var ret =
FFIThreadRequest.init(callback, userData, typeStr.cstring, `reqObjIdent`)
FFIThreadRequest.init(callback, userData, cstring(`reqNameLit`), `reqObjIdent`)
proc destroyContent(content: pointer) {.nimcall.} =
ffiDeleteReq(cast[ptr `reqTypeName`](content))
@ -1113,9 +1119,14 @@ proc buildCtorFfiNewReqProc(reqTypeName: NimNode, paramNames: seq[string]): NimN
newBody.add quote do:
`reqObjIdent`[].`fieldName` = `fieldName`.alloc()
# Compile-time type-name literal instead of `$T` — `$T` allocates a GC string
# on the calling thread, which corrupts memory under a foreign host (Go) whose
# callers are transient/concurrent OS threads. See buildFfiNewReqProc.
let reqNameLit = newLit(
if reqTypeName.kind == nnkPostfix: $reqTypeName[1] else: $reqTypeName
)
newBody.add quote do:
let typeStr = $T
var ret = FFIThreadRequest.init(callback, userData, typeStr.cstring, `reqObjIdent`)
var ret = FFIThreadRequest.init(callback, userData, cstring(`reqNameLit`), `reqObjIdent`)
proc destroyContent(content: pointer) {.nimcall.} =
ffiDeleteReq(cast[ptr `reqTypeName`](content))