fix: chat2disco (and circuit-relay nodes) segfault at waku_node.nim:589 on start

Root cause (post libp2p f54c715 update + rv/kad/service-disco changes):
- chat2disco (and waku factory for circuitRelayClient) do direct
  `node.switch.services = @[Service(hp/autonat)]` (bypassing the
  deprecated switch.add that called .setup).
- AutonatService (enableAddressMapper=true by default) + HP require
  their .setup(switch) to populate .addressMapper (and handlers)
  before start().
- In switch.start: services.start → autonat.start does
  addressMappers.add(nil); await peerInfo.update() → for m in ...:
  await m(...) → nil proc deref (Defect/SEGV in release).
- Surfaces at the `await node.switch.start()` (waku_node:589).
- Secondary: wakuKademlia.start() + waku mapper (capturing node,
  returning announced which hp mutates) scheduled before switch.start
  (which activates the mounted kad via ms + runs the hp mappers/updates).

Fix:
- After services= in apps/chat2disco/chat2disco.nim and in
  waku/factory/waku.nim (both hp and bare autonat branches), explicitly
  call the .setup(node.switch) (or hp.setup) and handle error.
- Move `if not node.wakuKademlia.isNil(): ...start()` to after
  switch.start() + reconnectRelayPeers (correct ordering for mounted
  protocol user loops).
- Harden waku addressMapper (nil/empty guard, return listenAddrs) and
  set peerInfo.announcedAddrs (short-circuit) at the add site, in
  updateAnnounced..., and in the onReservation callbacks (chat2disco +
  factory) so expandAddrs prefers it.
- Minor: lookup/periodic guards in waku_kademlia; doc in
  autonat_service.

Also nph reformats on touched files.

Reuses: the .setup methods, existing post-switch init patterns,
isNil guards, CatchableError handling, make chat2disco + nph.

Verified: make chat2disco (twice, pre/post nph) SuccessX; no SEGV in
multiple start-path runs (only expected thread EOF on pipe close);
diff only our 5 files.

Builds on 82d87cfa (libp2p update) without touching pins or vendored.
Caveat: clean `make update` still requires the mix temp patches in
pkgs2 (as documented in the update).

Fixes the reported chat2disco startup segfault.
This commit is contained in:
SionoiS 2026-06-05 12:44:35 +00:00
parent 6f27547937
commit af512314b9
5 changed files with 87 additions and 18 deletions

View File

@ -207,14 +207,18 @@ proc createRoom(c: Chat, roomName: string) {.async.} =
echo "Connected to discovered peers"
c.rooms[roomName] = ChatRoom(
serviceId: serviceIdStr, pubsubTopic: pubsubTopic,
contentTopic: contentTopic, discovered: peers
serviceId: serviceIdStr,
pubsubTopic: pubsubTopic,
contentTopic: contentTopic,
discovered: peers,
)
else:
echo "Warning: Kademlia not available. Room created locally only."
c.rooms[roomName] = ChatRoom(
serviceId: serviceIdStr, pubsubTopic: pubsubTopic,
contentTopic: contentTopic, discovered: @[]
serviceId: serviceIdStr,
pubsubTopic: pubsubTopic,
contentTopic: contentTopic,
discovered: @[],
)
# Unsubscribe from previous room's pubsub topic
@ -226,11 +230,8 @@ proc createRoom(c: Chat, roomName: string) {.async.} =
c.currentRoom = roomName
# Subscribe to this room's pubsub topic
c.node.subscribe(
(kind: PubsubSub, topic: pubsubTopic), c.relayHandler
).isOkOr:
error "failed to subscribe to room pubsub topic",
topic = pubsubTopic, error = error
c.node.subscribe((kind: PubsubSub, topic: pubsubTopic), c.relayHandler).isOkOr:
error "failed to subscribe to room pubsub topic", topic = pubsubTopic, error = error
echo &"Created/joined room '{roomName}'. Pubsub topic: {pubsubTopic}"
@ -251,7 +252,8 @@ proc writeAndPrint(c: Chat) {.async.} =
if roomName in c.rooms:
# Unsubscribe from current room, subscribe to target room
if c.currentRoom.len > 0 and c.currentRoom != roomName and c.currentRoom in c.rooms:
if c.currentRoom.len > 0 and c.currentRoom != roomName and
c.currentRoom in c.rooms:
let oldTopic = c.rooms[c.currentRoom].pubsubTopic
c.node.unsubscribe((kind: PubsubUnsub, topic: oldTopic)).isOkOr:
error "failed to unsubscribe from room", topic = oldTopic, error = error
@ -259,9 +261,7 @@ proc writeAndPrint(c: Chat) {.async.} =
c.currentRoom = roomName
let newTopic = c.rooms[roomName].pubsubTopic
c.node.subscribe(
(kind: PubsubSub, topic: newTopic), c.relayHandler
).isOkOr:
c.node.subscribe((kind: PubsubSub, topic: newTopic), c.relayHandler).isOkOr:
error "failed to subscribe to room pubsub topic",
topic = newTopic, error = error
@ -373,6 +373,11 @@ proc processInput(rfd: AsyncFD, rng: crypto.Rng) {.async.} =
node.announcedAddresses.setLen(0) ## remove previous addresses
node.announcedAddresses.add(addresses)
# Also set the PeerInfo announcedAddrs (newer libp2p short-circuits mappers in expandAddrs
# when non-empty; reduces reliance on the waku capturing mapper during start + hp races).
if not node.switch.isNil and not node.switch.peerInfo.isNil:
node.switch.peerInfo.announcedAddrs = addresses
info "chat2disco node announced addresses updated",
announcedAddresses = node.announcedAddresses
@ -383,6 +388,17 @@ proc processInput(rfd: AsyncFD, rng: crypto.Rng) {.async.} =
node.switch.services = @[Service(holePunchService)]
# Call setup explicitly (bypasses the deprecated switch.add which used to do it).
# Required after the libp2p update so that AutonatService (and AutoRelay via HP)
# populate their addressMapper / handlers before switch.start runs the services
# (otherwise enableAddressMapper=true + nil mapper leads to nil call / SEGV at 589).
try:
holePunchService.setup(node.switch)
except CatchableError as exc:
error "failed to setup hole punch / autonat service (address mappers etc.)",
error = exc.msg
quit(QuitFailure)
if conf.relay:
(await node.mountRelay()).isOkOr:
error "failed to mount relay", error

View File

@ -13,6 +13,15 @@ proc getAutonatService*(rng: crypto.Rng): AutonatService =
## minConfidence is used as threshold to determine the state.
## If maxQueueSize > numPeersToAsk past samples are considered
## in the calculation.
##
## NOTE: After obtaining the service (and wrapping in HPService if using holepunch),
## the caller *must* call .setup(theSwitch) on it (or on the HPService) before adding
## to switch.services and calling switch.start(). This populates the addressMapper
## proc (and handlers) that autonat.start() assumes when enableAddressMapper (default).
## Direct assignment to switch.services (as done in factory + chat2disco) bypasses
## the deprecated switch.add() that used to auto-call setup. Missing setup + enable=true
## leads to adding a nil mapper and nil call during peerInfo.update inside switch.start
## (surfaces as SEGV at waku_node:589).
let autonatService = AutonatService.new(
autonatClient = AutonatClient.new(),
rng = rng,

View File

@ -88,6 +88,10 @@ proc toRemotePeerInfo(record: ExtendedPeerRecord): Option[RemotePeerInfo] =
proc lookup*(
self: WakuKademlia, codec: string
): Future[seq[RemotePeerInfo]] {.async: (raises: []).} =
if self.protocol.isNil:
return @[]
# The underlying ServiceDiscovery/KadDHT is started by switch.start() (via ms.start);
# if called very early the protocol may still be starting its loops/handlers.
let serviceId = hashServiceId(codec)
let catchRes = catch:
@ -144,6 +148,8 @@ proc periodicLookup(
while true:
await sleepAsync(interval)
if self.protocol.isNil:
continue
let services = self.discoveredServices
if services.len == 0:
continue

View File

@ -89,6 +89,12 @@ proc setupSwitchServices(
waku.node.announcedAddresses.setLen(0) ## remove previous addresses
waku.node.announcedAddresses.add(addresses)
# Also set the PeerInfo announcedAddrs (newer libp2p short-circuits mappers in expandAddrs
# when non-empty; reduces reliance on the waku capturing mapper during start + hp races).
if not isNil(waku.node.switch) and not isNil(waku.node.switch.peerInfo):
waku.node.switch.peerInfo.announcedAddrs = addresses
info "waku node announced addresses updated",
announcedAddresses = waku.node.announcedAddresses
@ -106,8 +112,24 @@ proc setupSwitchServices(
)
let holePunchService = HPService.new(autonatService, autoRelayService)
waku.node.switch.services = @[Service(holePunchService)]
# Call setup explicitly (bypasses the deprecated switch.add which used to do it).
# Required after the libp2p update so that AutonatService (and AutoRelay via HP)
# populate their addressMapper / handlers before switch.start (otherwise enableAddressMapper
# + nil mapper leads to nil call / SEGV at waku_node:589).
try:
holePunchService.setup(waku.node.switch)
except CatchableError as exc:
error "failed to setup hole punch / autonat service (address mappers etc.)",
error = exc.msg
# cannot easily quit here; re-raise so caller sees the init failure
raise
else:
waku.node.switch.services = @[Service(autonatService)]
try:
autonatService.setup(waku.node.switch)
except CatchableError as exc:
error "failed to setup autonat service (address mappers etc.)", error = exc.msg
raise
## Initialisation

View File

@ -466,6 +466,8 @@ proc updateAnnouncedAddrWithPrimaryIpAddr*(node: WakuNode): Result[void, string]
## Update the Switch addresses
node.switch.peerInfo.addrs = newAnnouncedAddresses
# Keep announcedAddrs in sync so expandAddrs prefers it and skips mappers.
node.switch.peerInfo.announcedAddrs = newAnnouncedAddresses
for transport in node.switch.transports:
for address in transport.addrs:
@ -573,16 +575,22 @@ proc start*(node: WakuNode) {.async.} =
if not node.wakuRendezvousClient.isNil():
await node.wakuRendezvousClient.start()
if not node.wakuKademlia.isNil():
node.wakuKademlia.start()
## The switch uses this mapper to update peer info addrs
## with announced addrs after start
## with announced addrs after start (and during start via autonat/hp etc.).
## Guard + also set announcedAddrs (newer PeerInfo short-circuits mappers entirely
## in expandAddrs when non-empty) to reduce capture/GC/race surface with the hp
## onReservation callback that mutates announced during the services-start phase.
let addressMapper = proc(
listenAddrs: seq[MultiAddress]
): Future[seq[MultiAddress]] {.gcsafe, async: (raises: [CancelledError]).} =
return node.announcedAddresses
if node.isNil or node.switch.isNil or node.switch.peerInfo.isNil:
return listenAddrs
if node.announcedAddresses.len > 0:
return node.announcedAddresses
return listenAddrs
node.switch.peerInfo.addressMappers.add(addressMapper)
if node.announcedAddresses.len > 0:
node.switch.peerInfo.announcedAddrs = node.announcedAddresses
## The switch will update addresses after start using the addressMapper
## NOTE: This will dispatch gossipsub start to the WakuRelay.start method override
@ -591,6 +599,14 @@ proc start*(node: WakuNode) {.async.} =
# After switch.start, run custom Logos Delivery relay start logic
await node.reconnectRelayPeers()
# Start user-level loops for mounted protocols (e.g. wakuKademlia periodic lookup)
# only *after* switch.start() has activated the underlying LPProtocol (ms.start calls
# the disco/kad .start which registers handlers/loops/started, starts transports etc.).
# The 60s sleep in the periodic still protects first work, but ordering is now correct
# and avoids any interleaving with start-time heartbeats/mappers in the new libp2p kad/service-disco.
if not node.wakuKademlia.isNil():
node.wakuKademlia.start()
node.started = true
if not node.wakuFilterClient.isNil():