changing async logic for nodes (multi) response

The previous version was based on the assumption that
that the future of waitMessage is triggered in the
event loop before the next message arrives. This seems
to be true with UDP, where a full cycle of the loop
handles only one datagram, but not true with other
transports, such as the in-memory emulation. This
led to the following behavior:
- 1st response arrives, reqId removed, waitMessage called
- 2nd ... last messages arrive, can't find reqId, so dropped
- waitMessage competes, re-adding reqId, but too late

The new version solves this by collecting SPRs explicitly

Signed-off-by: Csaba Kiraly <csaba.kiraly@gmail.com>
This commit is contained in:
Csaba Kiraly 2023-06-02 22:17:04 +02:00
parent 50dd3cd7b3
commit d496a47394
No known key found for this signature in database
GPG Key ID: 0FE274EE8C95166E

View File

@ -162,6 +162,7 @@ type
transport*: Transport[Protocol] # exported for tests
routingTable*: RoutingTable
awaitedMessages: Table[(NodeId, RequestId), Future[Option[Message]]]
awaitedNodesMessages: Table[(NodeId, RequestId), (Future[Result[seq[SignedPeerRecord],cstring]], uint32, seq[SignedPeerRecord])]
refreshLoop: Future[void]
revalidateLoop: Future[void]
ipMajorityLoop: Future[void]
@ -424,6 +425,26 @@ proc handleMessage(d: Protocol, srcId: NodeId, fromAddr: Address,
discovery_message_requests_incoming.inc(labelValues = ["no_response"])
trace "Received unimplemented message kind", kind = message.kind,
origin = fromAddr
of nodes:
trace "node-response message received"
var sprs = message.nodes.sprs
let total = message.nodes.total
trace "waiting for more nodes messages", me=d.localNode, srcId, total
try:
var (waiter, cnt, s) = d.awaitedNodesMessages[(srcId, message.reqId)]
cnt += 1
s.add(sprs)
d.awaitedNodesMessages[(srcId, message.reqId)] = (waiter, cnt, s)
trace "nodes collected", me=d.localNode, srcId, cnt, s
if cnt == total:
d.awaitedNodesMessages.del((srcId, message.reqId))
trace "all nodes responses received", me=d.localNode, srcId
waiter.complete(DiscResult[seq[SignedPeerRecord]].ok(s))
except KeyError:
discovery_unsolicited_messages.inc()
warn "Timed out or unrequested message", kind = message.kind,
origin = fromAddr
else:
var waiter: Future[Option[Message]]
if d.awaitedMessages.take((srcId, message.reqId), waiter):
@ -495,33 +516,24 @@ proc waitNodeResponses*[T: SomeMessage](d: Protocol, node: Node, msg: T):
result = d.waitNodes(node, reqId)
sendRequest(d, node, msg, reqId)
proc waitNodes(d: Protocol, fromNode: Node, reqId: RequestId):
Future[DiscResult[seq[SignedPeerRecord]]] {.async.} =
proc waitNodes(d: Protocol, fromNode: Node, reqId: RequestId, timeout = ResponseTimeout):
Future[DiscResult[seq[SignedPeerRecord]]] =
## Wait for one or more nodes replies.
##
## The first reply will hold the total number of replies expected, and based
## on that, more replies will be awaited.
## If one reply is lost here (timed out), others are ignored too.
## Same counts for out of order receival.
var op = await d.waitMessage(fromNode, reqId)
if op.isSome:
if op.get.kind == MessageKind.nodes:
var res = op.get.nodes.sprs
let total = op.get.nodes.total
for i in 1 ..< total:
op = await d.waitMessage(fromNode, reqId)
if op.isSome and op.get.kind == MessageKind.nodes:
res.add(op.get.nodes.sprs)
else:
# No error on this as we received some nodes.
break
return ok(res)
else:
discovery_message_requests_outgoing.inc(labelValues = ["invalid_response"])
return err("Invalid response to find node message")
else:
discovery_message_requests_outgoing.inc(labelValues = ["no_response"])
return err("Nodes message not received in time")
## TODO: these are VERY optimistic assumptions here. We need a short timeout if we collect
result = newFuture[DiscResult[seq[SignedPeerRecord]]]("waitNodesMessages")
let res = result
let key = (fromNode.id, reqId)
sleepAsync(timeout).addCallback() do(data: pointer):
d.awaitedNodesMessages.del(key)
if not res.finished:
res.complete(DiscResult[seq[SignedPeerRecord]].err("waitNodeMessages timed out"))
d.awaitedNodesMessages[key] = (result, 0.uint32, newSeq[SignedPeerRecord]())
proc ping*(d: Protocol, toNode: Node):
Future[DiscResult[PongMessage]] {.async.} =