Fix flaky portal tests - part II (#2829)

* Remove two simple sleeps from history network test

Replace simple sleeps in test_history_network with retries +
sleeps. The current fixed-sleep settings would occasionally still
cause failures on CI.

* Increase retries on test_portal_testnet from 2 to 3.

* Add 1 second sleep after headers with proof get gossiped
This commit is contained in:
Kim De Mey 2024-11-06 11:10:31 +01:00 committed by GitHub
parent 6374bfb39c
commit cff7091826
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 33 additions and 18 deletions

View File

@ -14,7 +14,7 @@ import
chronos,
stew/byteutils,
eth/p2p/discoveryv5/random2,
eth/keys,
eth/common/keys,
../common/common_types,
../rpc/portal_rpc_client,
../rpc/eth_rpc_client,
@ -79,7 +79,8 @@ proc withRetries[A](
if tries > numRetries:
# if we reached max number of retries fail
let msg =
"Call failed with msg: " & exc.msg & ", for node with idx: " & $nodeIdx
"Call failed with msg: " & exc.msg & ", for node with idx: " & $nodeIdx &
", after " & $tries & " tries."
raise newException(ValueError, msg)
inc tries
@ -94,7 +95,7 @@ proc retryUntil[A](
f: FutureCallback[A], c: CheckCallback[A], checkFailMessage: string, nodeIdx: int
): Future[A] =
# some reasonable limits, which will cause waits as: 1, 2, 4, 8, 16, 32 seconds
return withRetries(f, c, 2, seconds(1), checkFailMessage, nodeIdx)
return withRetries(f, c, 3, seconds(1), checkFailMessage, nodeIdx)
# Note:
# When doing json-rpc requests following `RpcPostError` can occur:
@ -261,9 +262,20 @@ procSuite "Portal testnet tests":
# Gossiping all block headers with proof first, as bodies and receipts
# require them for validation.
for (content, contentKey) in blockHeadersWithProof:
discard
(await clients[0].portal_historyGossip(content.toHex(), contentKey.toHex()))
for (contentKey, contentValue) in blockHeadersWithProof:
discard (
await clients[0].portal_historyGossip(contentKey.toHex(), contentValue.toHex())
)
# TODO: Fix iteration order: Because the blockData gets parsed into a
# BlockDataTable, iterating over this results in gossiping the block bodies
# and receipts of blocks in a different order than the headers.
# Because of this, block bodies and receipts for block
# 0x6251d65b8a8668efabe2f89c96a5b6332d83b3bbe585089ea6b2ab9b6754f5e9
# come right after the headers with proof. This is likely to cause validation
# failures on the nodes, as the block bodies and receipts require the header
# to get validated.
await sleepAsync(seconds(1))
# Gossiping all block bodies and receipts.
for b in blocks(blockData, false):

View File

@ -54,7 +54,19 @@ proc stop(hn: HistoryNode) {.async.} =
await hn.discoveryProtocol.closeWait()
proc containsId(hn: HistoryNode, contentId: ContentId): bool =
return hn.historyNetwork.contentDB.contains(contentId)
hn.historyNetwork.contentDB.contains(contentId)
proc checkContainsIdWithRetry(
historyNode: HistoryNode, id: ContentId
) {.async: (raises: [CancelledError]).} =
## Polls `historyNode.containsId(id)` up to 51 times, sleeping 10 ms
## between attempts (~500 ms worst case), then asserts via unittest
## `check` that the content id was found. Replaces fixed sleeps that
## were flaky on CI: the content may not yet be validated and stored
## when the caller observes an empty content queue.
var res = false
for i in 0 .. 50:
res = historyNode.containsId(id)
if res:
break
# Not stored yet — yield so the async validation/store loop can run.
await sleepAsync(10.milliseconds)
check res
proc createEmptyHeaders(fromNum: int, toNum: int): seq[Header] =
var headers: seq[Header]
@ -216,17 +228,10 @@ procSuite "History Content Network":
while not historyNode2.historyNetwork.contentQueue.empty():
await sleepAsync(1.milliseconds)
# Note: It seems something changed in chronos, causing different behavior.
# Seems that validateContent called through processContentLoop used to
# run immediately in case of a "non async shortpath". This is no longer the
# case and causes the content not yet to be validated and thus stored at
# this step. Add an await here so that the store can happen.
await sleepAsync(100.milliseconds)
for i, contentKV in contentKVs:
let id = toContentId(contentKV.contentKey)
if i < len(contentKVs) - 1:
check historyNode2.containsId(id) == true
await historyNode2.checkContainsIdWithRetry(id)
else:
check historyNode2.containsId(id) == false
@ -283,11 +288,9 @@ procSuite "History Content Network":
while not historyNode2.historyNetwork.contentQueue.empty():
await sleepAsync(1.milliseconds)
await sleepAsync(100.milliseconds)
for contentKV in contentKVs:
let id = toContentId(contentKV.contentKey)
check historyNode2.containsId(id) == true
await historyNode2.checkContainsIdWithRetry(id)
await historyNode1.stop()
await historyNode2.stop()