Fix testnet tests (#1176)

* Testnet improvements

Increase timeout for reading
Add more logs
Offer endpoint can fail due to talkReq timeout, to avoid
test failure, retry it few times until success.
This commit is contained in:
KonradStaniec 2022-07-29 14:24:07 +02:00 committed by GitHub
parent e3cabaff7f
commit a1253c67bd
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 76 additions and 38 deletions

View File

@ -280,21 +280,21 @@ proc getBlockHeader*(
let headerFromDb = n.getContentFromDb(BlockHeader, contentId)
if headerFromDb.isSome():
info "Fetched block header from database", hash
info "Fetched block header from database", hash, contentKey = keyEncoded
return headerFromDb
for i in 0..<requestRetries:
let headerContentLookup =
await n.portalProtocol.contentLookup(keyEncoded, contentId)
if headerContentLookup.isNone():
warn "Failed fetching block header from the network", hash
warn "Failed fetching block header from the network", hash, contentKey = keyEncoded
return none(BlockHeader)
let headerContent = headerContentLookup.unsafeGet()
let res = validateBlockHeaderBytes(headerContent.content, hash)
if res.isOk():
info "Fetched block header from the network", hash
info "Fetched block header from the network", hash, contentKey = keyEncoded
# Content is valid we can propagate it to interested peers
n.portalProtocol.triggerPoke(
headerContent.nodesInterestedInContent,
@ -306,7 +306,7 @@ proc getBlockHeader*(
return some(res.get())
else:
warn "Validation of block header failed", err = res.error, hash
warn "Validation of block header failed", err = res.error, hash, contentKey = keyEncoded
# Headers were requested `requestRetries` times and all failed on validation
return none(BlockHeader)
@ -324,7 +324,7 @@ proc getBlockBody*(
bodyFromDb = n.getContentFromDb(BlockBody, contentId)
if bodyFromDb.isSome():
info "Fetched block body from database", hash
info "Fetched block body from database", hash, contentKey = keyEncoded
return bodyFromDb
for i in 0..<requestRetries:
@ -332,7 +332,7 @@ proc getBlockBody*(
await n.portalProtocol.contentLookup(keyEncoded, contentId)
if bodyContentLookup.isNone():
warn "Failed fetching block body from the network", hash
warn "Failed fetching block body from the network", hash, contentKey = keyEncoded
return none(BlockBody)
let bodyContent = bodyContentLookup.unsafeGet()
@ -340,7 +340,7 @@ proc getBlockBody*(
let res = validateBlockBodyBytes(
bodyContent.content, header.txRoot, header.ommersHash)
if res.isOk():
info "Fetched block body from the network", hash
info "Fetched block body from the network", hash, contentKey = keyEncoded
# body is valid, propagate it to interested peers
n.portalProtocol.triggerPoke(
@ -353,15 +353,18 @@ proc getBlockBody*(
return some(res.get())
else:
warn "Validation of block body failed", err = res.error, hash
warn "Validation of block body failed", err = res.error, hash, contentKey = keyEncoded
return none(BlockBody)
proc getBlock*(
n: HistoryNetwork, chainId: uint16, hash: BlockHash):
Future[Option[Block]] {.async.} =
debug "Trying to retrieve block with hash", hash
let headerOpt = await n.getBlockHeader(chainId, hash)
if headerOpt.isNone():
warn "Failed to get header when getting block with hash", hash
# Cannot validate block without header.
return none(Block)
@ -370,6 +373,7 @@ proc getBlock*(
let bodyOpt = await n.getBlockBody(chainId, hash, header)
if bodyOpt.isNone():
warn "Failed to get body when gettin block with hash", hash
return none(Block)
let body = bodyOpt.unsafeGet()
@ -397,14 +401,14 @@ proc getReceipts*(
let receiptsContentLookup =
await n.portalProtocol.contentLookup(keyEncoded, contentId)
if receiptsContentLookup.isNone():
warn "Failed fetching receipts from the network", hash
warn "Failed fetching receipts from the network", hash, contentKey = keyEncoded
return none(seq[Receipt])
let receiptsContent = receiptsContentLookup.unsafeGet()
let res = validateReceiptsBytes(receiptsContent.content, header.receiptRoot)
if res.isOk():
info "Fetched receipts from the network", hash
info "Fetched receipts from the network", hash, contentKey = keyEncoded
let receipts = res.get()
@ -419,7 +423,7 @@ proc getReceipts*(
return some(res.get())
else:
warn "Validation of receipts failed", err = res.error, hash
warn "Validation of receipts failed", err = res.error, hash, contentKey = keyEncoded
return none(seq[Receipt])

View File

@ -588,6 +588,7 @@ proc findNodes*(
proc findContent*(p: PortalProtocol, dst: Node, contentKey: ByteList):
Future[PortalResult[FoundContent]] {.async.} =
let contentMessageResponse = await p.findContentImpl(dst, contentKey)
if contentMessageResponse.isOk():
@ -672,6 +673,10 @@ proc findContent*(p: PortalProtocol, dst: Node, contentKey: ByteList):
return ok(FoundContent(src: dst, kind: Nodes, nodes: verifiedNodes))
else:
return err("Content message returned invalid ENRs")
else:
warn "FindContent failed due to find content request failure ", error = contentMessageResponse.error, contentKey = contentKey
return err("No content response")
proc getContentKeys(o: OfferRequest): ContentKeysList =
case o.kind
@ -703,6 +708,9 @@ proc offer(p: PortalProtocol, o: OfferRequest):
## by the cleanup process before it will be transferred, so this way does not
## guarantee content transfer.
let contentKeys = getContentKeys(o)
debug "Offering content", contentKeys = contentKeys
portal_content_keys_offered.observe(contentKeys.len().int64)
let acceptMessageResponse = await p.offerImpl(o.dst, contentKeys)
@ -726,6 +734,7 @@ proc offer(p: PortalProtocol, o: OfferRequest):
let acceptedKeysAmount = m.contentKeys.countOnes()
portal_content_keys_accepted.observe(acceptedKeysAmount.int64)
if acceptedKeysAmount == 0:
debug "No content acceppted", contentKeys = contentKeys
# Don't open an uTP stream if no content was requested
return ok()
@ -745,7 +754,7 @@ proc offer(p: PortalProtocol, o: OfferRequest):
if connectionResult.isErr():
debug "Utp connection error while trying to offer content",
error = connectionResult.error
error = connectionResult.error, contentKeys = contentKeys
return err("Error connecting uTP socket")
let socket = connectionResult.get()
@ -765,7 +774,7 @@ proc offer(p: PortalProtocol, o: OfferRequest):
let dataWritten = await socket.write(output.getOutput)
if dataWritten.isErr:
debug "Error writing requested data", error = dataWritten.error
debug "Error writing requested data", error = dataWritten.error, contentKeys = contentKeys
# No point in trying to continue writing data
socket.close()
return err("Error writing requested data")
@ -790,14 +799,17 @@ proc offer(p: PortalProtocol, o: OfferRequest):
let dataWritten = await socket.write(output.getOutput)
if dataWritten.isErr:
debug "Error writing requested data", error = dataWritten.error
debug "Error writing requested data", error = dataWritten.error, contentKeys = contentKeys
# No point in trying to continue writing data
socket.close()
return err("Error writing requested data")
debug "Content successfully offered", contentKeys = contentKeys
await socket.closeWait()
return ok()
else:
warn "Offer failed due to accept request failure ", error = acceptMessageResponse.error, contentKeys = contentKeys
return err("No accept response")
proc offer*(p: PortalProtocol, dst: Node, contentKeys: ContentKeysList):
@ -1096,7 +1108,6 @@ proc neighborhoodGossip*(
for i, contentItem in content:
let contentInfo =
ContentInfo(contentKey: contentKeys[i], content: contentItem)
discard contentList.add(contentInfo)
# Just taking the first content item as target id.
@ -1142,7 +1153,6 @@ proc neighborhoodGossip*(
else: # use looked up nodes for gossip
portal_gossip_with_lookup.inc(labelValues = [$p.protocolId])
let closestNodes = await p.lookup(NodeId(contentId))
for node in closestNodes[0..<min(closestNodes.len, maxGossipNodes)]:
# Note: opportunistically not checking if the radius of the node is known
# and thus if the node is in radius with the content. Reason is, these

View File

@ -24,7 +24,7 @@ logScope:
const
utpProtocolId* = "utp".toBytes()
defaultConnectionTimeout = 5.seconds
defaultContentReadTimeout = 2.seconds
defaultContentReadTimeout = 15.seconds
# TalkReq message is used as transport for uTP. It is assumed here that Portal
# protocol messages were exchanged before sending uTP over discv5 data. This
@ -102,6 +102,9 @@ proc addContentOffer*(
# uTP protocol uses BE for all values in the header, incl. connection id.
let id = uint16.fromBytesBE(connectionId)
debug "Register new incoming offer", contentKeys
let contentOffer = ContentOffer(
connectionId: id,
nodeId: nodeId,
@ -244,12 +247,12 @@ proc readContentOffer(
else:
# Invalid data, stop reading content, but still process data received
# so far.
debug "Reading content item failed, content offer failed"
debug "Reading content item failed, content offer failed", contentKeys = offer.contentKeys
break
else:
# Read timed out, stop further reading, but still process data received
# so far.
debug "Reading data from socket timed out, content offer failed"
debug "Reading data from socket timed out, content offer failed", contentKeys = offer.contentKeys
break
if socket.atEof():

View File

@ -9,6 +9,7 @@ import
os,
std/sequtils,
unittest2, testutils, confutils, chronos,
stew/byteutils,
eth/p2p/discoveryv5/random2, eth/keys,
../../nimbus/rpc/[hexstrings, rpc_types],
../rpc/portal_rpc_client,
@ -52,7 +53,8 @@ proc withRetries[A](
check: CheckCallback[A],
numRetries: int,
initialWait: Duration,
checkFailMessage: string): Future[A] {.async.} =
checkFailMessage: string,
nodeIdx: int): Future[A] {.async.} =
## Retries given future callback until either:
## it returns successfuly and given check is true
## or
@ -71,7 +73,8 @@ proc withRetries[A](
except CatchableError as exc:
if tries > numRetries:
# if we reached max number of retries fail
raise exc
let msg = "Call failed with msg: " & exc.msg & ", for node with idx: " & $nodeIdx
raise newException(ValueError, msg)
inc tries
# wait before new retry
@ -81,12 +84,13 @@ proc withRetries[A](
# Sometimes we need to wait till data will be propagated over the network.
# To avoid long sleeps, this combinator can be used to retry some calls until
# success or until some condition hold (or both)
proc retryUntilDataPropagated[A](
proc retryUntil[A](
f: FutureCallback[A],
c: CheckCallback[A],
checkFailMessage: string): Future[A] =
# some reasonable limits, which will cause waits as: 1, 2, 4, 8, 16 seconds
return withRetries(f, c, 5, seconds(1), checkFailMessage)
checkFailMessage: string,
nodeIdx: int): Future[A] =
# some reasonable limits, which will cause waits as: 1, 2, 4, 8, 16, 32 seconds
return withRetries(f, c, 6, seconds(1), checkFailMessage, nodeIdx)
# Note:
# When doing json-rpc requests following `RpcPostError` can occur:
@ -237,7 +241,7 @@ procSuite "Portal testnet tests":
let blockData = readBlockDataTable(dataFile)
check blockData.isOk()
for client in clients:
for i, client in clients:
# Note: Once there is the Canonical Indices Network, we don't need to
# access this file anymore here for the block hashes.
for hash in blockData.get().blockHashes():
@ -246,7 +250,7 @@ procSuite "Portal testnet tests":
# add a json-rpc debug proc that returns whether the offer queue is empty or
# not. And then poll every node until all nodes have an empty queue.
let content = await retryUntilDataPropagated(
let content = await retryUntil(
proc (): Future[Option[BlockObject]] {.async.} =
try:
let res = await client.eth_getBlockByHash(hash.ethHashStr(), false)
@ -257,7 +261,8 @@ procSuite "Portal testnet tests":
raise exc
,
proc (mc: Option[BlockObject]): bool = return mc.isSome(),
"Did not receive expected Block with hash " & $hash
"Did not receive expected Block with hash " & hash.data.toHex(),
i
)
check content.isSome()
let blockObj = content.get()
@ -272,7 +277,7 @@ procSuite "Portal testnet tests":
blockHash: some(hash)
)
let logs = await retryUntilDataPropagated(
let logs = await retryUntil(
proc (): Future[seq[FilterLog]] {.async.} =
try:
let res = await client.eth_getLogs(filterOptions)
@ -283,7 +288,8 @@ procSuite "Portal testnet tests":
raise exc
,
proc (mc: seq[FilterLog]): bool = return true,
""
"",
i
)
for l in logs:
@ -334,14 +340,27 @@ procSuite "Portal testnet tests":
# offer content to node 1..63
for i in 1..lastNodeIdx:
let receipientId = nodeInfos[i].nodeId
check (await clients[0].portal_history_offerContentInNodeRange(tempDbPath, receipientId, 64, 0))
let offerResponse = await retryUntil(
proc (): Future[bool] {.async.} =
try:
let res = await clients[0].portal_history_offerContentInNodeRange(tempDbPath, receipientId, 64, 0)
await clients[0].close()
return res
except CatchableError as exc:
await clients[0].close()
raise exc
,
proc (os: bool): bool = return os,
"Offer failed",
i
)
check offerResponse
for client in clients:
for i, client in clients:
# Note: Once there is the Canonical Indices Network, we don't need to
# access this file anymore here for the block hashes.
for hash in bd.blockHashes():
let content = await retryUntilDataPropagated(
let content = await retryUntil(
proc (): Future[Option[BlockObject]] {.async.} =
try:
let res = await client.eth_getBlockByHash(hash.ethHashStr(), false)
@ -352,7 +371,8 @@ procSuite "Portal testnet tests":
raise exc
,
proc (mc: Option[BlockObject]): bool = return mc.isSome(),
"Did not receive expected Block with hash " & $hash
"Did not receive expected Block with hash " & hash.data.toHex(),
i
)
check content.isSome()
@ -403,11 +423,11 @@ procSuite "Portal testnet tests":
check (await clients[0].portal_history_depthContentPropagate(tempDbPath, 64))
await clients[0].close()
for client in clients:
for i, client in clients:
# Note: Once there is the Canonical Indices Network, we don't need to
# access this file anymore here for the block hashes.
for hash in bd.blockHashes():
let content = await retryUntilDataPropagated(
let content = await retryUntil(
proc (): Future[Option[BlockObject]] {.async.} =
try:
let res = await client.eth_getBlockByHash(hash.ethHashStr(), false)
@ -418,7 +438,8 @@ procSuite "Portal testnet tests":
raise exc
,
proc (mc: Option[BlockObject]): bool = return mc.isSome(),
"Did not receive expected Block with hash " & $hash
"Did not receive expected Block with hash " & hash.data.toHex(),
i
)
check content.isSome()