track request chunk size across EL reconnects (#3960)

When the EL connection is interrupted, deposits are once more requested
in chunks of 5000 blocks. This is a problem when the response takes over
a minute to produce and consistently times out: follow-up requests with
lower chunk sizes may no longer work after a request has been canceled,
e.g., when using Geth with WebSockets. Keeping track of
`blocksPerLogsRequest` across EL reconnections makes it possible to
recover, because the initial request with the full 5000 blocks is no
longer repeated indefinitely.

Also cleans up one more "retry of retry" instance: `DataProviderTimeout`
is a `CatchableError` and is already handled by the existing retry logic.
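The recovery strategy amounts to a multiplicative-decrease / capped-increase loop whose state outlives any single connection. Below is a minimal Nim sketch of that pattern, for illustration only: `Monitor`, `newMonitor`, `fetchLogs`, and `syncRange` are hypothetical stand-ins, not the nimbus-eth2 API.

  const targetBlocksPerRequest = 5000'u64  # mirrors targetBlocksPerLogsRequest

  type
    Monitor = ref object
      blocksPerRequest: uint64  # persists across reconnects

  proc newMonitor(): Monitor =
    Monitor(blocksPerRequest: targetBlocksPerRequest)

  proc fetchLogs(m: Monitor, fromBlock, toBlock: uint64): bool =
    ## Hypothetical placeholder for the ranged `eth_getLogs` request;
    ## always succeeds here, while a real request may time out.
    true

  proc syncRange(m: Monitor, fromBlock, toBlock: uint64) =
    var currentBlock = fromBlock
    while currentBlock <= toBlock:
      let maxRequested = min(toBlock, currentBlock + m.blocksPerRequest - 1)
      if fetchLogs(m, currentBlock, maxRequested):
        # Success: grow the chunk size again, capped at the target.
        m.blocksPerRequest =
          min((m.blocksPerRequest * 3 + 1) div 2, targetBlocksPerRequest)
        currentBlock = maxRequested + 1
      else:
        # Failure: halve the chunk size. Because the size lives on `m`
        # rather than in this loop, a reconnect resumes from the reduced
        # value instead of restarting at the full 5000 blocks.
        m.blocksPerRequest = max(1'u64, m.blocksPerRequest div 2)

  when isMainModule:
    let m = newMonitor()
    m.syncRange(0'u64, 20_000'u64)
    echo m.blocksPerRequest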
Etan Kissling 2022-08-12 15:51:33 +02:00 committed by GitHub
parent 98a533c781
commit 03d6a1a934
1 changed file with 17 additions and 12 deletions


@@ -62,6 +62,8 @@ const
   hasDepositRootChecks = defined(has_deposit_root_checks)
   hasGenesisDetection* = defined(has_genesis_detection)
 
+  targetBlocksPerLogsRequest = 5000'u64 # This is roughly a day of Eth1 blocks
+
 type
   Eth1BlockNumber* = uint64
   Eth1BlockTimestamp* = uint64
@@ -125,6 +127,7 @@ type
     depositContractAddress*: Eth1Address
     forcePolling: bool
     jwtSecret: Option[seq[byte]]
+    blocksPerLogsRequest: uint64
 
     dataProvider: Web3DataProviderRef
     latestEth1Block: Option[FullBlockId]
@@ -1047,7 +1050,8 @@ proc init*(T: type Eth1Monitor,
     eth1Network: eth1Network,
     eth1Progress: newAsyncEvent(),
     forcePolling: forcePolling,
-    jwtSecret: jwtSecret)
+    jwtSecret: jwtSecret,
+    blocksPerLogsRequest: targetBlocksPerLogsRequest)
 
 proc safeCancel(fut: var Future[void]) =
   if not fut.isNil and not fut.finished:
@@ -1149,18 +1153,12 @@ proc syncBlockRange(m: Eth1Monitor,
   while currentBlock <= toBlock:
     var
       depositLogs: JsonNode = nil
-      blocksPerRequest = 5000'u64 # This is roughly a day of Eth1 blocks
       maxBlockNumberRequested: Eth1BlockNumber
       backoff = 100
 
     while true:
-      maxBlockNumberRequested = min(toBlock, currentBlock + blocksPerRequest - 1)
-
-      template retryOrRaise(err: ref CatchableError) =
-        blocksPerRequest = blocksPerRequest div 2
-        if blocksPerRequest == 0:
-          raise err
-        continue
+      maxBlockNumberRequested =
+        min(toBlock, currentBlock + m.blocksPerLogsRequest - 1)
 
       debug "Obtaining deposit log events",
         fromBlock = currentBlock,
@@ -1177,15 +1175,22 @@ proc syncBlockRange(m: Eth1Monitor,
           toBlock = some blockId(maxBlockNumberRequested))
 
       depositLogs = try:
-        # Downloading large amounts of deposits can be quite slow
+        # Downloading large amounts of deposits may take several minutes
         awaitWithTimeout(jsonLogsFut, web3Timeouts):
-          retryOrRaise newException(DataProviderTimeout,
-            "Request time out while obtaining json logs")
+          raise newException(DataProviderTimeout,
+            "Request time out while obtaining json logs")
       except CatchableError as err:
         debug "Request for deposit logs failed", err = err.msg
         inc failed_web3_requests
         backoff = (backoff * 3) div 2
-        retryOrRaise err
+        m.blocksPerLogsRequest = m.blocksPerLogsRequest div 2
+        if m.blocksPerLogsRequest == 0:
+          m.blocksPerLogsRequest = 1
+          raise err
+        continue
 
+      m.blocksPerLogsRequest = min(
+        (m.blocksPerLogsRequest * 3 + 1) div 2,
+        targetBlocksPerLogsRequest)
       currentBlock = maxBlockNumberRequested + 1
       break
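For intuition about the arithmetic in the last hunk, here is a standalone trace of how the chunk size shrinks and recovers. This is a sketch assuming only the halve/grow formulas above; the real code also raises the error once the size is exhausted, which is omitted here.

  const target = 5000'u64            # mirrors targetBlocksPerLogsRequest

  var size = target
  for _ in 1 .. 4:                   # four consecutive timeouts
    size = max(1'u64, size div 2)    # 2500, 1250, 625, 312
  echo size                          # -> 312
  for _ in 1 .. 7:                   # seven consecutive successes
    size = min((size * 3 + 1) div 2, target)
  echo size                          # -> 5000 (via 468, 702, 1053, 1580, 2370, 3555)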