From 03d6a1a9343bb294024611602809194526ec7669 Mon Sep 17 00:00:00 2001
From: Etan Kissling <etan@status.im>
Date: Fri, 12 Aug 2022 15:51:33 +0200
Subject: [PATCH] track request chunk size across EL reconnects (#3960)

When the EL connection is interrupted, deposits are once more requested
in chunks of 5000 blocks. This is a problem when the response takes over
a minute to produce and consistently times out, because follow-up
requests with lower chunk sizes may no longer work after a request was
canceled, e.g., when using Geth with websockets. By keeping track of
`blocksPerLogsRequest` across EL reconnections, it is possible to
recover from this without continuously repeating the initial request
with the full 5000 blocks.
Also cleans up one more "retry of retry" instance; `DataProviderTimeout`
is a `CatchableError` and already handled by the existing retry logic.
---
 beacon_chain/eth1/eth1_monitor.nim | 29 +++++++++++++++++------------
 1 file changed, 17 insertions(+), 12 deletions(-)

diff --git a/beacon_chain/eth1/eth1_monitor.nim b/beacon_chain/eth1/eth1_monitor.nim
index 2100a10b7..1e2c25c5d 100644
--- a/beacon_chain/eth1/eth1_monitor.nim
+++ b/beacon_chain/eth1/eth1_monitor.nim
@@ -62,6 +62,8 @@ const
   hasDepositRootChecks = defined(has_deposit_root_checks)
   hasGenesisDetection* = defined(has_genesis_detection)
 
+  targetBlocksPerLogsRequest = 5000'u64  # This is roughly a day of Eth1 blocks
+
 type
   Eth1BlockNumber* = uint64
   Eth1BlockTimestamp* = uint64
@@ -125,6 +127,7 @@ type
     depositContractAddress*: Eth1Address
     forcePolling: bool
     jwtSecret: Option[seq[byte]]
+    blocksPerLogsRequest: uint64
 
     dataProvider: Web3DataProviderRef
     latestEth1Block: Option[FullBlockId]
@@ -1047,7 +1050,8 @@ proc init*(T: type Eth1Monitor,
     eth1Network: eth1Network,
     eth1Progress: newAsyncEvent(),
     forcePolling: forcePolling,
-    jwtSecret: jwtSecret)
+    jwtSecret: jwtSecret,
+    blocksPerLogsRequest: targetBlocksPerLogsRequest)
 
 proc safeCancel(fut: var Future[void]) =
   if not fut.isNil and not fut.finished:
@@ -1149,18 +1153,12 @@ proc syncBlockRange(m: Eth1Monitor,
   while currentBlock <= toBlock:
     var
       depositLogs: JsonNode = nil
-      blocksPerRequest = 5000'u64 # This is roughly a day of Eth1 blocks
       maxBlockNumberRequested: Eth1BlockNumber
       backoff = 100
 
     while true:
-      maxBlockNumberRequested = min(toBlock, currentBlock + blocksPerRequest - 1)
-
-      template retryOrRaise(err: ref CatchableError) =
-        blocksPerRequest = blocksPerRequest div 2
-        if blocksPerRequest == 0:
-          raise err
-        continue
+      maxBlockNumberRequested =
+        min(toBlock, currentBlock + m.blocksPerLogsRequest - 1)
 
       debug "Obtaining deposit log events",
             fromBlock = currentBlock,
@@ -1177,15 +1175,22 @@ proc syncBlockRange(m: Eth1Monitor,
           toBlock = some blockId(maxBlockNumberRequested))
 
         depositLogs = try:
-          # Downloading large amounts of deposits can be quite slow
+          # Downloading large amounts of deposits may take several minutes
           awaitWithTimeout(jsonLogsFut, web3Timeouts):
-            retryOrRaise newException(DataProviderTimeout,
+            raise newException(DataProviderTimeout,
               "Request time out while obtaining json logs")
         except CatchableError as err:
           debug "Request for deposit logs failed", err = err.msg
           inc failed_web3_requests
           backoff = (backoff * 3) div 2
-          retryOrRaise err
+          m.blocksPerLogsRequest = m.blocksPerLogsRequest div 2
+          if m.blocksPerLogsRequest == 0:
+            m.blocksPerLogsRequest = 1
+            raise err
+          continue
+        m.blocksPerLogsRequest = min(
+          (m.blocksPerLogsRequest * 3 + 1) div 2,
+          targetBlocksPerLogsRequest)
 
       currentBlock = maxBlockNumberRequested + 1
       break