From 9b9c58c507713f3f5ebe9f4408e3bfd85314b247 Mon Sep 17 00:00:00 2001 From: zah Date: Mon, 15 May 2023 23:40:47 +0300 Subject: [PATCH] Don't report very brief EL connection interruptions on user-visible log levels (#4960) --- beacon_chain/el/el_manager.nim | 61 ++++++++++++++++++++++++---------- 1 file changed, 44 insertions(+), 17 deletions(-) diff --git a/beacon_chain/el/el_manager.nim b/beacon_chain/el/el_manager.nim index 3ce7a7296..e49811844 100644 --- a/beacon_chain/el/el_manager.nim +++ b/beacon_chain/el/el_manager.nim @@ -91,6 +91,10 @@ const # https://github.com/ethereum/execution-apis/blob/v1.0.0-beta.3/src/engine/experimental/blob-extension.md#request-2 GETBLOBS_TIMEOUT = 1.seconds + connectionStateChangeHysteresisThreshold = 15 + ## How many unsuccesful/successful requests we must see + ## before declaring the connection as degraded/restored + type Eth1BlockNumber* = uint64 Eth1BlockTimestamp* = uint64 @@ -207,6 +211,7 @@ type ## exchange. state: ConnectionState + hysteresisCounter: int depositContractSyncStatus: DepositContractSyncStatus ## Are we sure that this EL has synced the deposit contract? @@ -280,29 +285,56 @@ declareCounter engine_api_last_minute_forkchoice_updates_sent, "Number of last minute requests to the forkchoiceUpdated Engine API end-point just before block proposals", labels = ["url"] +proc close(connection: ELConnection): Future[void] {.async.} = + if connection.web3.isSome: + awaitWithTimeout(connection.web3.get.close(), 30.seconds): + debug "Failed to close data provider in time" + +proc increaseCounterTowardsStateChange(connection: ELConnection): bool = + result = connection.hysteresisCounter >= connectionStateChangeHysteresisThreshold + if result: + connection.hysteresisCounter = 0 + else: + inc connection.hysteresisCounter + +proc decreaseCounterTowardsStateChange(connection: ELConnection) = + if connection.hysteresisCounter > 0: + # While we increase the counter by 1, we decreate it by 20% in order + # to require a steady and affirmative change instead of allowing + # the counter to drift very slowly in one direction when the ratio + # between success and failure is roughly 50:50% + connection.hysteresisCounter = connection.hysteresisCounter div 5 + proc setDegradedState(connection: ELConnection, requestName: string, statusCode: int, errMsg: string) = + debug "Failed EL Request", requestName, statusCode, err = errMsg + case connection.state of NeverTested, Working: - warn "Connection to EL node degraded", - url = url(connection.engineUrl), - failedRequest = requestName, - statusCode, err = errMsg - of Degraded: - discard + if connection.increaseCounterTowardsStateChange(): + warn "Connection to EL node degraded", + url = url(connection.engineUrl), + failedRequest = requestName, + statusCode, err = errMsg - reset connection.web3 - connection.state = Degraded + connection.state = Degraded + + asyncSpawn connection.close() + connection.web3 = none[Web3]() + of Degraded: + connection.decreaseCounterTowardsStateChange() proc setWorkingState(connection: ELConnection) = case connection.state of Degraded: - info "Connection to EL node restored", - url = url(connection.engineUrl) + if connection.increaseCounterTowardsStateChange(): + info "Connection to EL node restored", + url = url(connection.engineUrl) + + connection.state = Working of NeverTested, Working: - discard - connection.state = Working + connection.decreaseCounterTowardsStateChange() proc trackEngineApiRequest(connection: ELConnection, request: FutureBase, requestName: string, @@ -658,11 +690,6 @@ func toVoteData(blk: Eth1Block): Eth1Data = func hash*(x: Eth1Data): Hash = hash(x.block_hash) -proc close(connection: ELConnection): Future[void] {.async.} = - if connection.web3.isSome: - awaitWithTimeout(connection.web3.get.close(), 30.seconds): - debug "Failed to close data provider in time" - func isConnected(connection: ELConnection): bool = connection.web3.isSome