Don't report very brief EL connection interruptions on user-visible log levels (#4960)

This commit is contained in:
zah 2023-05-15 23:40:47 +03:00 committed by GitHub
parent 748be8b67b
commit 9b9c58c507
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed files with 44 additions and 17 deletions

View File

@ -91,6 +91,10 @@ const
# https://github.com/ethereum/execution-apis/blob/v1.0.0-beta.3/src/engine/experimental/blob-extension.md#request-2
GETBLOBS_TIMEOUT = 1.seconds
connectionStateChangeHysteresisThreshold = 15
## How many unsuccesful/successful requests we must see
## before declaring the connection as degraded/restored
type
Eth1BlockNumber* = uint64
Eth1BlockTimestamp* = uint64
@ -207,6 +211,7 @@ type
## exchange.
state: ConnectionState
hysteresisCounter: int
depositContractSyncStatus: DepositContractSyncStatus
## Are we sure that this EL has synced the deposit contract?
@ -280,29 +285,56 @@ declareCounter engine_api_last_minute_forkchoice_updates_sent,
"Number of last minute requests to the forkchoiceUpdated Engine API end-point just before block proposals",
labels = ["url"]
proc close(connection: ELConnection): Future[void] {.async.} =
if connection.web3.isSome:
awaitWithTimeout(connection.web3.get.close(), 30.seconds):
debug "Failed to close data provider in time"
proc increaseCounterTowardsStateChange(connection: ELConnection): bool =
result = connection.hysteresisCounter >= connectionStateChangeHysteresisThreshold
if result:
connection.hysteresisCounter = 0
else:
inc connection.hysteresisCounter
proc decreaseCounterTowardsStateChange(connection: ELConnection) =
if connection.hysteresisCounter > 0:
# While we increase the counter by 1, we decreate it by 20% in order
# to require a steady and affirmative change instead of allowing
# the counter to drift very slowly in one direction when the ratio
# between success and failure is roughly 50:50%
connection.hysteresisCounter = connection.hysteresisCounter div 5
proc setDegradedState(connection: ELConnection,
requestName: string,
statusCode: int, errMsg: string) =
debug "Failed EL Request", requestName, statusCode, err = errMsg
case connection.state
of NeverTested, Working:
if connection.increaseCounterTowardsStateChange():
warn "Connection to EL node degraded",
url = url(connection.engineUrl),
failedRequest = requestName,
statusCode, err = errMsg
of Degraded:
discard
reset connection.web3
connection.state = Degraded
asyncSpawn connection.close()
connection.web3 = none[Web3]()
of Degraded:
connection.decreaseCounterTowardsStateChange()
proc setWorkingState(connection: ELConnection) =
case connection.state
of Degraded:
if connection.increaseCounterTowardsStateChange():
info "Connection to EL node restored",
url = url(connection.engineUrl)
of NeverTested, Working:
discard
connection.state = Working
of NeverTested, Working:
connection.decreaseCounterTowardsStateChange()
proc trackEngineApiRequest(connection: ELConnection,
request: FutureBase, requestName: string,
@ -658,11 +690,6 @@ func toVoteData(blk: Eth1Block): Eth1Data =
func hash*(x: Eth1Data): Hash =
hash(x.block_hash)
proc close(connection: ELConnection): Future[void] {.async.} =
if connection.web3.isSome:
awaitWithTimeout(connection.web3.get.close(), 30.seconds):
debug "Failed to close data provider in time"
func isConnected(connection: ELConnection): bool =
connection.web3.isSome