mirror of
https://github.com/status-im/nimbus-eth2.git
synced 2025-01-23 04:50:59 +00:00
deal with a temporary loss of network connectivity (#1354)
* don't kill the program if not connected to a bootstrap node within 30 seconds
* recover faster from loss of network connectivity
* connectWorker(): sleep 1s between dials
* launch_local_testnet.sh: increase BOOTSTRAP_TIMEOUT
* don't use metric value in program logic
* refactor some ungainly variable names
This commit is contained in:
parent
e0a18a3105
commit
c47532f2b0
@ -23,7 +23,7 @@ import
|
|||||||
# Beacon node modules
|
# Beacon node modules
|
||||||
version, conf, eth2_discovery, libp2p_json_serialization, conf,
|
version, conf, eth2_discovery, libp2p_json_serialization, conf,
|
||||||
ssz/ssz_serialization,
|
ssz/ssz_serialization,
|
||||||
peer_pool, spec/[datatypes, network]
|
peer_pool, spec/[datatypes, network], ./time
|
||||||
|
|
||||||
export
|
export
|
||||||
version, multiaddress, peer_pool, peerinfo, p2pProtocol,
|
version, multiaddress, peer_pool, peerinfo, p2pProtocol,
|
||||||
@ -205,9 +205,9 @@ const
|
|||||||
ConcurrentConnections* = 10
|
ConcurrentConnections* = 10
|
||||||
## Maximum number of active concurrent connection requests.
|
## Maximum number of active concurrent connection requests.
|
||||||
|
|
||||||
SeenTableTimeTimeout* = 10.minutes
|
SeenTableTimeTimeout* = 1.minutes
|
||||||
## Seen period of time for timeout connections
|
## Seen period of time for timeout connections
|
||||||
SeenTableTimeDeadPeer* = 10.minutes
|
SeenTableTimeDeadPeer* = 1.minutes
|
||||||
## Period of time for dead peers.
|
## Period of time for dead peers.
|
||||||
SeenTableTimeIrrelevantNetwork* = 24.hours
|
SeenTableTimeIrrelevantNetwork* = 24.hours
|
||||||
## Period of time for `IrrelevantNetwork` error reason.
|
## Period of time for `IrrelevantNetwork` error reason.
|
||||||
@ -216,6 +216,8 @@ const
|
|||||||
SeemTableTimeFaultOrError* = 10.minutes
|
SeemTableTimeFaultOrError* = 10.minutes
|
||||||
## Period of time for `FaultOnError` error reason.
|
## Period of time for `FaultOnError` error reason.
|
||||||
|
|
||||||
|
var successfullyDialledAPeer = false # used to show a warning
|
||||||
|
|
||||||
template neterr(kindParam: Eth2NetworkingErrorKind): auto =
|
template neterr(kindParam: Eth2NetworkingErrorKind): auto =
|
||||||
err(type(result), Eth2NetworkingError(kind: kindParam))
|
err(type(result), Eth2NetworkingError(kind: kindParam))
|
||||||
|
|
||||||
@ -724,43 +726,49 @@ proc dialPeer*(node: Eth2Node, peerInfo: PeerInfo) {.async.} =
|
|||||||
await performProtocolHandshakes(peer)
|
await performProtocolHandshakes(peer)
|
||||||
|
|
||||||
inc nbc_successful_dials
|
inc nbc_successful_dials
|
||||||
|
successfullyDialledAPeer = true
|
||||||
debug "Network handshakes completed"
|
debug "Network handshakes completed"
|
||||||
|
|
||||||
proc connectWorker(network: Eth2Node) {.async.} =
|
proc connectWorker(network: Eth2Node) {.async.} =
|
||||||
debug "Connection worker started"
|
debug "Connection worker started"
|
||||||
while true:
|
|
||||||
let pi = await network.connQueue.popFirst()
|
|
||||||
let r1 = network.peerPool.hasPeer(pi.peerId)
|
|
||||||
let r2 = network.isSeen(pi)
|
|
||||||
let r3 = network.connTable.hasKey(pi.peerId)
|
|
||||||
|
|
||||||
if not(r1) and not(r2) and not(r3):
|
while true:
|
||||||
network.connTable[pi.peerId] = pi
|
let
|
||||||
|
remotePeerInfo = await network.connQueue.popFirst()
|
||||||
|
peerPoolHasRemotePeer = network.peerPool.hasPeer(remotePeerInfo.peerId)
|
||||||
|
seenTableHasRemotePeer = network.isSeen(remotePeerInfo)
|
||||||
|
remotePeerAlreadyConnected = network.connTable.hasKey(remotePeerInfo.peerId)
|
||||||
|
|
||||||
|
if not(peerPoolHasRemotePeer) and not(seenTableHasRemotePeer) and not(remotePeerAlreadyConnected):
|
||||||
|
network.connTable[remotePeerInfo.peerId] = remotePeerInfo
|
||||||
try:
|
try:
|
||||||
# We trying to connect to peers which are not in PeerPool, SeenTable and
|
# We trying to connect to peers which are not in PeerPool, SeenTable and
|
||||||
# ConnTable.
|
# ConnTable.
|
||||||
var fut = network.dialPeer(pi)
|
var fut = network.dialPeer(remotePeerInfo)
|
||||||
# We discarding here just because we going to check future state, to avoid
|
# We discarding here just because we going to check future state, to avoid
|
||||||
# condition where connection happens and timeout reached.
|
# condition where connection happens and timeout reached.
|
||||||
let res = await withTimeout(fut, network.connectTimeout)
|
discard await withTimeout(fut, network.connectTimeout)
|
||||||
# We handling only timeout and errors, because successfull connections
|
# We handling only timeout and errors, because successfull connections
|
||||||
# will be stored in PeerPool.
|
# will be stored in PeerPool.
|
||||||
if fut.finished():
|
if fut.finished():
|
||||||
if fut.failed() and not(fut.cancelled()):
|
if fut.failed() and not(fut.cancelled()):
|
||||||
debug "Unable to establish connection with peer", peer = pi.id,
|
debug "Unable to establish connection with peer", peer = remotePeerInfo.id,
|
||||||
errMsg = fut.readError().msg
|
errMsg = fut.readError().msg
|
||||||
inc nbc_failed_dials
|
inc nbc_failed_dials
|
||||||
network.addSeen(pi, SeenTableTimeDeadPeer)
|
network.addSeen(remotePeerInfo, SeenTableTimeDeadPeer)
|
||||||
continue
|
continue
|
||||||
debug "Connection to remote peer timed out", peer = pi.id
|
debug "Connection to remote peer timed out", peer = remotePeerInfo.id
|
||||||
inc nbc_timeout_dials
|
inc nbc_timeout_dials
|
||||||
network.addSeen(pi, SeenTableTimeTimeout)
|
network.addSeen(remotePeerInfo, SeenTableTimeTimeout)
|
||||||
finally:
|
finally:
|
||||||
network.connTable.del(pi.peerId)
|
network.connTable.del(remotePeerInfo.peerId)
|
||||||
else:
|
else:
|
||||||
trace "Peer is already connected, connecting or already seen",
|
trace "Peer is already connected, connecting or already seen",
|
||||||
peer = pi.id, peer_pool_has_peer = $r1, seen_table_has_peer = $r2,
|
peer = remotePeerInfo.id, peer_pool_has_peer = $peerPoolHasRemotePeer, seen_table_has_peer = $seenTableHasRemotePeer,
|
||||||
connecting_peer = $r3, seen_table_size = len(network.seenTable)
|
connecting_peer = $remotePeerAlreadyConnected, seen_table_size = len(network.seenTable)
|
||||||
|
|
||||||
|
# Prevent (a purely theoretical) high CPU usage when losing connectivity.
|
||||||
|
await sleepAsync(1.seconds)
|
||||||
|
|
||||||
proc runDiscoveryLoop*(node: Eth2Node) {.async.} =
|
proc runDiscoveryLoop*(node: Eth2Node) {.async.} =
|
||||||
debug "Starting discovery loop"
|
debug "Starting discovery loop"
|
||||||
@ -1136,17 +1144,19 @@ proc announcedENR*(node: Eth2Node): enr.Record =
|
|||||||
proc shortForm*(id: KeyPair): string =
|
proc shortForm*(id: KeyPair): string =
|
||||||
$PeerID.init(id.pubkey)
|
$PeerID.init(id.pubkey)
|
||||||
|
|
||||||
|
let BOOTSTRAP_NODE_CHECK_INTERVAL = 30.seconds
|
||||||
|
proc checkIfConnectedToBootstrapNode(p: pointer) {.gcsafe.} =
|
||||||
|
# Keep showing warnings until we connect to at least one bootstrap node
|
||||||
|
# successfully, in order to allow detection of an invalid configuration.
|
||||||
|
let node = cast[Eth2Node](p)
|
||||||
|
if node.discovery.bootstrapRecords.len > 0 and not successfullyDialledAPeer:
|
||||||
|
warn "Failed to connect to any bootstrap node",
|
||||||
|
bootstrapEnrs = node.discovery.bootstrapRecords
|
||||||
|
addTimer(BOOTSTRAP_NODE_CHECK_INTERVAL, checkIfConnectedToBootstrapNode, p)
|
||||||
|
|
||||||
proc startLookingForPeers*(node: Eth2Node) {.async.} =
|
proc startLookingForPeers*(node: Eth2Node) {.async.} =
|
||||||
await node.start()
|
await node.start()
|
||||||
|
addTimer(BOOTSTRAP_NODE_CHECK_INTERVAL, checkIfConnectedToBootstrapNode, node[].addr)
|
||||||
proc checkIfConnectedToBootstrapNode {.async.} =
|
|
||||||
await sleepAsync(30.seconds)
|
|
||||||
if node.discovery.bootstrapRecords.len > 0 and nbc_successful_dials.value == 0:
|
|
||||||
fatal "Failed to connect to any bootstrap node. Quitting",
|
|
||||||
bootstrapEnrs = node.discovery.bootstrapRecords
|
|
||||||
quit 1
|
|
||||||
|
|
||||||
traceAsyncErrors checkIfConnectedToBootstrapNode()
|
|
||||||
|
|
||||||
func peersCount*(node: Eth2Node): int =
|
func peersCount*(node: Eth2Node): int =
|
||||||
len(node.peerPool)
|
len(node.peerPool)
|
||||||
|
@ -147,7 +147,7 @@ $MAKE LOG_LEVEL="${LOG_LEVEL}" NIMFLAGS="-d:insecure -d:testnet_servers_image ${
|
|||||||
PIDS=""
|
PIDS=""
|
||||||
WEB3_ARG=""
|
WEB3_ARG=""
|
||||||
STATE_SNAPSHOT_ARG=""
|
STATE_SNAPSHOT_ARG=""
|
||||||
BOOTSTRAP_TIMEOUT=10 # in seconds
|
BOOTSTRAP_TIMEOUT=30 # in seconds
|
||||||
DEPOSIT_CONTRACT_ADDRESS="0x0000000000000000000000000000000000000000"
|
DEPOSIT_CONTRACT_ADDRESS="0x0000000000000000000000000000000000000000"
|
||||||
DEPOSIT_CONTRACT_BLOCK="0x0000000000000000000000000000000000000000000000000000000000000000"
|
DEPOSIT_CONTRACT_BLOCK="0x0000000000000000000000000000000000000000000000000000000000000000"
|
||||||
NETWORK_METADATA_FILE="${DATA_DIR}/network.json"
|
NETWORK_METADATA_FILE="${DATA_DIR}/network.json"
|
||||||
|
Loading…
x
Reference in New Issue
Block a user