avoid producing blocks/attestations when out of sync
this is a temporary measure until we figure something better out - as it stands, we'll advance with empty slots and crash because all validators are out.
This commit is contained in:
parent
fd55096ec3
commit
90212eed2a
|
@ -19,6 +19,7 @@ const
|
||||||
dataDirValidators = "validators"
|
dataDirValidators = "validators"
|
||||||
genesisFile = "genesis.ssz"
|
genesisFile = "genesis.ssz"
|
||||||
hasPrompt = not defined(withoutPrompt)
|
hasPrompt = not defined(withoutPrompt)
|
||||||
|
maxEmptySlotCount = uint64(24*60*60) div SECONDS_PER_SLOT
|
||||||
|
|
||||||
# https://github.com/ethereum/eth2.0-metrics/blob/master/metrics.md#interop-metrics
|
# https://github.com/ethereum/eth2.0-metrics/blob/master/metrics.md#interop-metrics
|
||||||
declareGauge beacon_slot,
|
declareGauge beacon_slot,
|
||||||
|
@ -297,6 +298,33 @@ func getAttachedValidator(node: BeaconNode,
|
||||||
let validatorKey = state.validators[idx].pubkey
|
let validatorKey = state.validators[idx].pubkey
|
||||||
node.attachedValidators.getValidator(validatorKey)
|
node.attachedValidators.getValidator(validatorKey)
|
||||||
|
|
||||||
|
proc isSynced(node: BeaconNode, head: BlockRef): bool =
|
||||||
|
## TODO This function is here as a placeholder for some better heurestics to
|
||||||
|
## determine if we're in sync and should be producing blocks and
|
||||||
|
## attestations. Generally, the problem is that slot time keeps advancing
|
||||||
|
## even when there are no blocks being produced, so there's no way to
|
||||||
|
## distinguish validators geniunely going missing from the node not being
|
||||||
|
## well connected (during a network split or an internet outage for
|
||||||
|
## example). It would generally be correct to simply keep running as if
|
||||||
|
## we were the only legit node left alive, but then we run into issues:
|
||||||
|
## with enough many empty slots, the validator pool is emptied leading
|
||||||
|
## to empty committees and lots of empty slot processing that will be
|
||||||
|
## thrown away as soon as we're synced again.
|
||||||
|
|
||||||
|
let
|
||||||
|
# The slot we should be at, according to the clock
|
||||||
|
beaconTime = node.beaconClock.now()
|
||||||
|
wallSlot = beaconTime.toSlot()
|
||||||
|
|
||||||
|
# TODO if everyone follows this logic, the network will not recover from a
|
||||||
|
# halt: nobody will be producing blocks because everone expects someone
|
||||||
|
# else to do it
|
||||||
|
if wallSlot.afterGenesis and (wallSlot.slot > head.slot) and
|
||||||
|
(wallSlot.slot - head.slot) > maxEmptySlotCount:
|
||||||
|
false
|
||||||
|
else:
|
||||||
|
true
|
||||||
|
|
||||||
proc updateHead(node: BeaconNode, slot: Slot): BlockRef =
|
proc updateHead(node: BeaconNode, slot: Slot): BlockRef =
|
||||||
# Use head state for attestation resolution below
|
# Use head state for attestation resolution below
|
||||||
|
|
||||||
|
@ -484,11 +512,16 @@ proc onAttestation(node: BeaconNode, attestation: Attestation) =
|
||||||
# though - maybe we should use the state from the block pointed to by
|
# though - maybe we should use the state from the block pointed to by
|
||||||
# the attestation for some of the check? Consider interop with block
|
# the attestation for some of the check? Consider interop with block
|
||||||
# production!
|
# production!
|
||||||
let
|
if attestation.data.slot > head.blck.slot and
|
||||||
bs = BlockSlot(blck: head.blck, slot: wallSlot.slot)
|
(attestation.data.slot - head.blck.slot) > maxEmptySlotCount:
|
||||||
|
warn "Ignoring attestation, head block too old (out of sync?)",
|
||||||
|
attestationSlot = attestation.data.slot, headSlot = head.blck.slot
|
||||||
|
else:
|
||||||
|
let
|
||||||
|
bs = BlockSlot(blck: head.blck, slot: wallSlot.slot)
|
||||||
|
|
||||||
node.blockPool.withState(node.stateCache, bs):
|
node.blockPool.withState(node.stateCache, bs):
|
||||||
node.attestationPool.add(state, attestedBlock, attestation)
|
node.attestationPool.add(state, attestedBlock, attestation)
|
||||||
else:
|
else:
|
||||||
node.attestationPool.addUnresolved(attestation)
|
node.attestationPool.addUnresolved(attestation)
|
||||||
|
|
||||||
|
@ -697,65 +730,67 @@ proc onSlotStart(node: BeaconNode, lastSlot, scheduledSlot: Slot) {.gcsafe, asyn
|
||||||
# disappear naturally - risky because user is not aware,
|
# disappear naturally - risky because user is not aware,
|
||||||
# and might lose stake on canonical chain but "just works"
|
# and might lose stake on canonical chain but "just works"
|
||||||
# when reconnected..
|
# when reconnected..
|
||||||
# Right now, we keep going
|
if not node.isSynced(head):
|
||||||
|
warn "Node out of sync, skipping block and attestation production for this slot",
|
||||||
|
slot, headSlot = head.slot
|
||||||
|
else:
|
||||||
|
var curSlot = lastSlot + 1
|
||||||
|
while curSlot < slot:
|
||||||
|
# Timers may be delayed because we're busy processing, and we might have
|
||||||
|
# more work to do. We'll try to do so in an expedited way.
|
||||||
|
# TODO maybe even collect all work synchronously to avoid unnecessary
|
||||||
|
# state rewinds while waiting for async operations like validator
|
||||||
|
# signature..
|
||||||
|
notice "Catching up",
|
||||||
|
curSlot = shortLog(curSlot),
|
||||||
|
lastSlot = shortLog(lastSlot),
|
||||||
|
slot = shortLog(slot),
|
||||||
|
cat = "overload"
|
||||||
|
|
||||||
var curSlot = lastSlot + 1
|
# For every slot we're catching up, we'll propose then send
|
||||||
while curSlot < slot:
|
# attestations - head should normally be advancing along the same branch
|
||||||
# Timers may be delayed because we're busy processing, and we might have
|
# in this case
|
||||||
# more work to do. We'll try to do so in an expedited way.
|
# TODO what if we receive blocks / attestations while doing this work?
|
||||||
# TODO maybe even collect all work synchronously to avoid unnecessary
|
head = await handleProposal(node, head, curSlot)
|
||||||
# state rewinds while waiting for async operations like validator
|
|
||||||
# signature..
|
|
||||||
notice "Catching up",
|
|
||||||
curSlot = shortLog(curSlot),
|
|
||||||
lastSlot = shortLog(lastSlot),
|
|
||||||
slot = shortLog(slot),
|
|
||||||
cat = "overload"
|
|
||||||
|
|
||||||
# For every slot we're catching up, we'll propose then send
|
# For each slot we missed, we need to send out attestations - if we were
|
||||||
# attestations - head should normally be advancing along the same branch
|
# proposing during this time, we'll use the newly proposed head, else just
|
||||||
# in this case
|
# keep reusing the same - the attestation that goes out will actually
|
||||||
# TODO what if we receive blocks / attestations while doing this work?
|
# rewind the state to what it looked like at the time of that slot
|
||||||
head = await handleProposal(node, head, curSlot)
|
# TODO smells like there's an optimization opportunity here
|
||||||
|
handleAttestations(node, head, curSlot)
|
||||||
|
|
||||||
# For each slot we missed, we need to send out attestations - if we were
|
curSlot += 1
|
||||||
# proposing during this time, we'll use the newly proposed head, else just
|
|
||||||
# keep reusing the same - the attestation that goes out will actually
|
|
||||||
# rewind the state to what it looked like at the time of that slot
|
|
||||||
# TODO smells like there's an optimization opportunity here
|
|
||||||
handleAttestations(node, head, curSlot)
|
|
||||||
|
|
||||||
curSlot += 1
|
head = await handleProposal(node, head, slot)
|
||||||
|
|
||||||
head = await handleProposal(node, head, slot)
|
# We've been doing lots of work up until now which took time. Normally, we
|
||||||
|
# send out attestations at the slot mid-point, so we go back to the clock
|
||||||
|
# to see how much time we need to wait.
|
||||||
|
# TODO the beacon clock might jump here also. It's probably easier to complete
|
||||||
|
# the work for the whole slot using a monotonic clock instead, then deal
|
||||||
|
# with any clock discrepancies once only, at the start of slot timer
|
||||||
|
# processing..
|
||||||
|
let
|
||||||
|
attestationStart = node.beaconClock.fromNow(slot)
|
||||||
|
halfSlot = seconds(int64(SECONDS_PER_SLOT div 2))
|
||||||
|
|
||||||
# We've been doing lots of work up until now which took time. Normally, we
|
if attestationStart.inFuture or attestationStart.offset <= halfSlot:
|
||||||
# send out attestations at the slot mid-point, so we go back to the clock
|
let fromNow =
|
||||||
# to see how much time we need to wait.
|
if attestationStart.inFuture: attestationStart.offset + halfSlot
|
||||||
# TODO the beacon clock might jump here also. It's probably easier to complete
|
else: halfSlot - attestationStart.offset
|
||||||
# the work for the whole slot using a monotonic clock instead, then deal
|
|
||||||
# with any clock discrepancies once only, at the start of slot timer
|
|
||||||
# processing..
|
|
||||||
let
|
|
||||||
attestationStart = node.beaconClock.fromNow(slot)
|
|
||||||
halfSlot = seconds(int64(SECONDS_PER_SLOT div 2))
|
|
||||||
|
|
||||||
if attestationStart.inFuture or attestationStart.offset <= halfSlot:
|
trace "Waiting to send attestations",
|
||||||
let fromNow =
|
slot = shortLog(slot),
|
||||||
if attestationStart.inFuture: attestationStart.offset + halfSlot
|
fromNow = shortLog(fromNow),
|
||||||
else: halfSlot - attestationStart.offset
|
cat = "scheduling"
|
||||||
|
|
||||||
trace "Waiting to send attestations",
|
await sleepAsync(fromNow)
|
||||||
slot = shortLog(slot),
|
|
||||||
fromNow = shortLog(fromNow),
|
|
||||||
cat = "scheduling"
|
|
||||||
|
|
||||||
await sleepAsync(fromNow)
|
# Time passed - we might need to select a new head in that case
|
||||||
|
head = node.updateHead(slot)
|
||||||
|
|
||||||
# Time passed - we might need to select a new head in that case
|
handleAttestations(node, head, slot)
|
||||||
head = node.updateHead(slot)
|
|
||||||
|
|
||||||
handleAttestations(node, head, slot)
|
|
||||||
|
|
||||||
# TODO ... and beacon clock might jump here also. sigh.
|
# TODO ... and beacon clock might jump here also. sigh.
|
||||||
let
|
let
|
||||||
|
@ -764,12 +799,28 @@ proc onSlotStart(node: BeaconNode, lastSlot, scheduledSlot: Slot) {.gcsafe, asyn
|
||||||
addTimer(nextSlotStart) do (p: pointer):
|
addTimer(nextSlotStart) do (p: pointer):
|
||||||
asyncCheck node.onSlotStart(slot, nextSlot)
|
asyncCheck node.onSlotStart(slot, nextSlot)
|
||||||
|
|
||||||
proc onSecond(node: BeaconNode, moment: Moment) {.async.} =
|
proc handleMissingBlocks(node: BeaconNode) =
|
||||||
let missingBlocks = node.blockPool.checkMissing()
|
let missingBlocks = node.blockPool.checkMissing()
|
||||||
if missingBlocks.len > 0:
|
if missingBlocks.len > 0:
|
||||||
|
var left = missingBlocks.len
|
||||||
|
|
||||||
info "Requesting detected missing blocks", missingBlocks
|
info "Requesting detected missing blocks", missingBlocks
|
||||||
node.requestManager.fetchAncestorBlocks(missingBlocks) do (b: BeaconBlock):
|
node.requestManager.fetchAncestorBlocks(missingBlocks) do (b: BeaconBlock):
|
||||||
onBeaconBlock(node ,b)
|
onBeaconBlock(node, b)
|
||||||
|
|
||||||
|
# TODO instead of waiting for a full second to try the next missing block
|
||||||
|
# fetching, we'll do it here again in case we get all blocks we asked
|
||||||
|
# for (there might be new parents to fetch). of course, this is not
|
||||||
|
# good because the onSecond fetching also kicks in regardless but
|
||||||
|
# whatever - this is just a quick fix for making the testnet easier
|
||||||
|
# work with while the sync problem is dealt with more systematically
|
||||||
|
dec left
|
||||||
|
if left == 0:
|
||||||
|
addTimer(Moment.now()) do (p: pointer):
|
||||||
|
handleMissingBlocks(node)
|
||||||
|
|
||||||
|
proc onSecond(node: BeaconNode, moment: Moment) {.async.} =
|
||||||
|
node.handleMissingBlocks()
|
||||||
|
|
||||||
let nextSecond = max(Moment.now(), moment + chronos.seconds(1))
|
let nextSecond = max(Moment.now(), moment + chronos.seconds(1))
|
||||||
addTimer(nextSecond) do (p: pointer):
|
addTimer(nextSecond) do (p: pointer):
|
||||||
|
|
Loading…
Reference in New Issue