avoid producing blocks/attestations when out of sync

this is a temporary measure until we figure out something better - as it
stands, we'll advance with empty slots and crash because all validators
are out.
This commit is contained in:
Jacek Sieka 2019-12-02 15:42:57 +01:00 committed by Dustin Brody
parent fd55096ec3
commit 90212eed2a
1 changed files with 106 additions and 55 deletions

View File

@ -19,6 +19,7 @@ const
dataDirValidators = "validators" dataDirValidators = "validators"
genesisFile = "genesis.ssz" genesisFile = "genesis.ssz"
hasPrompt = not defined(withoutPrompt) hasPrompt = not defined(withoutPrompt)
maxEmptySlotCount = uint64(24*60*60) div SECONDS_PER_SLOT
# https://github.com/ethereum/eth2.0-metrics/blob/master/metrics.md#interop-metrics # https://github.com/ethereum/eth2.0-metrics/blob/master/metrics.md#interop-metrics
declareGauge beacon_slot, declareGauge beacon_slot,
@ -297,6 +298,33 @@ func getAttachedValidator(node: BeaconNode,
let validatorKey = state.validators[idx].pubkey let validatorKey = state.validators[idx].pubkey
node.attachedValidators.getValidator(validatorKey) node.attachedValidators.getValidator(validatorKey)
proc isSynced(node: BeaconNode, head: BlockRef): bool =
  ## TODO This is a placeholder until a better heuristic exists for deciding
  ## whether the node is in sync and should be producing blocks and
  ## attestations. The difficulty is that slot time keeps advancing even when
  ## no blocks are being produced, so validators genuinely going missing
  ## cannot be told apart from this node being poorly connected (during a
  ## network split or an internet outage, for example). Carrying on as if we
  ## were the only legit node left alive would generally be correct, but it
  ## causes problems: after enough empty slots the validator pool is emptied,
  ## leading to empty committees and lots of empty slot processing that is
  ## thrown away as soon as we're synced again.
  ##
  ## Returns `false` only when `wallSlot.afterGenesis` holds and the wall
  ## clock slot is more than `maxEmptySlotCount` slots ahead of `head.slot`;
  ## returns `true` in every other case.
  let
    # The slot we should be at, according to the clock
    clockNow = node.beaconClock.now()
    wallSlot = clockNow.toSlot()

  # TODO if everyone follows this logic, the network will not recover from a
  #      halt: nobody will be producing blocks because everyone expects
  #      someone else to do it
  let headTooOld =
    wallSlot.afterGenesis and
    (wallSlot.slot > head.slot) and
    ((wallSlot.slot - head.slot) > maxEmptySlotCount)

  not headTooOld
proc updateHead(node: BeaconNode, slot: Slot): BlockRef = proc updateHead(node: BeaconNode, slot: Slot): BlockRef =
# Use head state for attestation resolution below # Use head state for attestation resolution below
@ -484,11 +512,16 @@ proc onAttestation(node: BeaconNode, attestation: Attestation) =
# though - maybe we should use the state from the block pointed to by # though - maybe we should use the state from the block pointed to by
# the attestation for some of the check? Consider interop with block # the attestation for some of the check? Consider interop with block
# production! # production!
let if attestation.data.slot > head.blck.slot and
bs = BlockSlot(blck: head.blck, slot: wallSlot.slot) (attestation.data.slot - head.blck.slot) > maxEmptySlotCount:
warn "Ignoring attestation, head block too old (out of sync?)",
attestationSlot = attestation.data.slot, headSlot = head.blck.slot
else:
let
bs = BlockSlot(blck: head.blck, slot: wallSlot.slot)
node.blockPool.withState(node.stateCache, bs): node.blockPool.withState(node.stateCache, bs):
node.attestationPool.add(state, attestedBlock, attestation) node.attestationPool.add(state, attestedBlock, attestation)
else: else:
node.attestationPool.addUnresolved(attestation) node.attestationPool.addUnresolved(attestation)
@ -697,65 +730,67 @@ proc onSlotStart(node: BeaconNode, lastSlot, scheduledSlot: Slot) {.gcsafe, asyn
# disappear naturally - risky because user is not aware, # disappear naturally - risky because user is not aware,
# and might lose stake on canonical chain but "just works" # and might lose stake on canonical chain but "just works"
# when reconnected.. # when reconnected..
# Right now, we keep going if not node.isSynced(head):
warn "Node out of sync, skipping block and attestation production for this slot",
slot, headSlot = head.slot
else:
var curSlot = lastSlot + 1
while curSlot < slot:
# Timers may be delayed because we're busy processing, and we might have
# more work to do. We'll try to do so in an expedited way.
# TODO maybe even collect all work synchronously to avoid unnecessary
# state rewinds while waiting for async operations like validator
# signature..
notice "Catching up",
curSlot = shortLog(curSlot),
lastSlot = shortLog(lastSlot),
slot = shortLog(slot),
cat = "overload"
var curSlot = lastSlot + 1 # For every slot we're catching up, we'll propose then send
while curSlot < slot: # attestations - head should normally be advancing along the same branch
# Timers may be delayed because we're busy processing, and we might have # in this case
# more work to do. We'll try to do so in an expedited way. # TODO what if we receive blocks / attestations while doing this work?
# TODO maybe even collect all work synchronously to avoid unnecessary head = await handleProposal(node, head, curSlot)
# state rewinds while waiting for async operations like validator
# signature..
notice "Catching up",
curSlot = shortLog(curSlot),
lastSlot = shortLog(lastSlot),
slot = shortLog(slot),
cat = "overload"
# For every slot we're catching up, we'll propose then send # For each slot we missed, we need to send out attestations - if we were
# attestations - head should normally be advancing along the same branch # proposing during this time, we'll use the newly proposed head, else just
# in this case # keep reusing the same - the attestation that goes out will actually
# TODO what if we receive blocks / attestations while doing this work? # rewind the state to what it looked like at the time of that slot
head = await handleProposal(node, head, curSlot) # TODO smells like there's an optimization opportunity here
handleAttestations(node, head, curSlot)
# For each slot we missed, we need to send out attestations - if we were curSlot += 1
# proposing during this time, we'll use the newly proposed head, else just
# keep reusing the same - the attestation that goes out will actually
# rewind the state to what it looked like at the time of that slot
# TODO smells like there's an optimization opportunity here
handleAttestations(node, head, curSlot)
curSlot += 1 head = await handleProposal(node, head, slot)
head = await handleProposal(node, head, slot) # We've been doing lots of work up until now which took time. Normally, we
# send out attestations at the slot mid-point, so we go back to the clock
# to see how much time we need to wait.
# TODO the beacon clock might jump here also. It's probably easier to complete
# the work for the whole slot using a monotonic clock instead, then deal
# with any clock discrepancies once only, at the start of slot timer
# processing..
let
attestationStart = node.beaconClock.fromNow(slot)
halfSlot = seconds(int64(SECONDS_PER_SLOT div 2))
# We've been doing lots of work up until now which took time. Normally, we if attestationStart.inFuture or attestationStart.offset <= halfSlot:
# send out attestations at the slot mid-point, so we go back to the clock let fromNow =
# to see how much time we need to wait. if attestationStart.inFuture: attestationStart.offset + halfSlot
# TODO the beacon clock might jump here also. It's probably easier to complete else: halfSlot - attestationStart.offset
# the work for the whole slot using a monotonic clock instead, then deal
# with any clock discrepancies once only, at the start of slot timer
# processing..
let
attestationStart = node.beaconClock.fromNow(slot)
halfSlot = seconds(int64(SECONDS_PER_SLOT div 2))
if attestationStart.inFuture or attestationStart.offset <= halfSlot: trace "Waiting to send attestations",
let fromNow = slot = shortLog(slot),
if attestationStart.inFuture: attestationStart.offset + halfSlot fromNow = shortLog(fromNow),
else: halfSlot - attestationStart.offset cat = "scheduling"
trace "Waiting to send attestations", await sleepAsync(fromNow)
slot = shortLog(slot),
fromNow = shortLog(fromNow),
cat = "scheduling"
await sleepAsync(fromNow) # Time passed - we might need to select a new head in that case
head = node.updateHead(slot)
# Time passed - we might need to select a new head in that case handleAttestations(node, head, slot)
head = node.updateHead(slot)
handleAttestations(node, head, slot)
# TODO ... and beacon clock might jump here also. sigh. # TODO ... and beacon clock might jump here also. sigh.
let let
@ -764,12 +799,28 @@ proc onSlotStart(node: BeaconNode, lastSlot, scheduledSlot: Slot) {.gcsafe, asyn
addTimer(nextSlotStart) do (p: pointer): addTimer(nextSlotStart) do (p: pointer):
asyncCheck node.onSlotStart(slot, nextSlot) asyncCheck node.onSlotStart(slot, nextSlot)
proc onSecond(node: BeaconNode, moment: Moment) {.async.} = proc handleMissingBlocks(node: BeaconNode) =
let missingBlocks = node.blockPool.checkMissing() let missingBlocks = node.blockPool.checkMissing()
if missingBlocks.len > 0: if missingBlocks.len > 0:
var left = missingBlocks.len
info "Requesting detected missing blocks", missingBlocks info "Requesting detected missing blocks", missingBlocks
node.requestManager.fetchAncestorBlocks(missingBlocks) do (b: BeaconBlock): node.requestManager.fetchAncestorBlocks(missingBlocks) do (b: BeaconBlock):
onBeaconBlock(node ,b) onBeaconBlock(node, b)
# TODO instead of waiting for a full second to try the next missing block
# fetching, we'll do it here again in case we get all blocks we asked
# for (there might be new parents to fetch). of course, this is not
# good because the onSecond fetching also kicks in regardless but
# whatever - this is just a quick fix for making the testnet easier to
# work with while the sync problem is dealt with more systematically
dec left
if left == 0:
addTimer(Moment.now()) do (p: pointer):
handleMissingBlocks(node)
proc onSecond(node: BeaconNode, moment: Moment) {.async.} =
node.handleMissingBlocks()
let nextSecond = max(Moment.now(), moment + chronos.seconds(1)) let nextSecond = max(Moment.now(), moment + chronos.seconds(1))
addTimer(nextSecond) do (p: pointer): addTimer(nextSecond) do (p: pointer):