avoid producing blocks/attestations when out of sync

this is a temporary measure until we figure something better out - as it
stands, we'll advance with empty slots and crash because all validators
are out.
This commit is contained in:
Jacek Sieka 2019-12-02 15:42:57 +01:00 committed by Dustin Brody
parent fd55096ec3
commit 90212eed2a
1 changed files with 106 additions and 55 deletions

View File

@ -19,6 +19,7 @@ const
dataDirValidators = "validators"
genesisFile = "genesis.ssz"
hasPrompt = not defined(withoutPrompt)
maxEmptySlotCount = uint64(24*60*60) div SECONDS_PER_SLOT
# https://github.com/ethereum/eth2.0-metrics/blob/master/metrics.md#interop-metrics
declareGauge beacon_slot,
@ -297,6 +298,33 @@ func getAttachedValidator(node: BeaconNode,
let validatorKey = state.validators[idx].pubkey
node.attachedValidators.getValidator(validatorKey)
proc isSynced(node: BeaconNode, head: BlockRef): bool =
## TODO This function is here as a placeholder for some better heurestics to
## determine if we're in sync and should be producing blocks and
## attestations. Generally, the problem is that slot time keeps advancing
## even when there are no blocks being produced, so there's no way to
## distinguish validators geniunely going missing from the node not being
## well connected (during a network split or an internet outage for
## example). It would generally be correct to simply keep running as if
## we were the only legit node left alive, but then we run into issues:
## with enough many empty slots, the validator pool is emptied leading
## to empty committees and lots of empty slot processing that will be
## thrown away as soon as we're synced again.
let
# The slot we should be at, according to the clock
beaconTime = node.beaconClock.now()
wallSlot = beaconTime.toSlot()
# TODO if everyone follows this logic, the network will not recover from a
# halt: nobody will be producing blocks because everone expects someone
# else to do it
if wallSlot.afterGenesis and (wallSlot.slot > head.slot) and
(wallSlot.slot - head.slot) > maxEmptySlotCount:
false
else:
true
proc updateHead(node: BeaconNode, slot: Slot): BlockRef =
# Use head state for attestation resolution below
@ -484,11 +512,16 @@ proc onAttestation(node: BeaconNode, attestation: Attestation) =
# though - maybe we should use the state from the block pointed to by
# the attestation for some of the check? Consider interop with block
# production!
let
bs = BlockSlot(blck: head.blck, slot: wallSlot.slot)
if attestation.data.slot > head.blck.slot and
(attestation.data.slot - head.blck.slot) > maxEmptySlotCount:
warn "Ignoring attestation, head block too old (out of sync?)",
attestationSlot = attestation.data.slot, headSlot = head.blck.slot
else:
let
bs = BlockSlot(blck: head.blck, slot: wallSlot.slot)
node.blockPool.withState(node.stateCache, bs):
node.attestationPool.add(state, attestedBlock, attestation)
node.blockPool.withState(node.stateCache, bs):
node.attestationPool.add(state, attestedBlock, attestation)
else:
node.attestationPool.addUnresolved(attestation)
@ -697,65 +730,67 @@ proc onSlotStart(node: BeaconNode, lastSlot, scheduledSlot: Slot) {.gcsafe, asyn
# disappear naturally - risky because user is not aware,
# and might lose stake on canonical chain but "just works"
# when reconnected..
# Right now, we keep going
if not node.isSynced(head):
warn "Node out of sync, skipping block and attestation production for this slot",
slot, headSlot = head.slot
else:
var curSlot = lastSlot + 1
while curSlot < slot:
# Timers may be delayed because we're busy processing, and we might have
# more work to do. We'll try to do so in an expedited way.
# TODO maybe even collect all work synchronously to avoid unnecessary
# state rewinds while waiting for async operations like validator
# signature..
notice "Catching up",
curSlot = shortLog(curSlot),
lastSlot = shortLog(lastSlot),
slot = shortLog(slot),
cat = "overload"
var curSlot = lastSlot + 1
while curSlot < slot:
# Timers may be delayed because we're busy processing, and we might have
# more work to do. We'll try to do so in an expedited way.
# TODO maybe even collect all work synchronously to avoid unnecessary
# state rewinds while waiting for async operations like validator
# signature..
notice "Catching up",
curSlot = shortLog(curSlot),
lastSlot = shortLog(lastSlot),
slot = shortLog(slot),
cat = "overload"
# For every slot we're catching up, we'll propose then send
# attestations - head should normally be advancing along the same branch
# in this case
# TODO what if we receive blocks / attestations while doing this work?
head = await handleProposal(node, head, curSlot)
# For every slot we're catching up, we'll propose then send
# attestations - head should normally be advancing along the same branch
# in this case
# TODO what if we receive blocks / attestations while doing this work?
head = await handleProposal(node, head, curSlot)
# For each slot we missed, we need to send out attestations - if we were
# proposing during this time, we'll use the newly proposed head, else just
# keep reusing the same - the attestation that goes out will actually
# rewind the state to what it looked like at the time of that slot
# TODO smells like there's an optimization opportunity here
handleAttestations(node, head, curSlot)
# For each slot we missed, we need to send out attestations - if we were
# proposing during this time, we'll use the newly proposed head, else just
# keep reusing the same - the attestation that goes out will actually
# rewind the state to what it looked like at the time of that slot
# TODO smells like there's an optimization opportunity here
handleAttestations(node, head, curSlot)
curSlot += 1
curSlot += 1
head = await handleProposal(node, head, slot)
head = await handleProposal(node, head, slot)
# We've been doing lots of work up until now which took time. Normally, we
# send out attestations at the slot mid-point, so we go back to the clock
# to see how much time we need to wait.
# TODO the beacon clock might jump here also. It's probably easier to complete
# the work for the whole slot using a monotonic clock instead, then deal
# with any clock discrepancies once only, at the start of slot timer
# processing..
let
attestationStart = node.beaconClock.fromNow(slot)
halfSlot = seconds(int64(SECONDS_PER_SLOT div 2))
# We've been doing lots of work up until now which took time. Normally, we
# send out attestations at the slot mid-point, so we go back to the clock
# to see how much time we need to wait.
# TODO the beacon clock might jump here also. It's probably easier to complete
# the work for the whole slot using a monotonic clock instead, then deal
# with any clock discrepancies once only, at the start of slot timer
# processing..
let
attestationStart = node.beaconClock.fromNow(slot)
halfSlot = seconds(int64(SECONDS_PER_SLOT div 2))
if attestationStart.inFuture or attestationStart.offset <= halfSlot:
let fromNow =
if attestationStart.inFuture: attestationStart.offset + halfSlot
else: halfSlot - attestationStart.offset
if attestationStart.inFuture or attestationStart.offset <= halfSlot:
let fromNow =
if attestationStart.inFuture: attestationStart.offset + halfSlot
else: halfSlot - attestationStart.offset
trace "Waiting to send attestations",
slot = shortLog(slot),
fromNow = shortLog(fromNow),
cat = "scheduling"
trace "Waiting to send attestations",
slot = shortLog(slot),
fromNow = shortLog(fromNow),
cat = "scheduling"
await sleepAsync(fromNow)
await sleepAsync(fromNow)
# Time passed - we might need to select a new head in that case
head = node.updateHead(slot)
# Time passed - we might need to select a new head in that case
head = node.updateHead(slot)
handleAttestations(node, head, slot)
handleAttestations(node, head, slot)
# TODO ... and beacon clock might jump here also. sigh.
let
@ -764,12 +799,28 @@ proc onSlotStart(node: BeaconNode, lastSlot, scheduledSlot: Slot) {.gcsafe, asyn
addTimer(nextSlotStart) do (p: pointer):
asyncCheck node.onSlotStart(slot, nextSlot)
proc onSecond(node: BeaconNode, moment: Moment) {.async.} =
proc handleMissingBlocks(node: BeaconNode) =
let missingBlocks = node.blockPool.checkMissing()
if missingBlocks.len > 0:
var left = missingBlocks.len
info "Requesting detected missing blocks", missingBlocks
node.requestManager.fetchAncestorBlocks(missingBlocks) do (b: BeaconBlock):
onBeaconBlock(node ,b)
onBeaconBlock(node, b)
# TODO instead of waiting for a full second to try the next missing block
# fetching, we'll do it here again in case we get all blocks we asked
# for (there might be new parents to fetch). of course, this is not
# good because the onSecond fetching also kicks in regardless but
# whatever - this is just a quick fix for making the testnet easier
# work with while the sync problem is dealt with more systematically
dec left
if left == 0:
addTimer(Moment.now()) do (p: pointer):
handleMissingBlocks(node)
proc onSecond(node: BeaconNode, moment: Moment) {.async.} =
node.handleMissingBlocks()
let nextSecond = max(Moment.now(), moment + chronos.seconds(1))
addTimer(nextSecond) do (p: pointer):