From c014f0b3018bba51edc5ff163145e1a5e847c1c0 Mon Sep 17 00:00:00 2001 From: Mamy Ratsimbazafy Date: Tue, 19 May 2020 16:18:07 +0200 Subject: [PATCH] Split quarantine (#1038) * split blockpool into hotDB and Quarantine * Rename hotdb -> dag/candidate chains --- beacon_chain/beacon_node_types.nim | 148 +- beacon_chain/block_pool.nim | 1186 ++--------------- .../block_pools/block_pools_types.nim | 168 +++ beacon_chain/block_pools/candidate_chains.nim | 841 ++++++++++++ beacon_chain/block_pools/clearance.nim | 351 +++++ beacon_chain/block_pools/quarantine.nim | 38 + 6 files changed, 1492 insertions(+), 1240 deletions(-) create mode 100644 beacon_chain/block_pools/block_pools_types.nim create mode 100644 beacon_chain/block_pools/candidate_chains.nim create mode 100644 beacon_chain/block_pools/clearance.nim create mode 100644 beacon_chain/block_pools/quarantine.nim diff --git a/beacon_chain/beacon_node_types.nim b/beacon_chain/beacon_node_types.nim index 12e70dd07..bcb6d685b 100644 --- a/beacon_chain/beacon_node_types.nim +++ b/beacon_chain/beacon_node_types.nim @@ -4,7 +4,11 @@ import deques, tables, stew/[endians2, byteutils], chronicles, spec/[datatypes, crypto, digest], - beacon_chain_db, extras + beacon_chain_db, extras, + block_pools/block_pools_types, + block_pool # TODO: refactoring compat shim + +export block_pools_types type # ############################################# @@ -72,132 +76,6 @@ type ## Map that keeps track of the most recent vote of each attester - see ## fork_choice - # ############################################# - # - # Block Pool - # - # ############################################# - BlockPool* = ref object - ## Pool of blocks responsible for keeping a graph of resolved blocks as well - ## as candidates that may yet become part of that graph. - ## Currently, this type works as a facade to the BeaconChainDB, making - ## assumptions about the block composition therein. - ## - ## The general idea here is that blocks known to us are divided into two - ## camps - unresolved and resolved. When we start the chain, we have a - ## genesis state that serves as the root of the graph we're interested in. - ## Every block that belongs to that chain will have a path to that block - - ## conversely, blocks that do not are not interesting to us. - ## - ## As the chain progresses, some states become finalized as part of the - ## consensus process. One way to think of that is that the blocks that - ## come before them are no longer relevant, and the finalized state - ## is the new genesis from which we build. Thus, instead of tracing a path - ## to genesis, we can trace a path to any finalized block that follows - we - ## call the oldest such block a tail block. - ## - ## It's important to note that blocks may arrive in any order due to - ## chainging network conditions - we counter this by buffering unresolved - ## blocks for some time while trying to establish a path. - ## - ## Once a path is established, the block becomes resolved. We store the - ## graph in memory, in the form of BlockRef objects. 
This is also when - ## we forward the block for storage in the database - ## - ## TODO evaluate the split of responsibilities between the two - ## TODO prune the graph as tail moves - - pending*: Table[Eth2Digest, SignedBeaconBlock] ##\ - ## Blocks that have passed validation but that we lack a link back to tail - ## for - when we receive a "missing link", we can use this data to build - ## an entire branch - - missing*: Table[Eth2Digest, MissingBlock] ##\ - ## Roots of blocks that we would like to have (either parent_root of - ## unresolved blocks or block roots of attestations) - - blocks*: Table[Eth2Digest, BlockRef] ##\ - ## Tree of blocks pointing back to a finalized block on the chain we're - ## interested in - we call that block the tail - - tail*: BlockRef ##\ - ## The earliest finalized block we know about - - head*: Head ##\ - ## The latest block we know about, that's been chosen as a head by the fork - ## choice rule - - finalizedHead*: BlockSlot ##\ - ## The latest block that was finalized according to the block in head - ## Ancestors of this block are guaranteed to have 1 child only. - - db*: BeaconChainDB - - cachedStates*: seq[tuple[blockRoot: Eth2Digest, slot: Slot, - state: ref HashedBeaconState]] - - heads*: seq[Head] - - inAdd*: bool - - headState*: StateData ##\ - ## State given by the head block; only update in `updateHead`, not anywhere - ## else via `withState` - - justifiedState*: StateData ## Latest justified state, as seen from the head - - tmpState*: StateData ## Scratchpad - may be any state - - updateFlags*: UpdateFlags - - MissingBlock* = object - slots*: uint64 # number of slots that are suspected missing - tries*: int - - BlockRef* = ref object - ## Node in object graph guaranteed to lead back to tail block, and to have - ## a corresponding entry in database. - ## Block graph should form a tree - in particular, there are no cycles. - - root*: Eth2Digest ##\ - ## Root that can be used to retrieve block data from database - - parent*: BlockRef ##\ - ## Not nil, except for the tail - - children*: seq[BlockRef] - # TODO do we strictly need this? - - slot*: Slot # TODO could calculate this by walking to root, but.. - - BlockData* = object - ## Body and graph in one - - data*: SignedBeaconBlock - refs*: BlockRef - - StateData* = object - data*: HashedBeaconState - - blck*: BlockRef ##\ - ## The block associated with the state found in data - normally - ## `blck.state_root == data.root` but the state might have been advanced - ## further with empty slots invalidating this condition. - - BlockSlot* = object - ## Unique identifier for a particular fork and time in the block chain - - ## normally, there's a block for every slot, but in the case a block is not - ## produced, the chain progresses anyway, producing a new state for every - ## slot. 
- blck*: BlockRef - slot*: Slot ##\ - ## Slot time for this BlockSlot which may differ from blck.slot when time - ## has advanced without blocks - - Head* = object - blck*: BlockRef - justified*: BlockSlot - # ############################################# # # Validator Pool @@ -221,20 +99,4 @@ type ValidatorPool* = object validators*: Table[ValidatorPubKey, AttachedValidator] - FetchRecord* = object - root*: Eth2Digest - historySlots*: uint64 - proc shortLog*(v: AttachedValidator): string = shortLog(v.pubKey) - -proc shortLog*(v: BlockSlot): string = - if v.blck.slot == v.slot: - v.blck.root.data[0..3].toHex() & ":" & $v.blck.slot - else: # There was a gap - log it - v.blck.root.data[0..3].toHex() & ":" & $v.blck.slot & "@" & $v.slot - -proc shortLog*(v: BlockRef): string = - v.root.data[0..3].toHex() & ":" & $v.slot - -chronicles.formatIt BlockSlot: shortLog(it) -chronicles.formatIt BlockRef: shortLog(it) diff --git a/beacon_chain/block_pool.nim b/beacon_chain/block_pool.nim index 342223c81..60e82b413 100644 --- a/beacon_chain/block_pool.nim +++ b/beacon_chain/block_pool.nim @@ -5,549 +5,71 @@ # * Apache v2 license (license terms in the root directory or at https://www.apache.org/licenses/LICENSE-2.0). # at your option. This file may not be copied, modified, or distributed except according to those terms. -{.push raises: [Defect].} +import + extras, beacon_chain_db, + spec/[crypto, datatypes, digest] + import - chronicles, options, tables, - stew/[bitops2, results], ssz, beacon_chain_db, state_transition, extras, eth/db/kvstore, - beacon_node_types, metrics, - spec/[crypto, datatypes, digest, helpers, validator] + block_pools/[block_pools_types, clearance, candidate_chains, quarantine] -declareCounter beacon_reorgs_total, "Total occurrences of reorganizations of the chain" # On fork choice -declareCounter beacon_state_data_cache_hits, "pool.cachedStates hits" -declareCounter beacon_state_data_cache_misses, "pool.cachedStates misses" +# Block_Pools +# -------------------------------------------- +# +# Compatibility shims to minimize PR breakage +# during block_pool refactor -logScope: topics = "blkpool" +type + BlockPools* = object + # TODO: Rename BlockPools + quarantine: Quarantine + dag: CandidateChains -proc updateStateData*( - pool: BlockPool, state: var StateData, bs: BlockSlot) {.gcsafe.} -proc add*( - pool: var BlockPool, blockRoot: Eth2Digest, - signedBlock: SignedBeaconBlock): BlockRef {.gcsafe.} + BlockPool* = BlockPools -template withState*( - pool: BlockPool, cache: var StateData, blockSlot: BlockSlot, body: untyped): untyped = - ## Helper template that updates state to a particular BlockSlot - usage of - ## cache is unsafe outside of block. - ## TODO async transformations will lead to a race where cache gets updated - ## while waiting for future to complete - catch this here somehow? 
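
# Illustrative sketch - typical use of the `withState` template above. The
# `state` and `blck` identifiers are injected by the template; the proc name
# and its parameters are hypothetical and not part of this patch.
proc exampleStateAccess(pool: BlockPool, bs: BlockSlot): Slot =
  pool.withState(pool.tmpState, bs):
    # Inside the block, the scratchpad state has been rewound/advanced to `bs`
    doAssert blck == bs.blck
    result = state.slot
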
+{.push raises: [Defect], inline.} - updateStateData(pool, cache, blockSlot) +# Quarantine dispatch +# -------------------------------------------- - template hashedState(): HashedBeaconState {.inject, used.} = cache.data - template state(): BeaconState {.inject, used.} = cache.data.data - template blck(): BlockRef {.inject, used.} = cache.blck - template root(): Eth2Digest {.inject, used.} = cache.data.root +func checkMissing*(pool: var BlockPool): seq[FetchRecord] {.noInit.} = + checkMissing(pool.quarantine) - body +# CandidateChains +# -------------------------------------------- -func parent*(bs: BlockSlot): BlockSlot = - ## Return a blockslot representing the previous slot, using the parent block - ## if the current slot had a block - if bs.slot == Slot(0): - BlockSlot(blck: nil, slot: Slot(0)) - else: - BlockSlot( - blck: if bs.slot > bs.blck.slot: bs.blck else: bs.blck.parent, - slot: bs.slot - 1 - ) +template tail*(pool: BlockPool): BlockRef = + pool.dag.tail -func link(parent, child: BlockRef) = - doAssert (not (parent.root == Eth2Digest() or child.root == Eth2Digest())), - "blocks missing root!" - doAssert parent.root != child.root, "self-references not allowed" +template heads*(pool: BlockPool): seq[Head] = + pool.dag.heads - child.parent = parent - parent.children.add(child) +template head*(pool: BlockPool): Head = + pool.dag.head -func isAncestorOf*(a, b: BlockRef): bool = - var b = b - var depth = 0 - const maxDepth = (100'i64 * 365 * 24 * 60 * 60 div SECONDS_PER_SLOT.int) - while true: - if a == b: return true +template finalizedHead*(pool: BlockPool): BlockSlot = + pool.dag.finalizedHead - # for now, use an assert for block chain length since a chain this long - # indicates a circular reference here.. - doAssert depth < maxDepth - depth += 1 +proc add*(pool: var BlockPool, blockRoot: Eth2Digest, + signedBlock: SignedBeaconBlock): BlockRef {.gcsafe.} = + add(pool.dag, pool.quarantine, blockRoot, signedBlock) - if a.slot >= b.slot or b.parent.isNil: - return false +export parent # func parent*(bs: BlockSlot): BlockSlot +export isAncestorOf # func isAncestorOf*(a, b: BlockRef): bool +export getAncestorAt # func isAncestorOf*(a, b: BlockRef): bool +export get_ancestor # func get_ancestor*(blck: BlockRef, slot: Slot): BlockRef +export atSlot # func atSlot*(blck: BlockRef, slot: Slot): BlockSlot - doAssert b.slot > b.parent.slot - b = b.parent -func getAncestorAt*(blck: BlockRef, slot: Slot): BlockRef = - ## Return the most recent block as of the time at `slot` that not more recent - ## than `blck` itself +proc init*(T: type BlockPools, db: BeaconChainDB, + updateFlags: UpdateFlags = {}): BlockPools = + result.dag = init(CandidateChains, db, updateFlags) - var blck = blck - - var depth = 0 - const maxDepth = (100'i64 * 365 * 24 * 60 * 60 div SECONDS_PER_SLOT.int) - - while true: - if blck.slot <= slot: - return blck - - if blck.parent.isNil: - return nil - - doAssert depth < maxDepth - depth += 1 - - blck = blck.parent - -func get_ancestor*(blck: BlockRef, slot: Slot): BlockRef = - ## https://github.com/ethereum/eth2.0-specs/blob/v0.11.1/specs/phase0/fork-choice.md#get_ancestor - ## Return ancestor at slot, or nil if queried block is older - var blck = blck - - var depth = 0 - const maxDepth = (100'i64 * 365 * 24 * 60 * 60 div SECONDS_PER_SLOT.int) - - while true: - if blck.slot == slot: - return blck - - if blck.slot < slot: - return nil - - if blck.parent.isNil: - return nil - - doAssert depth < maxDepth - depth += 1 - - blck = blck.parent - -func atSlot*(blck: BlockRef, slot: 
Slot): BlockSlot = - ## Return a BlockSlot at a given slot, with the block set to the closest block - ## available. If slot comes from before the block, a suitable block ancestor - ## will be used, else blck is returned as if all slots after it were empty. - ## This helper is useful when imagining what the chain looked like at a - ## particular moment in time, or when imagining what it will look like in the - ## near future if nothing happens (such as when looking ahead for the next - ## block proposal) - BlockSlot(blck: blck.getAncestorAt(slot), slot: slot) - -func init*(T: type BlockRef, root: Eth2Digest, slot: Slot): BlockRef = - BlockRef( - root: root, - slot: slot - ) - -func init*(T: type BlockRef, root: Eth2Digest, blck: BeaconBlock): BlockRef = - BlockRef.init(root, blck.slot) - -proc init*(T: type BlockPool, db: BeaconChainDB, - updateFlags: UpdateFlags = {}): BlockPool = - # TODO we require that the db contains both a head and a tail block - - # asserting here doesn't seem like the right way to go about it however.. - - let - tailBlockRoot = db.getTailBlock() - headBlockRoot = db.getHeadBlock() - - doAssert tailBlockRoot.isSome(), "Missing tail block, database corrupt?" - doAssert headBlockRoot.isSome(), "Missing head block, database corrupt?" - - let - tailRoot = tailBlockRoot.get() - tailBlock = db.getBlock(tailRoot).get() - tailRef = BlockRef.init(tailRoot, tailBlock.message) - headRoot = headBlockRoot.get() - - var - blocks = {tailRef.root: tailRef}.toTable() - headRef: BlockRef - - if headRoot != tailRoot: - var curRef: BlockRef - - for root, blck in db.getAncestors(headRoot): - if root == tailRef.root: - doAssert(not curRef.isNil) - link(tailRef, curRef) - curRef = curRef.parent - break - - let newRef = BlockRef.init(root, blck.message) - if curRef == nil: - curRef = newRef - headRef = newRef - else: - link(newRef, curRef) - curRef = curRef.parent - blocks[curRef.root] = curRef - trace "Populating block pool", key = curRef.root, val = curRef - - doAssert curRef == tailRef, - "head block does not lead to tail, database corrupt?" - else: - headRef = tailRef - - var - bs = headRef.atSlot(headRef.slot) - tmpState = (ref StateData)() - - # Now that we have a head block, we need to find the most recent state that - # we have saved in the database - while bs.blck != nil: - let root = db.getStateRoot(bs.blck.root, bs.slot) - if root.isSome(): - # TODO load StateData from BeaconChainDB - # We save state root separately for empty slots which means we might - # sometimes not find a state even though we saved its state root - if db.getState(root.get(), tmpState.data.data, noRollback): - tmpState.data.root = root.get() - tmpState.blck = bs.blck - - break - - bs = bs.parent() # Iterate slot by slot in case there's a gap! - - if tmpState.blck == nil: - warn "No state found in head history, database corrupt?" - # TODO Potentially we could recover from here instead of crashing - what - # would be a good recovery model? - raiseAssert "No state found in head history, database corrupt?" - - # We presently save states on the epoch boundary - it means that the latest - # state we loaded might be older than head block - nonetheless, it will be - # from the same epoch as the head, thus the finalized and justified slots are - # the same - these only change on epoch boundaries. 
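
# Illustrative note - the epoch/slot arithmetic the reasoning above relies on
# (spec definitions, not part of this patch): a checkpoint slot is always the
# first slot of its epoch,
#   compute_start_slot_at_epoch(epoch) == epoch * SLOTS_PER_EPOCH
#   compute_epoch_at_slot(slot)        == slot div SLOTS_PER_EPOCH
# so finalized/justified checkpoints can only move when an epoch boundary is
# crossed, and a state saved within the head's epoch carries the same
# checkpoints as the head itself.
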
- let - finalizedSlot = - tmpState.data.data.finalized_checkpoint.epoch.compute_start_slot_at_epoch() - finalizedHead = headRef.atSlot(finalizedSlot) - justifiedSlot = - tmpState.data.data.current_justified_checkpoint.epoch.compute_start_slot_at_epoch() - justifiedHead = headRef.atSlot(justifiedSlot) - head = Head(blck: headRef, justified: justifiedHead) - - doAssert justifiedHead.slot >= finalizedHead.slot, - "justified head comes before finalized head - database corrupt?" - - let res = BlockPool( - pending: initTable[Eth2Digest, SignedBeaconBlock](), - missing: initTable[Eth2Digest, MissingBlock](), - blocks: blocks, - tail: tailRef, - head: head, - finalizedHead: finalizedHead, - db: db, - heads: @[head], - headState: tmpState[], - justifiedState: tmpState[], # This is wrong but we'll update it below - tmpState: tmpState[], - - # The only allowed flag right now is verifyFinalization, as the others all - # allow skipping some validation. - updateFlags: {verifyFinalization} * updateFlags - ) - - doAssert res.updateFlags in [{}, {verifyFinalization}] - - res.updateStateData(res.justifiedState, justifiedHead) - res.updateStateData(res.headState, headRef.atSlot(headRef.slot)) - - info "Block pool initialized", - head = head.blck, justifiedHead, finalizedHead, tail = tailRef, - totalBlocks = blocks.len - - res - -proc addResolvedBlock( - pool: var BlockPool, state: BeaconState, blockRoot: Eth2Digest, - signedBlock: SignedBeaconBlock, parent: BlockRef): BlockRef = - logScope: pcs = "block_resolution" - doAssert state.slot == signedBlock.message.slot, "state must match block" - - let blockRef = BlockRef.init(blockRoot, signedBlock.message) - link(parent, blockRef) - - pool.blocks[blockRoot] = blockRef - trace "Populating block pool", key = blockRoot, val = blockRef - - # Resolved blocks should be stored in database - pool.db.putBlock(blockRoot, signedBlock) - - # This block *might* have caused a justification - make sure we stow away - # that information: - let justifiedSlot = - state.current_justified_checkpoint.epoch.compute_start_slot_at_epoch() - - var foundHead: Option[Head] - for head in pool.heads.mitems(): - if head.blck.isAncestorOf(blockRef): - if head.justified.slot != justifiedSlot: - head.justified = blockRef.atSlot(justifiedSlot) - - head.blck = blockRef - - foundHead = some(head) - break - - if foundHead.isNone(): - foundHead = some(Head( - blck: blockRef, - justified: blockRef.atSlot(justifiedSlot))) - pool.heads.add(foundHead.get()) - - info "Block resolved", - blck = shortLog(signedBlock.message), - blockRoot = shortLog(blockRoot), - justifiedHead = foundHead.get().justified, - heads = pool.heads.len(), - cat = "filtering" - - # Now that we have the new block, we should see if any of the previously - # unresolved blocks magically become resolved - # TODO there are more efficient ways of doing this that don't risk - # running out of stack etc - # TODO This code is convoluted because when there are more than ~1.5k - # blocks being synced, there's a stack overflow as `add` gets called - # for the whole chain of blocks. Instead we use this ugly field in `pool` - # which could be avoided by refactoring the code - if not pool.inAdd: - pool.inAdd = true - defer: pool.inAdd = false - var keepGoing = true - while keepGoing: - let retries = pool.pending - for k, v in retries: - discard pool.add(k, v) - # Keep going for as long as the pending pool is shrinking - # TODO inefficient! so what? 
- keepGoing = pool.pending.len < retries.len - blockRef - -proc getState( - pool: BlockPool, db: BeaconChainDB, stateRoot: Eth2Digest, blck: BlockRef, - output: var StateData): bool = - let outputAddr = unsafeAddr output # local scope - func restore(v: var BeaconState) = - if outputAddr == (unsafeAddr pool.headState): - # TODO seeing the headState in the restore shouldn't happen - we load - # head states only when updating the head position, and by that time - # the database will have gone through enough sanity checks that - # SSZ exceptions shouldn't happen, which is when restore happens. - # Nonetheless, this is an ugly workaround that needs to go away - doAssert false, "Cannot alias headState" - - outputAddr[] = pool.headState - - if not db.getState(stateRoot, output.data.data, restore): - return false - - output.blck = blck - output.data.root = stateRoot - - true - -func getStateCacheIndex(pool: BlockPool, blockRoot: Eth2Digest, slot: Slot): int = - for i, cachedState in pool.cachedStates: - let (cacheBlockRoot, cacheSlot, state) = cachedState - if cacheBlockRoot == blockRoot and cacheSlot == slot: - return i - - -1 - -proc putState(pool: BlockPool, state: HashedBeaconState, blck: BlockRef) = - # TODO we save state at every epoch start but never remove them - we also - # potentially save multiple states per slot if reorgs happen, meaning - # we could easily see a state explosion - logScope: pcs = "save_state_at_epoch_start" - - var rootWritten = false - if state.data.slot != blck.slot: - # This is a state that was produced by a skip slot for which there is no - # block - we'll save the state root in the database in case we need to - # replay the skip - pool.db.putStateRoot(blck.root, state.data.slot, state.root) - rootWritten = true - - if state.data.slot.isEpoch: - if not pool.db.containsState(state.root): - info "Storing state", - blck = shortLog(blck), - stateSlot = shortLog(state.data.slot), - stateRoot = shortLog(state.root), - cat = "caching" - pool.db.putState(state.root, state.data) - if not rootWritten: - pool.db.putStateRoot(blck.root, state.data.slot, state.root) - - # Need to be able to efficiently access states for both attestation - # aggregation and to process block proposals going back to the last - # finalized slot. Ideally to avoid potential combinatiorial forking - # storage and/or memory constraints could CoW, up to and including, - # in particular, hash_tree_root() which is expensive to do 30 times - # since the previous epoch, to efficiently state_transition back to - # desired slot. However, none of that's in place, so there are both - # expensive, repeated BeaconState copies as well as computationally - # time-consuming-near-end-of-epoch hash tree roots. The latter are, - # effectively, naïvely O(n^2) in slot number otherwise, so when the - # slots become in the mid-to-high-20s it's spending all its time in - # pointlessly repeated calculations of prefix-state-transitions. An - # intermediate time/memory workaround involves storing only mapping - # between BlockRefs, or BlockSlots, and the BeaconState tree roots, - # but that still involves tens of megabytes worth of copying, along - # with the concomitant memory allocator and GC load. Instead, use a - # more memory-intensive (but more conceptually straightforward, and - # faster) strategy to just store, for the most recent slots. 
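
# Descriptive note (not part of this patch): `cachedStates` below is a flat
# seq of (blockRoot, slot, state) tuples - lookups are linear via
# getStateCacheIndex, new entries are inserted at the front and the oldest
# ones are popped off the back once MAX_CACHE_SIZE (32) entries are exceeded.
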
- let stateCacheIndex = pool.getStateCacheIndex(blck.root, state.data.slot) - if stateCacheIndex == -1: - # Could use a deque or similar, but want simpler structure, and the data - # items are small and few. - const MAX_CACHE_SIZE = 32 - insert(pool.cachedStates, (blck.root, state.data.slot, newClone(state))) - while pool.cachedStates.len > MAX_CACHE_SIZE: - discard pool.cachedStates.pop() - let cacheLen = pool.cachedStates.len - trace "BlockPool.putState(): state cache updated", cacheLen - doAssert cacheLen > 0 and cacheLen <= MAX_CACHE_SIZE - -proc add*( - pool: var BlockPool, blockRoot: Eth2Digest, - signedBlock: SignedBeaconBlock): BlockRef {.gcsafe.} = - ## return the block, if resolved... - ## the state parameter may be updated to include the given block, if - ## everything checks out - # TODO reevaluate passing the state in like this - let blck = signedBlock.message - doAssert blockRoot == hash_tree_root(blck) - - logScope: pcs = "block_addition" - - # Already seen this block?? - pool.blocks.withValue(blockRoot, blockRef): - debug "Block already exists", - blck = shortLog(blck), - blockRoot = shortLog(blockRoot), - cat = "filtering" - - return blockRef[] - - pool.missing.del(blockRoot) - - # If the block we get is older than what we finalized already, we drop it. - # One way this can happen is that we start resolving a block and finalization - # happens in the meantime - the block we requested will then be stale - # by the time it gets here. - if blck.slot <= pool.finalizedHead.slot: - debug "Old block, dropping", - blck = shortLog(blck), - finalizedHead = shortLog(pool.finalizedHead), - tail = shortLog(pool.tail), - blockRoot = shortLog(blockRoot), - cat = "filtering" - - return - - let parent = pool.blocks.getOrDefault(blck.parent_root) - - if parent != nil: - if parent.slot >= blck.slot: - # TODO Malicious block? inform peer pool? - notice "Invalid block slot", - blck = shortLog(blck), - blockRoot = shortLog(blockRoot), - parentBlock = shortLog(parent) - - return - - # The block might have been in either of pending or missing - we don't want - # any more work done on its behalf - pool.pending.del(blockRoot) - - # The block is resolved, now it's time to validate it to ensure that the - # blocks we add to the database are clean for the given state - - # TODO if the block is from the future, we should not be resolving it (yet), - # but maybe we should use it as a hint that our clock is wrong? - updateStateData( - pool, pool.tmpState, BlockSlot(blck: parent, slot: blck.slot - 1)) - - let - poolPtr = unsafeAddr pool # safe because restore is short-lived - func restore(v: var HashedBeaconState) = - # TODO address this ugly workaround - there should probably be a - # `state_transition` that takes a `StateData` instead and updates - # the block as well - doAssert v.addr == addr poolPtr.tmpState.data - poolPtr.tmpState = poolPtr.headState - - if not state_transition( - pool.tmpState.data, signedBlock, pool.updateFlags, restore): - # TODO find a better way to log all this block data - notice "Invalid block", - blck = shortLog(blck), - blockRoot = shortLog(blockRoot), - cat = "filtering" - - return - # Careful, tmpState.data has been updated but not blck - we need to create - # the BlockRef first! - pool.tmpState.blck = pool.addResolvedBlock( - pool.tmpState.data.data, blockRoot, signedBlock, parent) - pool.putState(pool.tmpState.data, pool.tmpState.blck) - - return pool.tmpState.blck - - # TODO already checked hash though? 
main reason to keep this is because - # the pending pool calls this function back later in a loop, so as long - # as pool.add(...) requires a SignedBeaconBlock, easier to keep them in - # pending too. - pool.pending[blockRoot] = signedBlock - - # TODO possibly, it makes sense to check the database - that would allow sync - # to simply fill up the database with random blocks the other clients - # think are useful - but, it would also risk filling the database with - # junk that's not part of the block graph - - if blck.parent_root in pool.missing or - blck.parent_root in pool.pending: - return - - # This is an unresolved block - put its parent on the missing list for now... - # TODO if we receive spam blocks, one heurestic to implement might be to wait - # for a couple of attestations to appear before fetching parents - this - # would help prevent using up network resources for spam - this serves - # two purposes: one is that attestations are likely to appear for the - # block only if it's valid / not spam - the other is that malicious - # validators that are not proposers can sign invalid blocks and send - # them out without penalty - but signing invalid attestations carries - # a risk of being slashed, making attestations a more valuable spam - # filter. - # TODO when we receive the block, we don't know how many others we're missing - # from that branch, so right now, we'll just do a blind guess - let parentSlot = blck.slot - 1 - - pool.missing[blck.parent_root] = MissingBlock( - slots: - # The block is at least two slots ahead - try to grab whole history - if parentSlot > pool.head.blck.slot: - parentSlot - pool.head.blck.slot - else: - # It's a sibling block from a branch that we're missing - fetch one - # epoch at a time - max(1.uint64, SLOTS_PER_EPOCH.uint64 - - (parentSlot.uint64 mod SLOTS_PER_EPOCH.uint64)) - ) - - debug "Unresolved block (parent missing)", - blck = shortLog(blck), - blockRoot = shortLog(blockRoot), - pending = pool.pending.len, - missing = pool.missing.len, - cat = "filtering" +export init # func init*(T: type BlockRef, root: Eth2Digest, blck: BeaconBlock): BlockRef func getRef*(pool: BlockPool, root: Eth2Digest): BlockRef = ## Retrieve a resolved block reference, if available - pool.blocks.getOrDefault(root, nil) + pool.dag.getRef(root) func getBlockRange*( pool: BlockPool, startSlot: Slot, skipStep: Natural, @@ -563,308 +85,30 @@ func getBlockRange*( ## at this index. ## ## If there were no blocks in the range, `output.len` will be returned. - let count = output.len - trace "getBlockRange entered", - head = shortLog(pool.head.blck.root), count, startSlot, skipStep - - let - skipStep = max(1, skipStep) # Treat 0 step as 1 - endSlot = startSlot + uint64(count * skipStep) - - var - b = pool.head.blck.atSlot(endSlot) - o = count - for i in 0.. 8: - done.add(k) - else: - inc v.tries - - for k in done: - # TODO Need to potentially remove from pool.pending - this is currently a - # memory leak here! - pool.missing.del(k) - - # simple (simplistic?) exponential backoff for retries.. 
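
# Note (not part of this patch): `countOnes(tries) == 1` below means `tries`
# is an exact power of two, so a FetchRecord is re-emitted on the 1st, 2nd,
# 4th and 8th attempt - i.e. progressively less often as a block keeps
# failing to turn up.
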
- for k, v in pool.missing.pairs(): - if countOnes(v.tries.uint64) == 1: - result.add(FetchRecord(root: k, historySlots: v.slots)) - -proc skipAndUpdateState( - pool: BlockPool, - state: var HashedBeaconState, blck: BlockRef, slot: Slot, save: bool) = - while state.data.slot < slot: - # Process slots one at a time in case afterUpdate needs to see empty states - # TODO when replaying, we already do this query when loading the ancestors - - # save and reuse - # TODO possibly we should keep this in memory for the hot blocks - let nextStateRoot = pool.db.getStateRoot(blck.root, state.data.slot + 1) - advance_slot(state, nextStateRoot, pool.updateFlags) - - if save: - pool.putState(state, blck) - -proc skipAndUpdateState( - pool: BlockPool, - state: var StateData, blck: BlockData, flags: UpdateFlags, save: bool): bool = - - pool.skipAndUpdateState( - state.data, blck.refs, blck.data.message.slot - 1, save) - - var statePtr = unsafeAddr state # safe because `restore` is locally scoped - func restore(v: var HashedBeaconState) = - doAssert (addr(statePtr.data) == addr v) - statePtr[] = pool.headState - - let ok = state_transition( - state.data, blck.data, flags + pool.updateFlags, restore) - if ok and save: - pool.putState(state.data, blck.refs) - - ok - -proc rewindState(pool: BlockPool, state: var StateData, bs: BlockSlot): - seq[BlockData] = - logScope: pcs = "replay_state" - - var ancestors = @[pool.get(bs.blck)] - # Common case: the last block applied is the parent of the block to apply: - if not bs.blck.parent.isNil and state.blck.root == bs.blck.parent.root and - state.data.data.slot < bs.blck.slot: - return ancestors - - # It appears that the parent root of the proposed new block is different from - # what we expected. We will have to rewind the state to a point along the - # chain of ancestors of the new block. We will do this by loading each - # successive parent block and checking if we can find the corresponding state - # in the database. - var - stateRoot = block: - let tmp = pool.db.getStateRoot(bs.blck.root, bs.slot) - if tmp.isSome() and pool.db.containsState(tmp.get()): - tmp - else: - # State roots are sometimes kept in database even though state is not - err(Opt[Eth2Digest]) - curBs = bs - - while stateRoot.isNone(): - let parBs = curBs.parent() - if parBs.blck.isNil: - break # Bug probably! - - if parBs.blck != curBs.blck: - ancestors.add(pool.get(parBs.blck)) - - # TODO investigate replacing with getStateCached, by refactoring whole - # function. Empirically, this becomes pretty rare once good caches are - # used in the front-end. 
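
# Note (not part of this patch): each iteration of this loop first tries the
# in-memory state cache for the ancestor BlockSlot and, only on a miss, asks
# the database whether a state root (and the state itself) was stored for it.
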
- let idx = pool.getStateCacheIndex(parBs.blck.root, parBs.slot) - if idx >= 0: - state.data = pool.cachedStates[idx].state[] - let ancestor = ancestors.pop() - state.blck = ancestor.refs - - beacon_state_data_cache_hits.inc() - trace "Replaying state transitions via in-memory cache", - stateSlot = shortLog(state.data.data.slot), - ancestorStateRoot = shortLog(ancestor.data.message.state_root), - ancestorStateSlot = shortLog(state.data.data.slot), - slot = shortLog(bs.slot), - blockRoot = shortLog(bs.blck.root), - ancestors = ancestors.len, - cat = "replay_state" - - return ancestors - - beacon_state_data_cache_misses.inc() - if (let tmp = pool.db.getStateRoot(parBs.blck.root, parBs.slot); tmp.isSome()): - if pool.db.containsState(tmp.get): - stateRoot = tmp - break - - curBs = parBs - - if stateRoot.isNone(): - # TODO this should only happen if the database is corrupt - we walked the - # list of parent blocks and couldn't find a corresponding state in the - # database, which should never happen (at least we should have the - # tail state in there!) - error "Couldn't find ancestor state root!", - blockRoot = shortLog(bs.blck.root), - blockSlot = shortLog(bs.blck.slot), - slot = shortLog(bs.slot), - cat = "crash" - doAssert false, "Oh noes, we passed big bang!" - - let - ancestor = ancestors.pop() - root = stateRoot.get() - found = pool.getState(pool.db, root, ancestor.refs, state) - - if not found: - # TODO this should only happen if the database is corrupt - we walked the - # list of parent blocks and couldn't find a corresponding state in the - # database, which should never happen (at least we should have the - # tail state in there!) - error "Couldn't find ancestor state or block parent missing!", - blockRoot = shortLog(bs.blck.root), - blockSlot = shortLog(bs.blck.slot), - slot = shortLog(bs.slot), - cat = "crash" - doAssert false, "Oh noes, we passed big bang!" - - trace "Replaying state transitions", - stateSlot = shortLog(state.data.data.slot), - ancestorStateRoot = shortLog(ancestor.data.message.state_root), - ancestorStateSlot = shortLog(state.data.data.slot), - slot = shortLog(bs.slot), - blockRoot = shortLog(bs.blck.root), - ancestors = ancestors.len, - cat = "replay_state" - - ancestors - -proc getStateDataCached(pool: BlockPool, state: var StateData, bs: BlockSlot): bool = - # This pointedly does not run rewindState or state_transition, but otherwise - # mostly matches updateStateData(...), because it's too expensive to run the - # rewindState(...)/skipAndUpdateState(...)/state_transition(...) procs, when - # each hash_tree_root(...) consumes a nontrivial fraction of a second. - when false: - # For debugging/development purposes to assess required lookback window for - # any given use case. - doAssert state.data.data.slot <= bs.slot + 4 - - let idx = pool.getStateCacheIndex(bs.blck.root, bs.slot) - if idx >= 0: - state.data = pool.cachedStates[idx].state[] - state.blck = bs.blck - beacon_state_data_cache_hits.inc() - return true - - # In-memory caches didn't hit. Try main blockpool database. This is slower - # than the caches due to SSZ (de)serializing and disk I/O, so prefer them. 
- beacon_state_data_cache_misses.inc() - if (let tmp = pool.db.getStateRoot(bs.blck.root, bs.slot); tmp.isSome()): - return pool.getState(pool.db, tmp.get(), bs.blck, state) - - false - -proc updateStateData*(pool: BlockPool, state: var StateData, bs: BlockSlot) = - ## Rewind or advance state such that it matches the given block and slot - - ## this may include replaying from an earlier snapshot if blck is on a - ## different branch or has advanced to a higher slot number than slot - ## If slot is higher than blck.slot, replay will fill in with empty/non-block - ## slots, else it is ignored - - # We need to check the slot because the state might have moved forwards - # without blocks - if state.blck.root == bs.blck.root and state.data.data.slot <= bs.slot: - if state.data.data.slot != bs.slot: - # Might be that we're moving to the same block but later slot - pool.skipAndUpdateState(state.data, bs.blck, bs.slot, true) - - return # State already at the right spot - - if pool.getStateDataCached(state, bs): - return - - let ancestors = rewindState(pool, state, bs) - - # If we come this far, we found the state root. The last block on the stack - # is the one that produced this particular state, so we can pop it - # TODO it might be possible to use the latest block hashes from the state to - # do this more efficiently.. whatever! - - # Time to replay all the blocks between then and now. We skip one because - # it's the one that we found the state with, and it has already been - # applied. Pathologically quadratic in slot number, naïvely. - for i in countdown(ancestors.len - 1, 0): - # Because the ancestors are in the database, there's no need to persist them - # again. Also, because we're applying blocks that were loaded from the - # database, we can skip certain checks that have already been performed - # before adding the block to the database. In particular, this means that - # no state root calculation will take place here, because we can load - # the final state root from the block itself. - let ok = - pool.skipAndUpdateState( - state, ancestors[i], - {skipBlsValidation, skipMerkleValidation, skipStateRootValidation}, - false) - doAssert ok, "Blocks in database should never fail to apply.." - - # We save states here - blocks were guaranteed to have passed through the save - # function once at least, but not so for empty slots! - pool.skipAndUpdateState(state.data, bs.blck, bs.slot, true) - - state.blck = bs.blck - -proc loadTailState*(pool: BlockPool): StateData = - ## Load the state associated with the current tail in the pool - let stateRoot = pool.db.getBlock(pool.tail.root).get().message.state_root - let found = pool.getState(pool.db, stateRoot, pool.tail, result) - # TODO turn into regular error, this can happen - doAssert found, "Failed to load tail state, database corrupt?" - -proc delState(pool: BlockPool, bs: BlockSlot) = - # Delete state state and mapping for a particular block+slot - if (let root = pool.db.getStateRoot(bs.blck.root, bs.slot); root.isSome()): - pool.db.delState(root.get()) + getOrResolve(pool.dag, pool.quarantine, root) proc updateHead*(pool: BlockPool, newHead: BlockRef) = ## Update what we consider to be the current head, as given by the fork @@ -873,309 +117,57 @@ proc updateHead*(pool: BlockPool, newHead: BlockRef) = ## of operations naturally becomes important here - after updating the head, ## blocks that were once considered potential candidates for a tree will ## now fall from grace, or no longer be considered resolved. 
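
# Illustrative sketch - the intended calling sequence around updateHead;
# `forkChoiceFrom` is a hypothetical stand-in for the fork-choice rule and is
# not part of this patch.
proc advanceHead(pool: BlockPool,
                 forkChoiceFrom: proc(justified: BlockSlot): BlockRef) =
  # Fork choice starts from the latest justified block among the known heads
  let newHead = forkChoiceFrom(pool.latestJustifiedBlock())
  # Order matters: anything still needed from the previous head state should
  # be read before this call, since pruning follows head/finalization updates
  pool.updateHead(newHead)
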
- doAssert newHead.parent != nil or newHead.slot == 0 - logScope: pcs = "fork_choice" + updateHead(pool.dag, newHead) - if pool.head.blck == newHead: - info "No head block update", - head = shortLog(newHead), - cat = "fork_choice" - - return - - let - lastHead = pool.head - pool.db.putHeadBlock(newHead.root) - - # Start off by making sure we have the right state - updateStateData( - pool, pool.headState, BlockSlot(blck: newHead, slot: newHead.slot)) - - let - justifiedSlot = pool.headState.data.data - .current_justified_checkpoint - .epoch - .compute_start_slot_at_epoch() - justifiedBS = newHead.atSlot(justifiedSlot) - - pool.head = Head(blck: newHead, justified: justifiedBS) - updateStateData(pool, pool.justifiedState, justifiedBS) - - # TODO isAncestorOf may be expensive - too expensive? - if not lastHead.blck.isAncestorOf(newHead): - info "Updated head block (new parent)", - lastHead = shortLog(lastHead.blck), - headParent = shortLog(newHead.parent), - stateRoot = shortLog(pool.headState.data.root), - headBlock = shortLog(pool.headState.blck), - stateSlot = shortLog(pool.headState.data.data.slot), - justifiedEpoch = shortLog(pool.headState.data.data.current_justified_checkpoint.epoch), - finalizedEpoch = shortLog(pool.headState.data.data.finalized_checkpoint.epoch), - cat = "fork_choice" - - # A reasonable criterion for "reorganizations of the chain" - beacon_reorgs_total.inc() - else: - info "Updated head block", - stateRoot = shortLog(pool.headState.data.root), - headBlock = shortLog(pool.headState.blck), - stateSlot = shortLog(pool.headState.data.data.slot), - justifiedEpoch = shortLog(pool.headState.data.data.current_justified_checkpoint.epoch), - finalizedEpoch = shortLog(pool.headState.data.data.finalized_checkpoint.epoch), - cat = "fork_choice" - - let - finalizedEpochStartSlot = - pool.headState.data.data.finalized_checkpoint.epoch. - compute_start_slot_at_epoch() - # TODO there might not be a block at the epoch boundary - what then? - finalizedHead = newHead.atSlot(finalizedEpochStartSlot) - - doAssert (not finalizedHead.blck.isNil), - "Block graph should always lead to a finalized block" - - if finalizedHead != pool.finalizedHead: - block: # Remove states, walking slot by slot - discard - # TODO this is very aggressive - in theory all our operations start at - # the finalized block so all states before that can be wiped.. - # TODO this is disabled for now because the logic for initializing the - # block pool and potentially a few other places depend on certain - # states (like the tail state) being present. It's also problematic - # because it is not clear what happens when tail and finalized states - # happen on an empty slot.. - # var cur = finalizedHead - # while cur != pool.finalizedHead: - # cur = cur.parent - # pool.delState(cur) - - block: # Clean up block refs, walking block by block - var cur = finalizedHead.blck - while cur != pool.finalizedHead.blck: - # Finalization means that we choose a single chain as the canonical one - - # it also means we're no longer interested in any branches from that chain - # up to the finalization point. - # The new finalized head should not be cleaned! We start at its parent and - # clean everything including the old finalized head. - cur = cur.parent - - # TODO what about attestations? 
we need to drop those too, though they - # *should* be pretty harmless - if cur.parent != nil: # This happens for the genesis / tail block - for child in cur.parent.children: - if child != cur: - # TODO also remove states associated with the unviable forks! - # TODO the easiest thing to do here would probably be to use - # pool.heads to find unviable heads, then walk those chains - # and remove everything.. currently, if there's a child with - # children of its own, those children will not be pruned - # correctly from the database - pool.blocks.del(child.root) - pool.db.delBlock(child.root) - cur.parent.children = @[cur] - - pool.finalizedHead = finalizedHead - - let hlen = pool.heads.len - for i in 0.. 0, - "We should have at least the genesis block in heaads" - doAssert (not pool.head.blck.isNil()), - "Genesis block will be head, if nothing else" - - # Prefer stability: use justified block from current head to break ties! - result = pool.head.justified - for head in pool.heads[1 ..< ^0]: - if head.justified.slot > result.slot: - result = head.justified + latestJustifiedBlock(pool.dag) proc isInitialized*(T: type BlockPool, db: BeaconChainDB): bool = - let - headBlockRoot = db.getHeadBlock() - tailBlockRoot = db.getTailBlock() - - if not (headBlockRoot.isSome() and tailBlockRoot.isSome()): - return false - - let - headBlock = db.getBlock(headBlockRoot.get()) - tailBlock = db.getBlock(tailBlockRoot.get()) - - if not (headBlock.isSome() and tailBlock.isSome()): - return false - - if not db.containsState(tailBlock.get().message.state_root): - return false - - return true + isInitialized(CandidateChains, db) proc preInit*( T: type BlockPool, db: BeaconChainDB, state: BeaconState, signedBlock: SignedBeaconBlock) = - # write a genesis state, the way the BlockPool expects it to be stored in - # database - # TODO probably should just init a blockpool with the freshly written - # state - but there's more refactoring needed to make it nice - doing - # a minimal patch for now.. 
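
# Illustrative sketch - the intended bootstrap sequence around preInit;
# `genesisState`/`genesisBlock` are hypothetical values taken from a snapshot
# and the proc itself is not part of this patch.
proc bootstrapPool(db: BeaconChainDB, genesisState: BeaconState,
                   genesisBlock: SignedBeaconBlock): BlockPool =
  if not BlockPool.isInitialized(db):
    # Writes the state, block and head/tail roots so that init() can find them
    BlockPool.preInit(db, genesisState, genesisBlock)
  result = BlockPool.init(db)
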
- let - blockRoot = hash_tree_root(signedBlock.message) - - doAssert signedBlock.message.state_root == hash_tree_root(state) - notice "New database from snapshot", - blockRoot = shortLog(blockRoot), - stateRoot = shortLog(signedBlock.message.state_root), - fork = state.fork, - validators = state.validators.len(), - cat = "initialization" - - db.putState(state) - db.putBlock(signedBlock) - db.putTailBlock(blockRoot) - db.putHeadBlock(blockRoot) - db.putStateRoot(blockRoot, state.slot, signedBlock.message.state_root) + preInit(CandidateChains, db, state, signedBlock) proc getProposer*(pool: BlockPool, head: BlockRef, slot: Slot): Option[ValidatorPubKey] = - pool.withState(pool.tmpState, head.atSlot(slot)): - var cache = get_empty_per_epoch_cache() + getProposer(pool.dag, head, slot) - # https://github.com/ethereum/eth2.0-specs/blob/v0.11.1/specs/phase0/validator.md#validator-assignments - let proposerIdx = get_beacon_proposer_index(state, cache) - if proposerIdx.isNone: - warn "Missing proposer index", - slot=slot, - epoch=slot.compute_epoch_at_slot, - num_validators=state.validators.len, - active_validators= - get_active_validator_indices(state, slot.compute_epoch_at_slot), - balances=state.balances - return +# Rewinder / State transitions +# -------------------------------------------- - return some(state.validators[proposerIdx.get()].pubkey) +template headState*(pool: BlockPool): StateData = + pool.dag.headState + +template tmpState*(pool: BlockPool): StateData = + pool.dag.tmpState + +template justifiedState*(pool: BlockPool): StateData = + pool.dag.justifiedState + +template withState*( + pool: BlockPool, cache: var StateData, blockSlot: BlockSlot, body: untyped): untyped = + ## Helper template that updates state to a particular BlockSlot - usage of + ## cache is unsafe outside of block. + ## TODO async transformations will lead to a race where cache gets updated + ## while waiting for future to complete - catch this here somehow? + + withState(pool.dag, cache, blockSlot, body) + +proc updateStateData*(pool: BlockPool, state: var StateData, bs: BlockSlot) = + ## Rewind or advance state such that it matches the given block and slot - + ## this may include replaying from an earlier snapshot if blck is on a + ## different branch or has advanced to a higher slot number than slot + ## If slot is higher than blck.slot, replay will fill in with empty/non-block + ## slots, else it is ignored + updateStateData(pool.dag, state, bs) + +proc loadTailState*(pool: BlockPool): StateData = + loadTailState(pool.dag) -# https://github.com/ethereum/eth2.0-specs/blob/v0.11.1/specs/phase0/p2p-interface.md#global-topics proc isValidBeaconBlock*(pool: var BlockPool, - signed_beacon_block: SignedBeaconBlock, current_slot: Slot, - flags: UpdateFlags): bool = - # In general, checks are ordered from cheap to expensive. Especially, crypto - # verification could be quite a bit more expensive than the rest. This is an - # externally easy-to-invoke function by tossing network packets at the node. - - # The block is not from a future slot - # TODO allow `MAXIMUM_GOSSIP_CLOCK_DISPARITY` leniency, especially towards - # seemingly future slots. - if not (signed_beacon_block.message.slot <= current_slot): - debug "isValidBeaconBlock: block is from a future slot", - signed_beacon_block_message_slot = signed_beacon_block.message.slot, - current_slot = current_slot - return false - - # The block is from a slot greater than the latest finalized slot (with a - # MAXIMUM_GOSSIP_CLOCK_DISPARITY allowance) -- i.e. 
validate that - # signed_beacon_block.message.slot > - # compute_start_slot_at_epoch(state.finalized_checkpoint.epoch) - if not (signed_beacon_block.message.slot > pool.finalizedHead.slot): - debug "isValidBeaconBlock: block is not from a slot greater than the latest finalized slot" - return false - - # The block is the first block with valid signature received for the proposer - # for the slot, signed_beacon_block.message.slot. - # - # While this condition is similar to the proposer slashing condition at - # https://github.com/ethereum/eth2.0-specs/blob/v0.11.1/specs/phase0/validator.md#proposer-slashing - # it's not identical, and this check does not address slashing: - # - # (1) The beacon blocks must be conflicting, i.e. different, for the same - # slot and proposer. This check also catches identical blocks. - # - # (2) By this point in the function, it's not been checked whether they're - # signed yet. As in general, expensive checks should be deferred, this - # would add complexity not directly relevant this function. - # - # (3) As evidenced by point (1), the similarity in the validation condition - # and slashing condition, while not coincidental, aren't similar enough - # to combine, as one or the other might drift. - # - # (4) Furthermore, this function, as much as possible, simply returns a yes - # or no answer, without modifying other state for p2p network interface - # validation. Complicating this interface, for the sake of sharing only - # couple lines of code, wouldn't be worthwhile. - # - # TODO might check unresolved/orphaned blocks too, and this might not see all - # blocks at a given slot (though, in theory, those get checked elsewhere), or - # adding metrics that count how often these conditions occur. - let - slotBlockRef = getBlockBySlot(pool, signed_beacon_block.message.slot) - - if not slotBlockRef.isNil: - let blck = pool.get(slotBlockRef).data - if blck.message.proposer_index == - signed_beacon_block.message.proposer_index and - blck.message.slot == signed_beacon_block.message.slot and - blck.signature.toRaw() != signed_beacon_block.signature.toRaw(): - debug "isValidBeaconBlock: block isn't first block with valid signature received for the proposer", - signed_beacon_block_message_slot = signed_beacon_block.message.slot, - blckRef = slotBlockRef, - received_block = shortLog(signed_beacon_block.message), - existing_block = shortLog(pool.get(slotBlockRef).data.message) - return false - - # If this block doesn't have a parent we know about, we can't/don't really - # trace it back to a known-good state/checkpoint to verify its prevenance; - # while one could getOrResolve to queue up searching for missing parent it - # might not be the best place. As much as feasible, this function aims for - # answering yes/no, not queuing other action or otherwise altering state. - let parent_ref = pool.getRef(signed_beacon_block.message.parent_root) - if parent_ref.isNil: - # This doesn't mean a block is forever invalid, only that we haven't seen - # its ancestor blocks yet. While that means for now it should be blocked, - # at least, from libp2p propagation, it shouldn't be ignored. TODO, if in - # the future this block moves from pending to being resolved, consider if - # it's worth broadcasting it then. - - # Pending pool gets checked via `BlockPool.add(...)` later, and relevant - # checks are performed there. In usual paths beacon_node adds blocks via - # BlockPool.add(...) directly, with no additional validity checks. 
TODO, - # not specific to this, but by the pending pool keying on the htr of the - # BeaconBlock, not SignedBeaconBlock, opens up certain spoofing attacks. - pool.pending[hash_tree_root(signed_beacon_block.message)] = - signed_beacon_block - return false - - # The proposer signature, signed_beacon_block.signature, is valid with - # respect to the proposer_index pubkey. - let bs = - BlockSlot(blck: parent_ref, slot: pool.get(parent_ref).data.message.slot) - pool.withState(pool.tmpState, bs): - let - blockRoot = hash_tree_root(signed_beacon_block.message) - domain = get_domain(pool.headState.data.data, DOMAIN_BEACON_PROPOSER, - compute_epoch_at_slot(signed_beacon_block.message.slot)) - signing_root = compute_signing_root(blockRoot, domain) - proposer_index = signed_beacon_block.message.proposer_index - - if proposer_index >= pool.headState.data.data.validators.len.uint64: - return false - if not blsVerify(pool.headState.data.data.validators[proposer_index].pubkey, - signing_root.data, signed_beacon_block.signature): - debug "isValidBeaconBlock: block failed signature verification" - return false - - true + signed_beacon_block: SignedBeaconBlock, + current_slot: Slot, flags: UpdateFlags): bool = + isValidBeaconBlock(pool.dag, pool.quarantine, signed_beacon_block, current_slot, flags) diff --git a/beacon_chain/block_pools/block_pools_types.nim b/beacon_chain/block_pools/block_pools_types.nim new file mode 100644 index 000000000..c2f5eee3e --- /dev/null +++ b/beacon_chain/block_pools/block_pools_types.nim @@ -0,0 +1,168 @@ +# beacon_chain +# Copyright (c) 2018-2020 Status Research & Development GmbH +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at https://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at https://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +import + deques, tables, + stew/[endians2, byteutils], chronicles, + ../spec/[datatypes, crypto, digest], + ../beacon_chain_db, ../extras + +# ############################################# +# +# Quarantine & DAG +# +# ############################################# +# +# The Quarantine and DagChain data structures +# keeps track respectively of unsafe blocks coming from the network +# and blocks that underwent verification and have a resolved path to +# the last finalized block known. + +type + Quarantine* = object + ## Keeps track of unsafe blocks coming from the network + ## and that cannot be added to the chain + ## + ## This only stores valid blocks that cannot be linked to the BlockPool DAG + ## due to missing ancestor(s). + ## + ## Invalid blocks are dropped immediately. + + pending*: Table[Eth2Digest, SignedBeaconBlock] ##\ + ## Blocks that have passed validation but that we lack a link back to tail + ## for - when we receive a "missing link", we can use this data to build + ## an entire branch + + missing*: Table[Eth2Digest, MissingBlock] ##\ + ## Roots of blocks that we would like to have (either parent_root of + ## unresolved blocks or block roots of attestations) + + inAdd*: bool + + MissingBlock* = object + slots*: uint64 # number of slots that are suspected missing + tries*: int + + FetchRecord* = object + root*: Eth2Digest + historySlots*: uint64 + + CandidateChains* = ref object + ## Pool of blocks responsible for keeping a DAG of resolved blocks. 
+ ## + ## The BlockPool is responsible for the following + ## + ## - Handle requests and updates to the "ColdDB" which + ## holds the canonical chain. + ## - Maintain a direct acyclic graph (DAG) of + ## candidate chains from the last + ## finalized block. + ## + ## When a chain becomes finalized, it is saved in the ColdDB, + ## the rejected candidates are discard and the BlockPool + ## is CandidateChains is pruned, only keeping the last finalized block. + ## + ## The last finalized block is called the tail block. + + # ----------------------------------- + # ColdDB - Canonical chain + + db*: BeaconChainDB ##\ + ## ColdDB - Stores the canonical chain + + # ----------------------------------- + # CandidateChains - DAG of candidate chains + + blocks*: Table[Eth2Digest, BlockRef] ##\ + ## Directed acyclic graph of blocks pointing back to a finalized block on the chain we're + ## interested in - we call that block the tail + + tail*: BlockRef ##\ + ## The earliest finalized block we know about + + heads*: seq[Head] ##\ + ## Candidate heads of candidate chains + + head*: Head ##\ + ## The latest block we know about, that's been chosen as a head by the fork + ## choice rule + + finalizedHead*: BlockSlot ##\ + ## The latest block that was finalized according to the block in head + ## Ancestors of this block are guaranteed to have 1 child only. + + # ----------------------------------- + # Rewinder - Mutable state processing + + cachedStates*: seq[tuple[blockRoot: Eth2Digest, slot: Slot, + state: ref HashedBeaconState]] + + headState*: StateData ##\ + ## State given by the head block; only update in `updateHead`, not anywhere + ## else via `withState` + + justifiedState*: StateData ## Latest justified state, as seen from the head + + tmpState*: StateData ## Scratchpad - may be any state + + updateFlags*: UpdateFlags + + BlockRef* = ref object + ## Node in object graph guaranteed to lead back to tail block, and to have + ## a corresponding entry in database. + ## Block graph should form a tree - in particular, there are no cycles. + + root*: Eth2Digest ##\ + ## Root that can be used to retrieve block data from database + + parent*: BlockRef ##\ + ## Not nil, except for the tail + + children*: seq[BlockRef] + # TODO do we strictly need this? + + slot*: Slot # TODO could calculate this by walking to root, but.. + + BlockData* = object + ## Body and graph in one + + data*: SignedBeaconBlock + refs*: BlockRef + + StateData* = object + data*: HashedBeaconState + + blck*: BlockRef ##\ + ## The block associated with the state found in data - normally + ## `blck.state_root == data.root` but the state might have been advanced + ## further with empty slots invalidating this condition. + + BlockSlot* = object + ## Unique identifier for a particular fork and time in the block chain - + ## normally, there's a block for every slot, but in the case a block is not + ## produced, the chain progresses anyway, producing a new state for every + ## slot. 
+ blck*: BlockRef + slot*: Slot ##\ + ## Slot time for this BlockSlot which may differ from blck.slot when time + ## has advanced without blocks + + Head* = object + blck*: BlockRef + justified*: BlockSlot + +proc shortLog*(v: BlockSlot): string = + if v.blck.slot == v.slot: + v.blck.root.data[0..3].toHex() & ":" & $v.blck.slot + else: # There was a gap - log it + v.blck.root.data[0..3].toHex() & ":" & $v.blck.slot & "@" & $v.slot + +proc shortLog*(v: BlockRef): string = + v.root.data[0..3].toHex() & ":" & $v.slot + +chronicles.formatIt BlockSlot: shortLog(it) +chronicles.formatIt BlockRef: shortLog(it) diff --git a/beacon_chain/block_pools/candidate_chains.nim b/beacon_chain/block_pools/candidate_chains.nim new file mode 100644 index 000000000..eece3ef7c --- /dev/null +++ b/beacon_chain/block_pools/candidate_chains.nim @@ -0,0 +1,841 @@ +# beacon_chain +# Copyright (c) 2018-2020 Status Research & Development GmbH +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at https://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at https://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +{.push raises: [Defect].} + +import + chronicles, options, tables, + metrics, + ../ssz, ../beacon_chain_db, ../state_transition, ../extras, + ../spec/[crypto, datatypes, digest, helpers, validator], + block_pools_types + +declareCounter beacon_reorgs_total, "Total occurrences of reorganizations of the chain" # On fork choice +declareCounter beacon_state_data_cache_hits, "dag.cachedStates hits" +declareCounter beacon_state_data_cache_misses, "dag.cachedStates misses" + +logScope: topics = "hotdb" + +proc putBlock*(dag: var CandidateChains, blockRoot: Eth2Digest, signedBlock: SignedBeaconBlock) {.inline.} = + dag.db.putBlock(blockRoot, signedBlock) + +proc updateStateData*( + dag: CandidateChains, state: var StateData, bs: BlockSlot) {.gcsafe.} + +template withState*( + dag: CandidateChains, cache: var StateData, blockSlot: BlockSlot, body: untyped): untyped = + ## Helper template that updates state to a particular BlockSlot - usage of + ## cache is unsafe outside of block. + ## TODO async transformations will lead to a race where cache gets updated + ## while waiting for future to complete - catch this here somehow? + + updateStateData(dag, cache, blockSlot) + + template hashedState(): HashedBeaconState {.inject, used.} = cache.data + template state(): BeaconState {.inject, used.} = cache.data.data + template blck(): BlockRef {.inject, used.} = cache.blck + template root(): Eth2Digest {.inject, used.} = cache.data.root + + body + +func parent*(bs: BlockSlot): BlockSlot = + ## Return a blockslot representing the previous slot, using the parent block + ## if the current slot had a block + if bs.slot == Slot(0): + BlockSlot(blck: nil, slot: Slot(0)) + else: + BlockSlot( + blck: if bs.slot > bs.blck.slot: bs.blck else: bs.blck.parent, + slot: bs.slot - 1 + ) + +func link*(parent, child: BlockRef) = + doAssert (not (parent.root == Eth2Digest() or child.root == Eth2Digest())), + "blocks missing root!" 
+ doAssert parent.root != child.root, "self-references not allowed" + + child.parent = parent + parent.children.add(child) + +func isAncestorOf*(a, b: BlockRef): bool = + var b = b + var depth = 0 + const maxDepth = (100'i64 * 365 * 24 * 60 * 60 div SECONDS_PER_SLOT.int) + while true: + if a == b: return true + + # for now, use an assert for block chain length since a chain this long + # indicates a circular reference here.. + doAssert depth < maxDepth + depth += 1 + + if a.slot >= b.slot or b.parent.isNil: + return false + + doAssert b.slot > b.parent.slot + b = b.parent + +func getAncestorAt*(blck: BlockRef, slot: Slot): BlockRef = + ## Return the most recent block as of the time at `slot` that not more recent + ## than `blck` itself + + var blck = blck + + var depth = 0 + const maxDepth = (100'i64 * 365 * 24 * 60 * 60 div SECONDS_PER_SLOT.int) + + while true: + if blck.slot <= slot: + return blck + + if blck.parent.isNil: + return nil + + doAssert depth < maxDepth + depth += 1 + + blck = blck.parent + +func get_ancestor*(blck: BlockRef, slot: Slot): BlockRef = + ## https://github.com/ethereum/eth2.0-specs/blob/v0.11.1/specs/phase0/fork-choice.md#get_ancestor + ## Return ancestor at slot, or nil if queried block is older + var blck = blck + + var depth = 0 + const maxDepth = (100'i64 * 365 * 24 * 60 * 60 div SECONDS_PER_SLOT.int) + + while true: + if blck.slot == slot: + return blck + + if blck.slot < slot: + return nil + + if blck.parent.isNil: + return nil + + doAssert depth < maxDepth + depth += 1 + + blck = blck.parent + +func atSlot*(blck: BlockRef, slot: Slot): BlockSlot = + ## Return a BlockSlot at a given slot, with the block set to the closest block + ## available. If slot comes from before the block, a suitable block ancestor + ## will be used, else blck is returned as if all slots after it were empty. + ## This helper is useful when imagining what the chain looked like at a + ## particular moment in time, or when imagining what it will look like in the + ## near future if nothing happens (such as when looking ahead for the next + ## block proposal) + BlockSlot(blck: blck.getAncestorAt(slot), slot: slot) + +func init*(T: type BlockRef, root: Eth2Digest, slot: Slot): BlockRef = + BlockRef( + root: root, + slot: slot + ) + +func init*(T: type BlockRef, root: Eth2Digest, blck: BeaconBlock): BlockRef = + BlockRef.init(root, blck.slot) + +proc init*(T: type CandidateChains, db: BeaconChainDB, + updateFlags: UpdateFlags = {}): CandidateChains = + # TODO we require that the db contains both a head and a tail block - + # asserting here doesn't seem like the right way to go about it however.. + + let + tailBlockRoot = db.getTailBlock() + headBlockRoot = db.getHeadBlock() + + doAssert tailBlockRoot.isSome(), "Missing tail block, database corrupt?" + doAssert headBlockRoot.isSome(), "Missing head block, database corrupt?" 
+ + let + tailRoot = tailBlockRoot.get() + tailBlock = db.getBlock(tailRoot).get() + tailRef = BlockRef.init(tailRoot, tailBlock.message) + headRoot = headBlockRoot.get() + + var + blocks = {tailRef.root: tailRef}.toTable() + headRef: BlockRef + + if headRoot != tailRoot: + var curRef: BlockRef + + for root, blck in db.getAncestors(headRoot): + if root == tailRef.root: + doAssert(not curRef.isNil) + link(tailRef, curRef) + curRef = curRef.parent + break + + let newRef = BlockRef.init(root, blck.message) + if curRef == nil: + curRef = newRef + headRef = newRef + else: + link(newRef, curRef) + curRef = curRef.parent + blocks[curRef.root] = curRef + trace "Populating block dag", key = curRef.root, val = curRef + + doAssert curRef == tailRef, + "head block does not lead to tail, database corrupt?" + else: + headRef = tailRef + + var + bs = headRef.atSlot(headRef.slot) + tmpState = (ref StateData)() + + # Now that we have a head block, we need to find the most recent state that + # we have saved in the database + while bs.blck != nil: + let root = db.getStateRoot(bs.blck.root, bs.slot) + if root.isSome(): + # TODO load StateData from BeaconChainDB + # We save state root separately for empty slots which means we might + # sometimes not find a state even though we saved its state root + if db.getState(root.get(), tmpState.data.data, noRollback): + tmpState.data.root = root.get() + tmpState.blck = bs.blck + + break + + bs = bs.parent() # Iterate slot by slot in case there's a gap! + + if tmpState.blck == nil: + warn "No state found in head history, database corrupt?" + # TODO Potentially we could recover from here instead of crashing - what + # would be a good recovery model? + raiseAssert "No state found in head history, database corrupt?" + + # We presently save states on the epoch boundary - it means that the latest + # state we loaded might be older than head block - nonetheless, it will be + # from the same epoch as the head, thus the finalized and justified slots are + # the same - these only change on epoch boundaries. + let + finalizedSlot = + tmpState.data.data.finalized_checkpoint.epoch.compute_start_slot_at_epoch() + finalizedHead = headRef.atSlot(finalizedSlot) + justifiedSlot = + tmpState.data.data.current_justified_checkpoint.epoch.compute_start_slot_at_epoch() + justifiedHead = headRef.atSlot(justifiedSlot) + head = Head(blck: headRef, justified: justifiedHead) + + doAssert justifiedHead.slot >= finalizedHead.slot, + "justified head comes before finalized head - database corrupt?" + + let res = CandidateChains( + blocks: blocks, + tail: tailRef, + head: head, + finalizedHead: finalizedHead, + db: db, + heads: @[head], + headState: tmpState[], + justifiedState: tmpState[], # This is wrong but we'll update it below + tmpState: tmpState[], + + # The only allowed flag right now is verifyFinalization, as the others all + # allow skipping some validation. 
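      # (note: `*` below is Nim set intersection, so any caller-supplied flags
      # other than verifyFinalization are dropped here)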
+ updateFlags: {verifyFinalization} * updateFlags + ) + + doAssert res.updateFlags in [{}, {verifyFinalization}] + + res.updateStateData(res.justifiedState, justifiedHead) + res.updateStateData(res.headState, headRef.atSlot(headRef.slot)) + + info "Block dag initialized", + head = head.blck, justifiedHead, finalizedHead, tail = tailRef, + totalBlocks = blocks.len + + res + +proc getState( + dag: CandidateChains, db: BeaconChainDB, stateRoot: Eth2Digest, blck: BlockRef, + output: var StateData): bool = + let outputAddr = unsafeAddr output # local scope + func restore(v: var BeaconState) = + if outputAddr == (unsafeAddr dag.headState): + # TODO seeing the headState in the restore shouldn't happen - we load + # head states only when updating the head position, and by that time + # the database will have gone through enough sanity checks that + # SSZ exceptions shouldn't happen, which is when restore happens. + # Nonetheless, this is an ugly workaround that needs to go away + doAssert false, "Cannot alias headState" + + outputAddr[] = dag.headState + + if not db.getState(stateRoot, output.data.data, restore): + return false + + output.blck = blck + output.data.root = stateRoot + + true + +func getStateCacheIndex(dag: CandidateChains, blockRoot: Eth2Digest, slot: Slot): int = + for i, cachedState in dag.cachedStates: + let (cacheBlockRoot, cacheSlot, state) = cachedState + if cacheBlockRoot == blockRoot and cacheSlot == slot: + return i + + -1 + +proc putState*(dag: CandidateChains, state: HashedBeaconState, blck: BlockRef) = + # TODO we save state at every epoch start but never remove them - we also + # potentially save multiple states per slot if reorgs happen, meaning + # we could easily see a state explosion + logScope: pcs = "save_state_at_epoch_start" + + var rootWritten = false + if state.data.slot != blck.slot: + # This is a state that was produced by a skip slot for which there is no + # block - we'll save the state root in the database in case we need to + # replay the skip + dag.db.putStateRoot(blck.root, state.data.slot, state.root) + rootWritten = true + + if state.data.slot.isEpoch: + if not dag.db.containsState(state.root): + info "Storing state", + blck = shortLog(blck), + stateSlot = shortLog(state.data.slot), + stateRoot = shortLog(state.root), + cat = "caching" + dag.db.putState(state.root, state.data) + if not rootWritten: + dag.db.putStateRoot(blck.root, state.data.slot, state.root) + + # Need to be able to efficiently access states for both attestation + # aggregation and to process block proposals going back to the last + # finalized slot. Ideally to avoid potential combinatiorial forking + # storage and/or memory constraints could CoW, up to and including, + # in particular, hash_tree_root() which is expensive to do 30 times + # since the previous epoch, to efficiently state_transition back to + # desired slot. However, none of that's in place, so there are both + # expensive, repeated BeaconState copies as well as computationally + # time-consuming-near-end-of-epoch hash tree roots. The latter are, + # effectively, naïvely O(n^2) in slot number otherwise, so when the + # slots become in the mid-to-high-20s it's spending all its time in + # pointlessly repeated calculations of prefix-state-transitions. An + # intermediate time/memory workaround involves storing only mapping + # between BlockRefs, or BlockSlots, and the BeaconState tree roots, + # but that still involves tens of megabytes worth of copying, along + # with the concomitant memory allocator and GC load. 
Instead, use a + # more memory-intensive (but more conceptually straightforward, and + # faster) strategy to just store, for the most recent slots. + let stateCacheIndex = dag.getStateCacheIndex(blck.root, state.data.slot) + if stateCacheIndex == -1: + # Could use a deque or similar, but want simpler structure, and the data + # items are small and few. + const MAX_CACHE_SIZE = 32 + insert(dag.cachedStates, (blck.root, state.data.slot, newClone(state))) + while dag.cachedStates.len > MAX_CACHE_SIZE: + discard dag.cachedStates.pop() + let cacheLen = dag.cachedStates.len + trace "CandidateChains.putState(): state cache updated", cacheLen + doAssert cacheLen > 0 and cacheLen <= MAX_CACHE_SIZE + +func getRef*(dag: CandidateChains, root: Eth2Digest): BlockRef = + ## Retrieve a resolved block reference, if available + dag.blocks.getOrDefault(root, nil) + +func getBlockRange*( + dag: CandidateChains, startSlot: Slot, skipStep: Natural, + output: var openArray[BlockRef]): Natural = + ## This function populates an `output` buffer of blocks + ## with a slots ranging from `startSlot` up to, but not including, + ## `startSlot + skipStep * output.len`, skipping any slots that don't have + ## a block. + ## + ## Blocks will be written to `output` from the end without gaps, even if + ## a block is missing in a particular slot. The return value shows how + ## many slots were missing blocks - to iterate over the result, start + ## at this index. + ## + ## If there were no blocks in the range, `output.len` will be returned. + let count = output.len + trace "getBlockRange entered", + head = shortLog(dag.head.blck.root), count, startSlot, skipStep + + let + skipStep = max(1, skipStep) # Treat 0 step as 1 + endSlot = startSlot + uint64(count * skipStep) + + var + b = dag.head.blck.atSlot(endSlot) + o = count + for i in 0..= 0: + state.data = dag.cachedStates[idx].state[] + let ancestor = ancestors.pop() + state.blck = ancestor.refs + + beacon_state_data_cache_hits.inc() + trace "Replaying state transitions via in-memory cache", + stateSlot = shortLog(state.data.data.slot), + ancestorStateRoot = shortLog(ancestor.data.message.state_root), + ancestorStateSlot = shortLog(state.data.data.slot), + slot = shortLog(bs.slot), + blockRoot = shortLog(bs.blck.root), + ancestors = ancestors.len, + cat = "replay_state" + + return ancestors + + beacon_state_data_cache_misses.inc() + if (let tmp = dag.db.getStateRoot(parBs.blck.root, parBs.slot); tmp.isSome()): + if dag.db.containsState(tmp.get): + stateRoot = tmp + break + + curBs = parBs + + if stateRoot.isNone(): + # TODO this should only happen if the database is corrupt - we walked the + # list of parent blocks and couldn't find a corresponding state in the + # database, which should never happen (at least we should have the + # tail state in there!) + error "Couldn't find ancestor state root!", + blockRoot = shortLog(bs.blck.root), + blockSlot = shortLog(bs.blck.slot), + slot = shortLog(bs.slot), + cat = "crash" + doAssert false, "Oh noes, we passed big bang!" + + let + ancestor = ancestors.pop() + root = stateRoot.get() + found = dag.getState(dag.db, root, ancestor.refs, state) + + if not found: + # TODO this should only happen if the database is corrupt - we walked the + # list of parent blocks and couldn't find a corresponding state in the + # database, which should never happen (at least we should have the + # tail state in there!) 
+ error "Couldn't find ancestor state or block parent missing!", + blockRoot = shortLog(bs.blck.root), + blockSlot = shortLog(bs.blck.slot), + slot = shortLog(bs.slot), + cat = "crash" + doAssert false, "Oh noes, we passed big bang!" + + trace "Replaying state transitions", + stateSlot = shortLog(state.data.data.slot), + ancestorStateRoot = shortLog(ancestor.data.message.state_root), + ancestorStateSlot = shortLog(state.data.data.slot), + slot = shortLog(bs.slot), + blockRoot = shortLog(bs.blck.root), + ancestors = ancestors.len, + cat = "replay_state" + + ancestors + +proc getStateDataCached(dag: CandidateChains, state: var StateData, bs: BlockSlot): bool = + # This pointedly does not run rewindState or state_transition, but otherwise + # mostly matches updateStateData(...), because it's too expensive to run the + # rewindState(...)/skipAndUpdateState(...)/state_transition(...) procs, when + # each hash_tree_root(...) consumes a nontrivial fraction of a second. + when false: + # For debugging/development purposes to assess required lookback window for + # any given use case. + doAssert state.data.data.slot <= bs.slot + 4 + + let idx = dag.getStateCacheIndex(bs.blck.root, bs.slot) + if idx >= 0: + state.data = dag.cachedStates[idx].state[] + state.blck = bs.blck + beacon_state_data_cache_hits.inc() + return true + + # In-memory caches didn't hit. Try main blockpool database. This is slower + # than the caches due to SSZ (de)serializing and disk I/O, so prefer them. + beacon_state_data_cache_misses.inc() + if (let tmp = dag.db.getStateRoot(bs.blck.root, bs.slot); tmp.isSome()): + return dag.getState(dag.db, tmp.get(), bs.blck, state) + + false + +proc updateStateData*(dag: CandidateChains, state: var StateData, bs: BlockSlot) = + ## Rewind or advance state such that it matches the given block and slot - + ## this may include replaying from an earlier snapshot if blck is on a + ## different branch or has advanced to a higher slot number than slot + ## If slot is higher than blck.slot, replay will fill in with empty/non-block + ## slots, else it is ignored + + # We need to check the slot because the state might have moved forwards + # without blocks + if state.blck.root == bs.blck.root and state.data.data.slot <= bs.slot: + if state.data.data.slot != bs.slot: + # Might be that we're moving to the same block but later slot + dag.skipAndUpdateState(state.data, bs.blck, bs.slot, true) + + return # State already at the right spot + + if dag.getStateDataCached(state, bs): + return + + let ancestors = rewindState(dag, state, bs) + + # If we come this far, we found the state root. The last block on the stack + # is the one that produced this particular state, so we can pop it + # TODO it might be possible to use the latest block hashes from the state to + # do this more efficiently.. whatever! + + # Time to replay all the blocks between then and now. We skip one because + # it's the one that we found the state with, and it has already been + # applied. Pathologically quadratic in slot number, naïvely. + for i in countdown(ancestors.len - 1, 0): + # Because the ancestors are in the database, there's no need to persist them + # again. Also, because we're applying blocks that were loaded from the + # database, we can skip certain checks that have already been performed + # before adding the block to the database. In particular, this means that + # no state root calculation will take place here, because we can load + # the final state root from the block itself. 
+ let ok = + dag.skipAndUpdateState( + state, ancestors[i], + {skipBlsValidation, skipMerkleValidation, skipStateRootValidation}, + false) + doAssert ok, "Blocks in database should never fail to apply.." + + # We save states here - blocks were guaranteed to have passed through the save + # function once at least, but not so for empty slots! + dag.skipAndUpdateState(state.data, bs.blck, bs.slot, true) + + state.blck = bs.blck + +proc loadTailState*(dag: CandidateChains): StateData = + ## Load the state associated with the current tail in the dag + let stateRoot = dag.db.getBlock(dag.tail.root).get().message.state_root + let found = dag.getState(dag.db, stateRoot, dag.tail, result) + # TODO turn into regular error, this can happen + doAssert found, "Failed to load tail state, database corrupt?" + +proc delState(dag: CandidateChains, bs: BlockSlot) = + # Delete state state and mapping for a particular block+slot + if (let root = dag.db.getStateRoot(bs.blck.root, bs.slot); root.isSome()): + dag.db.delState(root.get()) + +proc updateHead*(dag: CandidateChains, newHead: BlockRef) = + ## Update what we consider to be the current head, as given by the fork + ## choice. + ## The choice of head affects the choice of finalization point - the order + ## of operations naturally becomes important here - after updating the head, + ## blocks that were once considered potential candidates for a tree will + ## now fall from grace, or no longer be considered resolved. + doAssert newHead.parent != nil or newHead.slot == 0 + logScope: pcs = "fork_choice" + + if dag.head.blck == newHead: + info "No head block update", + head = shortLog(newHead), + cat = "fork_choice" + + return + + let + lastHead = dag.head + dag.db.putHeadBlock(newHead.root) + + # Start off by making sure we have the right state + updateStateData( + dag, dag.headState, BlockSlot(blck: newHead, slot: newHead.slot)) + + let + justifiedSlot = dag.headState.data.data + .current_justified_checkpoint + .epoch + .compute_start_slot_at_epoch() + justifiedBS = newHead.atSlot(justifiedSlot) + + dag.head = Head(blck: newHead, justified: justifiedBS) + updateStateData(dag, dag.justifiedState, justifiedBS) + + # TODO isAncestorOf may be expensive - too expensive? + if not lastHead.blck.isAncestorOf(newHead): + info "Updated head block (new parent)", + lastHead = shortLog(lastHead.blck), + headParent = shortLog(newHead.parent), + stateRoot = shortLog(dag.headState.data.root), + headBlock = shortLog(dag.headState.blck), + stateSlot = shortLog(dag.headState.data.data.slot), + justifiedEpoch = shortLog(dag.headState.data.data.current_justified_checkpoint.epoch), + finalizedEpoch = shortLog(dag.headState.data.data.finalized_checkpoint.epoch), + cat = "fork_choice" + + # A reasonable criterion for "reorganizations of the chain" + beacon_reorgs_total.inc() + else: + info "Updated head block", + stateRoot = shortLog(dag.headState.data.root), + headBlock = shortLog(dag.headState.blck), + stateSlot = shortLog(dag.headState.data.data.slot), + justifiedEpoch = shortLog(dag.headState.data.data.current_justified_checkpoint.epoch), + finalizedEpoch = shortLog(dag.headState.data.data.finalized_checkpoint.epoch), + cat = "fork_choice" + + let + finalizedEpochStartSlot = + dag.headState.data.data.finalized_checkpoint.epoch. + compute_start_slot_at_epoch() + # TODO there might not be a block at the epoch boundary - what then? 
+ finalizedHead = newHead.atSlot(finalizedEpochStartSlot) + + doAssert (not finalizedHead.blck.isNil), + "Block graph should always lead to a finalized block" + + if finalizedHead != dag.finalizedHead: + block: # Remove states, walking slot by slot + discard + # TODO this is very aggressive - in theory all our operations start at + # the finalized block so all states before that can be wiped.. + # TODO this is disabled for now because the logic for initializing the + # block dag and potentially a few other places depend on certain + # states (like the tail state) being present. It's also problematic + # because it is not clear what happens when tail and finalized states + # happen on an empty slot.. + # var cur = finalizedHead + # while cur != dag.finalizedHead: + # cur = cur.parent + # dag.delState(cur) + + block: # Clean up block refs, walking block by block + var cur = finalizedHead.blck + while cur != dag.finalizedHead.blck: + # Finalization means that we choose a single chain as the canonical one - + # it also means we're no longer interested in any branches from that chain + # up to the finalization point. + # The new finalized head should not be cleaned! We start at its parent and + # clean everything including the old finalized head. + cur = cur.parent + + # TODO what about attestations? we need to drop those too, though they + # *should* be pretty harmless + if cur.parent != nil: # This happens for the genesis / tail block + for child in cur.parent.children: + if child != cur: + # TODO also remove states associated with the unviable forks! + # TODO the easiest thing to do here would probably be to use + # dag.heads to find unviable heads, then walk those chains + # and remove everything.. currently, if there's a child with + # children of its own, those children will not be pruned + # correctly from the database + dag.blocks.del(child.root) + dag.db.delBlock(child.root) + cur.parent.children = @[cur] + + dag.finalizedHead = finalizedHead + + let hlen = dag.heads.len + for i in 0.. 0, + "We should have at least the genesis block in heaads" + doAssert (not dag.head.blck.isNil()), + "Genesis block will be head, if nothing else" + + # Prefer stability: use justified block from current head to break ties! + result = dag.head.justified + for head in dag.heads[1 ..< ^0]: + if head.justified.slot > result.slot: + result = head.justified + +proc isInitialized*(T: type CandidateChains, db: BeaconChainDB): bool = + let + headBlockRoot = db.getHeadBlock() + tailBlockRoot = db.getTailBlock() + + if not (headBlockRoot.isSome() and tailBlockRoot.isSome()): + return false + + let + headBlock = db.getBlock(headBlockRoot.get()) + tailBlock = db.getBlock(tailBlockRoot.get()) + + if not (headBlock.isSome() and tailBlock.isSome()): + return false + + if not db.containsState(tailBlock.get().message.state_root): + return false + + return true + +proc preInit*( + T: type CandidateChains, db: BeaconChainDB, state: BeaconState, + signedBlock: SignedBeaconBlock) = + # write a genesis state, the way the CandidateChains expects it to be stored in + # database + # TODO probably should just init a blockpool with the freshly written + # state - but there's more refactoring needed to make it nice - doing + # a minimal patch for now.. 
+ let + blockRoot = hash_tree_root(signedBlock.message) + + doAssert signedBlock.message.state_root == hash_tree_root(state) + notice "New database from snapshot", + blockRoot = shortLog(blockRoot), + stateRoot = shortLog(signedBlock.message.state_root), + fork = state.fork, + validators = state.validators.len(), + cat = "initialization" + + db.putState(state) + db.putBlock(signedBlock) + db.putTailBlock(blockRoot) + db.putHeadBlock(blockRoot) + db.putStateRoot(blockRoot, state.slot, signedBlock.message.state_root) + +proc getProposer*(dag: CandidateChains, head: BlockRef, slot: Slot): Option[ValidatorPubKey] = + dag.withState(dag.tmpState, head.atSlot(slot)): + var cache = get_empty_per_epoch_cache() + + # https://github.com/ethereum/eth2.0-specs/blob/v0.11.1/specs/phase0/validator.md#validator-assignments + let proposerIdx = get_beacon_proposer_index(state, cache) + if proposerIdx.isNone: + warn "Missing proposer index", + slot=slot, + epoch=slot.compute_epoch_at_slot, + num_validators=state.validators.len, + active_validators= + get_active_validator_indices(state, slot.compute_epoch_at_slot), + balances=state.balances + return + + return some(state.validators[proposerIdx.get()].pubkey) diff --git a/beacon_chain/block_pools/clearance.nim b/beacon_chain/block_pools/clearance.nim new file mode 100644 index 000000000..391d47c91 --- /dev/null +++ b/beacon_chain/block_pools/clearance.nim @@ -0,0 +1,351 @@ +# beacon_chain +# Copyright (c) 2018-2020 Status Research & Development GmbH +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at https://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at https://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. 
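For orientation, a minimal bootstrap sketch tying together the `isInitialized`, `preInit` and `init` entry points defined above; the `openDag` wrapper and the origin of `genesisState`/`genesisBlock` are assumptions for illustration, not part of this patch:

import
  ../spec/datatypes,
  ../beacon_chain_db,
  block_pools_types, candidate_chains

proc openDag(db: BeaconChainDB, genesisState: BeaconState,
             genesisBlock: SignedBeaconBlock): CandidateChains =
  ## Seed a fresh database with the genesis snapshot if needed, then build
  ## the in-memory DAG of candidate chains from whatever the db contains.
  if not CandidateChains.isInitialized(db):
    CandidateChains.preInit(db, genesisState, genesisBlock)
  CandidateChains.init(db)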
+ +import + chronicles, tables, + metrics, + ../ssz, ../state_transition, ../extras, + ../spec/[crypto, datatypes, digest, helpers], + + block_pools_types, candidate_chains + +# Clearance +# --------------------------------------------- +# +# This module is in charge of making the +# "quarantined" network blocks +# pass the firewall and be stored in the blockpool + +logScope: topics = "clearblk" +{.push raises: [Defect].} + +func getOrResolve*(dag: CandidateChains, quarantine: var Quarantine, root: Eth2Digest): BlockRef = + ## Fetch a block ref, or nil if not found (will be added to list of + ## blocks-to-resolve) + result = dag.getRef(root) + + if result.isNil: + quarantine.missing[root] = MissingBlock(slots: 1) + +proc add*( + dag: var CandidateChains, quarantine: var Quarantine, + blockRoot: Eth2Digest, + signedBlock: SignedBeaconBlock): BlockRef {.gcsafe.} + +proc addResolvedBlock( + dag: var CandidateChains, quarantine: var Quarantine, + state: BeaconState, blockRoot: Eth2Digest, + signedBlock: SignedBeaconBlock, parent: BlockRef): BlockRef = + logScope: pcs = "block_resolution" + doAssert state.slot == signedBlock.message.slot, "state must match block" + + let blockRef = BlockRef.init(blockRoot, signedBlock.message) + link(parent, blockRef) + + dag.blocks[blockRoot] = blockRef + trace "Populating block dag", key = blockRoot, val = blockRef + + # Resolved blocks should be stored in database + dag.putBlock(blockRoot, signedBlock) + + # This block *might* have caused a justification - make sure we stow away + # that information: + let justifiedSlot = + state.current_justified_checkpoint.epoch.compute_start_slot_at_epoch() + + var foundHead: Option[Head] + for head in dag.heads.mitems(): + if head.blck.isAncestorOf(blockRef): + if head.justified.slot != justifiedSlot: + head.justified = blockRef.atSlot(justifiedSlot) + + head.blck = blockRef + + foundHead = some(head) + break + + if foundHead.isNone(): + foundHead = some(Head( + blck: blockRef, + justified: blockRef.atSlot(justifiedSlot))) + dag.heads.add(foundHead.get()) + + info "Block resolved", + blck = shortLog(signedBlock.message), + blockRoot = shortLog(blockRoot), + justifiedHead = foundHead.get().justified, + heads = dag.heads.len(), + cat = "filtering" + + # Now that we have the new block, we should see if any of the previously + # unresolved blocks magically become resolved + # TODO there are more efficient ways of doing this that don't risk + # running out of stack etc + # TODO This code is convoluted because when there are more than ~1.5k + # blocks being synced, there's a stack overflow as `add` gets called + # for the whole chain of blocks. Instead we use this ugly field in `dag` + # which could be avoided by refactoring the code + if not quarantine.inAdd: + quarantine.inAdd = true + defer: quarantine.inAdd = false + var keepGoing = true + while keepGoing: + let retries = quarantine.pending + for k, v in retries: + discard add(dag, quarantine, k, v) + # Keep going for as long as the pending dag is shrinking + # TODO inefficient! so what? + keepGoing = quarantine.pending.len < retries.len + blockRef + +proc add*( + dag: var CandidateChains, quarantine: var Quarantine, + blockRoot: Eth2Digest, + signedBlock: SignedBeaconBlock): BlockRef {.gcsafe.} = + ## return the block, if resolved... 
+ ## the state parameter may be updated to include the given block, if + ## everything checks out + # TODO reevaluate passing the state in like this + let blck = signedBlock.message + doAssert blockRoot == hash_tree_root(blck) + + logScope: pcs = "block_addition" + + # Already seen this block?? + dag.blocks.withValue(blockRoot, blockRef): + debug "Block already exists", + blck = shortLog(blck), + blockRoot = shortLog(blockRoot), + cat = "filtering" + + return blockRef[] + + quarantine.missing.del(blockRoot) + + # If the block we get is older than what we finalized already, we drop it. + # One way this can happen is that we start resolving a block and finalization + # happens in the meantime - the block we requested will then be stale + # by the time it gets here. + if blck.slot <= dag.finalizedHead.slot: + debug "Old block, dropping", + blck = shortLog(blck), + finalizedHead = shortLog(dag.finalizedHead), + tail = shortLog(dag.tail), + blockRoot = shortLog(blockRoot), + cat = "filtering" + + return + + let parent = dag.blocks.getOrDefault(blck.parent_root) + + if parent != nil: + if parent.slot >= blck.slot: + # TODO Malicious block? inform peer dag? + notice "Invalid block slot", + blck = shortLog(blck), + blockRoot = shortLog(blockRoot), + parentBlock = shortLog(parent) + + return + + # The block might have been in either of pending or missing - we don't want + # any more work done on its behalf + quarantine.pending.del(blockRoot) + + # The block is resolved, now it's time to validate it to ensure that the + # blocks we add to the database are clean for the given state + + # TODO if the block is from the future, we should not be resolving it (yet), + # but maybe we should use it as a hint that our clock is wrong? + updateStateData( + dag, dag.tmpState, BlockSlot(blck: parent, slot: blck.slot - 1)) + + let + poolPtr = unsafeAddr dag # safe because restore is short-lived + func restore(v: var HashedBeaconState) = + # TODO address this ugly workaround - there should probably be a + # `state_transition` that takes a `StateData` instead and updates + # the block as well + doAssert v.addr == addr poolPtr.tmpState.data + poolPtr.tmpState = poolPtr.headState + + if not state_transition( + dag.tmpState.data, signedBlock, dag.updateFlags, restore): + # TODO find a better way to log all this block data + notice "Invalid block", + blck = shortLog(blck), + blockRoot = shortLog(blockRoot), + cat = "filtering" + + return + # Careful, tmpState.data has been updated but not blck - we need to create + # the BlockRef first! + dag.tmpState.blck = addResolvedBlock( + dag, quarantine, + dag.tmpState.data.data, blockRoot, signedBlock, parent) + dag.putState(dag.tmpState.data, dag.tmpState.blck) + + return dag.tmpState.blck + + # TODO already checked hash though? main reason to keep this is because + # the pending dag calls this function back later in a loop, so as long + # as dag.add(...) requires a SignedBeaconBlock, easier to keep them in + # pending too. + quarantine.pending[blockRoot] = signedBlock + + # TODO possibly, it makes sense to check the database - that would allow sync + # to simply fill up the database with random blocks the other clients + # think are useful - but, it would also risk filling the database with + # junk that's not part of the block graph + + if blck.parent_root in quarantine.missing or + blck.parent_root in quarantine.pending: + return + + # This is an unresolved block - put its parent on the missing list for now... 
+ # TODO if we receive spam blocks, one heurestic to implement might be to wait + # for a couple of attestations to appear before fetching parents - this + # would help prevent using up network resources for spam - this serves + # two purposes: one is that attestations are likely to appear for the + # block only if it's valid / not spam - the other is that malicious + # validators that are not proposers can sign invalid blocks and send + # them out without penalty - but signing invalid attestations carries + # a risk of being slashed, making attestations a more valuable spam + # filter. + # TODO when we receive the block, we don't know how many others we're missing + # from that branch, so right now, we'll just do a blind guess + let parentSlot = blck.slot - 1 + + quarantine.missing[blck.parent_root] = MissingBlock( + slots: + # The block is at least two slots ahead - try to grab whole history + if parentSlot > dag.head.blck.slot: + parentSlot - dag.head.blck.slot + else: + # It's a sibling block from a branch that we're missing - fetch one + # epoch at a time + max(1.uint64, SLOTS_PER_EPOCH.uint64 - + (parentSlot.uint64 mod SLOTS_PER_EPOCH.uint64)) + ) + + debug "Unresolved block (parent missing)", + blck = shortLog(blck), + blockRoot = shortLog(blockRoot), + pending = quarantine.pending.len, + missing = quarantine.missing.len, + cat = "filtering" + +# https://github.com/ethereum/eth2.0-specs/blob/v0.11.1/specs/phase0/p2p-interface.md#global-topics +proc isValidBeaconBlock*( + dag: CandidateChains, quarantine: var Quarantine, + signed_beacon_block: SignedBeaconBlock, current_slot: Slot, + flags: UpdateFlags): bool = + # In general, checks are ordered from cheap to expensive. Especially, crypto + # verification could be quite a bit more expensive than the rest. This is an + # externally easy-to-invoke function by tossing network packets at the node. + + # The block is not from a future slot + # TODO allow `MAXIMUM_GOSSIP_CLOCK_DISPARITY` leniency, especially towards + # seemingly future slots. + if not (signed_beacon_block.message.slot <= current_slot): + debug "isValidBeaconBlock: block is from a future slot", + signed_beacon_block_message_slot = signed_beacon_block.message.slot, + current_slot = current_slot + return false + + # The block is from a slot greater than the latest finalized slot (with a + # MAXIMUM_GOSSIP_CLOCK_DISPARITY allowance) -- i.e. validate that + # signed_beacon_block.message.slot > + # compute_start_slot_at_epoch(state.finalized_checkpoint.epoch) + if not (signed_beacon_block.message.slot > dag.finalizedHead.slot): + debug "isValidBeaconBlock: block is not from a slot greater than the latest finalized slot" + return false + + # The block is the first block with valid signature received for the proposer + # for the slot, signed_beacon_block.message.slot. + # + # While this condition is similar to the proposer slashing condition at + # https://github.com/ethereum/eth2.0-specs/blob/v0.11.1/specs/phase0/validator.md#proposer-slashing + # it's not identical, and this check does not address slashing: + # + # (1) The beacon blocks must be conflicting, i.e. different, for the same + # slot and proposer. This check also catches identical blocks. + # + # (2) By this point in the function, it's not been checked whether they're + # signed yet. As in general, expensive checks should be deferred, this + # would add complexity not directly relevant this function. 
+ # + # (3) As evidenced by point (1), the similarity in the validation condition + # and slashing condition, while not coincidental, aren't similar enough + # to combine, as one or the other might drift. + # + # (4) Furthermore, this function, as much as possible, simply returns a yes + # or no answer, without modifying other state for p2p network interface + # validation. Complicating this interface, for the sake of sharing only + # couple lines of code, wouldn't be worthwhile. + # + # TODO might check unresolved/orphaned blocks too, and this might not see all + # blocks at a given slot (though, in theory, those get checked elsewhere), or + # adding metrics that count how often these conditions occur. + let + slotBlockRef = getBlockBySlot(dag, signed_beacon_block.message.slot) + + if not slotBlockRef.isNil: + let blck = dag.get(slotBlockRef).data + if blck.message.proposer_index == + signed_beacon_block.message.proposer_index and + blck.message.slot == signed_beacon_block.message.slot and + blck.signature.toRaw() != signed_beacon_block.signature.toRaw(): + debug "isValidBeaconBlock: block isn't first block with valid signature received for the proposer", + signed_beacon_block_message_slot = signed_beacon_block.message.slot, + blckRef = slotBlockRef, + received_block = shortLog(signed_beacon_block.message), + existing_block = shortLog(dag.get(slotBlockRef).data.message) + return false + + # If this block doesn't have a parent we know about, we can't/don't really + # trace it back to a known-good state/checkpoint to verify its prevenance; + # while one could getOrResolve to queue up searching for missing parent it + # might not be the best place. As much as feasible, this function aims for + # answering yes/no, not queuing other action or otherwise altering state. + let parent_ref = dag.getRef(signed_beacon_block.message.parent_root) + if parent_ref.isNil: + # This doesn't mean a block is forever invalid, only that we haven't seen + # its ancestor blocks yet. While that means for now it should be blocked, + # at least, from libp2p propagation, it shouldn't be ignored. TODO, if in + # the future this block moves from pending to being resolved, consider if + # it's worth broadcasting it then. + + # Pending dag gets checked via `CandidateChains.add(...)` later, and relevant + # checks are performed there. In usual paths beacon_node adds blocks via + # CandidateChains.add(...) directly, with no additional validity checks. TODO, + # not specific to this, but by the pending dag keying on the htr of the + # BeaconBlock, not SignedBeaconBlock, opens up certain spoofing attacks. + quarantine.pending[hash_tree_root(signed_beacon_block.message)] = + signed_beacon_block + return false + + # The proposer signature, signed_beacon_block.signature, is valid with + # respect to the proposer_index pubkey. 
+ let bs = + BlockSlot(blck: parent_ref, slot: dag.get(parent_ref).data.message.slot) + dag.withState(dag.tmpState, bs): + let + blockRoot = hash_tree_root(signed_beacon_block.message) + domain = get_domain(dag.headState.data.data, DOMAIN_BEACON_PROPOSER, + compute_epoch_at_slot(signed_beacon_block.message.slot)) + signing_root = compute_signing_root(blockRoot, domain) + proposer_index = signed_beacon_block.message.proposer_index + + if proposer_index >= dag.headState.data.data.validators.len.uint64: + return false + if not blsVerify(dag.headState.data.data.validators[proposer_index].pubkey, + signing_root.data, signed_beacon_block.signature): + debug "isValidBeaconBlock: block failed signature verification" + return false + + true diff --git a/beacon_chain/block_pools/quarantine.nim b/beacon_chain/block_pools/quarantine.nim new file mode 100644 index 000000000..d224d8ed0 --- /dev/null +++ b/beacon_chain/block_pools/quarantine.nim @@ -0,0 +1,38 @@ +# beacon_chain +# Copyright (c) 2018-2020 Status Research & Development GmbH +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at https://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at https://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +import + chronicles, tables, + stew/bitops2, + metrics, + ../spec/digest, + + block_pools_types + +logScope: topics = "quarant" +{.push raises: [Defect].} + +func checkMissing*(quarantine: var Quarantine): seq[FetchRecord] = + ## Return a list of blocks that we should try to resolve from other client - + ## to be called periodically but not too often (once per slot?) + var done: seq[Eth2Digest] + + for k, v in quarantine.missing.mpairs(): + if v.tries > 8: + done.add(k) + else: + inc v.tries + + for k in done: + # TODO Need to potentially remove from quarantine.pending - this is currently a + # memory leak here! + quarantine.missing.del(k) + + # simple (simplistic?) exponential backoff for retries.. + for k, v in quarantine.missing.pairs(): + if countOnes(v.tries.uint64) == 1: + result.add(FetchRecord(root: k, historySlots: v.slots))
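To make the retry policy above concrete: `tries` counts attempts, a `FetchRecord` is emitted only when `tries` is a power of two (attempts 1, 2, 4 and 8), and an entry is dropped once it has been tried more than 8 times. A hedged sketch of how a sync loop might consume `checkMissing` roughly once per slot; `requestBlocksFromPeers` and `fetchMissing` are placeholder names for illustration, not APIs from this patch:

import
  ../spec/digest,
  block_pools_types, quarantine

proc requestBlocksFromPeers(root: Eth2Digest, historySlots: uint64) =
  # Placeholder: hand the missing root to whatever peer/sync machinery is used.
  discard

proc fetchMissing(quar: var Quarantine) =
  # Call roughly once per slot, per the checkMissing doc comment above.
  for record in quar.checkMissing():
    requestBlocksFromPeers(record.root, record.historySlots)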