From a16f5afcd5314e6acb057dbff7d2abb886aa9971 Mon Sep 17 00:00:00 2001
From: Dustin Brody
Date: Tue, 27 Oct 2020 18:21:35 +0100
Subject: [PATCH] pre-emptive duplicate validator detection heuristic

---
 beacon_chain/attestation_pool.nim   | 10 +++---
 beacon_chain/conf.nim               | 11 +++++++
 beacon_chain/eth2_processor.nim     | 51 ++++++++++++++++++++++++++++-
 beacon_chain/nimbus_beacon_node.nim | 44 ++++++++++++++++++++++++-
 beacon_chain/spec/datatypes.nim     |  4 +++
 beacon_chain/validator_duties.nim   | 15 +++++++++
 scripts/launch_local_testnet.sh     |  1 +
 tests/simulation/run_node.sh        |  5 +--
 8 files changed, 132 insertions(+), 9 deletions(-)

diff --git a/beacon_chain/attestation_pool.nim b/beacon_chain/attestation_pool.nim
index 83e8647ae..c817f3c73 100644
--- a/beacon_chain/attestation_pool.nim
+++ b/beacon_chain/attestation_pool.nim
@@ -135,18 +135,18 @@ proc updateCurrent(pool: var AttestationPool, wallSlot: Slot) =
 
 func addToAggregates(pool: var AttestationPool, attestation: Attestation) =
   # do a lookup for the current slot and get it's associated htrs/attestations
-  var aggreated_attestation = pool.attestationAggregates.mgetOrPut(
+  var aggregated_attestation = pool.attestationAggregates.mgetOrPut(
     attestation.data.slot, Table[Eth2Digest, Attestation]()).
     # do a lookup for the same attestation data htr and get the attestation
     mgetOrPut(attestation.data.hash_tree_root, attestation)
   # if the aggregation bits differ (we didn't just insert it into the table)
   # and only if there is no overlap of the signatures ==> aggregate!
-  if not aggreated_attestation.aggregation_bits.overlaps(attestation.aggregation_bits):
+  if not aggregated_attestation.aggregation_bits.overlaps(attestation.aggregation_bits):
     var agg {.noInit.}: AggregateSignature
-    agg.init(aggreated_attestation.signature)
-    aggreated_attestation.aggregation_bits.combine(attestation.aggregation_bits)
+    agg.init(aggregated_attestation.signature)
+    aggregated_attestation.aggregation_bits.combine(attestation.aggregation_bits)
     agg.aggregate(attestation.signature)
-    aggreated_attestation.signature = agg.finish()
+    aggregated_attestation.signature = agg.finish()
 
 proc addAttestation*(pool: var AttestationPool,
                      attestation: Attestation,
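Note: the attestation_pool.nim hunk only renames the misspelled aggreated_attestation; behavior is unchanged. For orientation, the rule it touches is: two attestations for the same AttestationData may be aggregated only if their aggregation bitfields are disjoint, since a validator counted twice would make the aggregate invalid. A runnable toy model of just the bitfield rule, with std/sets standing in for CommitteeValidatorsBits (not patch code; the real code also folds the BLS signatures together via AggregateSignature init/aggregate/finish):

  import std/sets

  var accBits = toHashSet([0, 3])   # validators already in the aggregate
  let newBits = toHashSet([1, 2])   # validators in the incoming attestation

  # disjoint() plays the role of `not overlaps()` in the real code,
  # and set union plays the role of combine()
  if disjoint(accBits, newBits):
    accBits = accBits + newBits
  doAssert accBits.len == 4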
desc: "A LaunchPad deposits file for the genesis state validators" diff --git a/beacon_chain/eth2_processor.nim b/beacon_chain/eth2_processor.nim index c084400c4..79608c78c 100644 --- a/beacon_chain/eth2_processor.nim +++ b/beacon_chain/eth2_processor.nim @@ -13,7 +13,7 @@ import chronicles, chronos, metrics, ./spec/[crypto, datatypes, digest], ./block_pools/[clearance, chain_dag], - ./attestation_aggregation, ./exit_pool, + ./attestation_aggregation, ./exit_pool, ./validator_pool, ./beacon_node_types, ./attestation_pool, ./time, ./conf, ./sszdump @@ -31,6 +31,9 @@ declareCounter beacon_proposer_slashings_received, declareCounter beacon_voluntary_exits_received, "Number of beacon chain voluntary exits received by this peer" +declareCounter beacon_duplicate_validator_protection_activated, + "Number of times duplicate validator protection was activated" + const delayBuckets = [2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, Inf] declareHistogram beacon_attestation_delay, @@ -67,6 +70,7 @@ type chainDag*: ChainDAGRef attestationPool*: ref AttestationPool exitPool: ref ExitPool + validatorPool: ref ValidatorPool quarantine*: QuarantineRef blockReceivedDuringSlot*: Future[void] @@ -74,6 +78,8 @@ type attestationsQueue*: AsyncQueue[AttestationEntry] aggregatesQueue*: AsyncQueue[AggregateEntry] + gossipSlashingProtection*: DupProtection + proc updateHead*(self: var Eth2Processor, wallSlot: Slot) = ## Trigger fork choice and returns the new head block. ## Can return `nil` @@ -298,6 +304,42 @@ proc blockValidator*( {.push raises: [Defect].} +proc checkForPotentialSelfSlashing( + self: var Eth2Processor, attestationData: AttestationData, + attesterIndices: HashSet[ValidatorIndex], wallSlot: Slot) = + # Attestations remain valid for 32 slots, so avoid confusing with one's own + # reflections, for a ATTESTATION_PROPAGATION_SLOT_RANGE div SLOTS_PER_EPOCH + # period after the attestation slot. For mainnet this can be one additional + # epoch, and for minimal, four epochs. Unlike in the attestation validation + # checks, use the spec version of the constant here. + const + # https://github.com/ethereum/eth2.0-specs/blob/v1.0.0/specs/phase0/p2p-interface.md#configuration + ATTESTATION_PROPAGATION_SLOT_RANGE = 32 + + GUARD_EPOCHS = ATTESTATION_PROPAGATION_SLOT_RANGE div SLOTS_PER_EPOCH + + # If gossipSlashingProtection not dontcheck or stop, it's the default "warn". + let epoch = wallSlot.epoch + if epoch < self.gossipSlashingProtection.broadcastStartEpoch and + epoch >= self.gossipSlashingProtection.probeEpoch and + epoch <= self.gossipSlashingProtection.probeEpoch + GUARD_EPOCHS: + let tgtBlck = self.chainDag.getRef(attestationData.target.root) + doAssert not tgtBlck.isNil # because attestation is valid above + + let epochRef = self.chainDag.getEpochRef( + tgtBlck, attestationData.target.epoch) + for validatorIndex in attesterIndices: + let validatorPubkey = epochRef.validator_keys[validatorIndex] + if self.validatorPool[].getValidator(validatorPubkey) != + default(AttachedValidator): + warn "Duplicate validator detected; would be slashed", + validatorIndex, + validatorPubkey + beacon_duplicate_validator_protection_activated.inc() + if self.config.gossipSlashingProtection == GossipSlashingProtectionMode.stop: + warn "We believe you are currently running another instance of the same validator. We've disconnected you from the network as this presents a significant slashing risk. 
diff --git a/beacon_chain/eth2_processor.nim b/beacon_chain/eth2_processor.nim
index c084400c4..79608c78c 100644
--- a/beacon_chain/eth2_processor.nim
+++ b/beacon_chain/eth2_processor.nim
@@ -13,7 +13,7 @@ import
   chronicles, chronos, metrics,
   ./spec/[crypto, datatypes, digest],
   ./block_pools/[clearance, chain_dag],
-  ./attestation_aggregation, ./exit_pool,
+  ./attestation_aggregation, ./exit_pool, ./validator_pool,
   ./beacon_node_types, ./attestation_pool,
   ./time, ./conf, ./sszdump
 
@@ -31,6 +31,9 @@ declareCounter beacon_proposer_slashings_received,
 declareCounter beacon_voluntary_exits_received,
   "Number of beacon chain voluntary exits received by this peer"
 
+declareCounter beacon_duplicate_validator_protection_activated,
+  "Number of times duplicate validator protection was activated"
+
 const delayBuckets = [2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, Inf]
 
 declareHistogram beacon_attestation_delay,
@@ -67,6 +70,7 @@ type
     chainDag*: ChainDAGRef
     attestationPool*: ref AttestationPool
     exitPool: ref ExitPool
+    validatorPool: ref ValidatorPool
     quarantine*: QuarantineRef
     blockReceivedDuringSlot*: Future[void]
@@ -74,6 +78,8 @@
     attestationsQueue*: AsyncQueue[AttestationEntry]
     aggregatesQueue*: AsyncQueue[AggregateEntry]
 
+    gossipSlashingProtection*: DupProtection
+
 proc updateHead*(self: var Eth2Processor, wallSlot: Slot) =
   ## Trigger fork choice and returns the new head block.
   ## Can return `nil`
@@ -298,6 +304,42 @@ proc blockValidator*(
 
 {.push raises: [Defect].}
 
+proc checkForPotentialSelfSlashing(
+    self: var Eth2Processor, attestationData: AttestationData,
+    attesterIndices: HashSet[ValidatorIndex], wallSlot: Slot) =
+  # Attestations remain valid for 32 slots, so avoid confusion with one's own
+  # reflections for an ATTESTATION_PROPAGATION_SLOT_RANGE div SLOTS_PER_EPOCH
+  # period after the attestation slot. For mainnet this can be one additional
+  # epoch, and for minimal, four epochs. Unlike in the attestation validation
+  # checks, use the spec version of the constant here.
+  const
+    # https://github.com/ethereum/eth2.0-specs/blob/v1.0.0/specs/phase0/p2p-interface.md#configuration
+    ATTESTATION_PROPAGATION_SLOT_RANGE = 32
+
+    GUARD_EPOCHS = ATTESTATION_PROPAGATION_SLOT_RANGE div SLOTS_PER_EPOCH
+
+  # If gossipSlashingProtection is neither dontcheck nor stop, it's the
+  # default, warn.
+  let epoch = wallSlot.epoch
+  if epoch < self.gossipSlashingProtection.broadcastStartEpoch and
+      epoch >= self.gossipSlashingProtection.probeEpoch and
+      epoch <= self.gossipSlashingProtection.probeEpoch + GUARD_EPOCHS:
+    let tgtBlck = self.chainDag.getRef(attestationData.target.root)
+    doAssert not tgtBlck.isNil  # because attestation is valid above
+
+    let epochRef = self.chainDag.getEpochRef(
+      tgtBlck, attestationData.target.epoch)
+    for validatorIndex in attesterIndices:
+      let validatorPubkey = epochRef.validator_keys[validatorIndex]
+      if self.validatorPool[].getValidator(validatorPubkey) !=
+          default(AttachedValidator):
+        warn "Duplicate validator detected; would be slashed",
+          validatorIndex,
+          validatorPubkey
+        beacon_duplicate_validator_protection_activated.inc()
+        if self.config.gossipSlashingProtection == GossipSlashingProtectionMode.stop:
+          warn "We believe you are currently running another instance of the same validator. We've disconnected you from the network as this presents a significant slashing risk. Possible next steps are (a) making sure you've disconnected your validator from your old machine before restarting the client; and (b) running the client again with the gossip-slashing-protection option disabled, only if you are absolutely sure this is the only instance of your validator running, and reporting the issue at https://github.com/status-im/nimbus-eth2/issues."
+          quit QuitFailure
+
 proc attestationValidator*(
     self: var Eth2Processor,
     attestation: Attestation,
@@ -329,6 +371,8 @@ proc attestationValidator*(
   beacon_attestations_received.inc()
   beacon_attestation_delay.observe(delay.toFloatSeconds())
 
+  self.checkForPotentialSelfSlashing(attestation.data, v.value, wallSlot)
+
   while self.attestationsQueue.full():
     try:
       notice "Queue full, dropping attestation",
@@ -381,6 +425,9 @@ proc aggregateValidator*(
   beacon_aggregates_received.inc()
   beacon_aggregate_delay.observe(delay.toFloatSeconds())
 
+  self.checkForPotentialSelfSlashing(
+    signedAggregateAndProof.message.aggregate.data, v.value, wallSlot)
+
   while self.aggregatesQueue.full():
     try:
       notice "Queue full, dropping aggregate",
@@ -500,6 +547,7 @@ proc new*(T: type Eth2Processor,
           chainDag: ChainDAGRef,
           attestationPool: ref AttestationPool,
           exitPool: ref ExitPool,
+          validatorPool: ref ValidatorPool,
           quarantine: QuarantineRef,
           getWallTime: GetWallTimeFn): ref Eth2Processor =
   (ref Eth2Processor)(
@@ -508,6 +556,7 @@ proc new*(T: type Eth2Processor,
     chainDag: chainDag,
     attestationPool: attestationPool,
     exitPool: exitPool,
+    validatorPool: validatorPool,
     quarantine: quarantine,
     blockReceivedDuringSlot: newFuture[void](),
     blocksQueue: newAsyncQueue[BlockEntry](1),
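Note: the guard window is easy to check by hand. With mainnet's SLOTS_PER_EPOCH = 32, GUARD_EPOCHS = 32 div 32 = 1, so a reflected attestation counts as a potential duplicate for one epoch past the probe; the minimal preset's SLOTS_PER_EPOCH = 8 gives 32 div 8 = 4, matching the "one additional epoch / four epochs" comment above. A minimal standalone sketch of just the window predicate, assuming mainnet constants (`inDetectionWindow` is a hypothetical name; the patch inlines this test):

  const
    ATTESTATION_PROPAGATION_SLOT_RANGE = 32'u64
    SLOTS_PER_EPOCH = 32'u64                 # mainnet preset
    GUARD_EPOCHS = ATTESTATION_PROPAGATION_SLOT_RANGE div SLOTS_PER_EPOCH

  # Epochs in which an attestation from one of our keys is treated as a
  # potential duplicate validator rather than normal traffic.
  func inDetectionWindow(epoch, probeEpoch, broadcastStartEpoch: uint64): bool =
    epoch < broadcastStartEpoch and
      epoch >= probeEpoch and
      epoch <= probeEpoch + GUARD_EPOCHS

  # with probeEpoch = 11 and broadcastStartEpoch = 12:
  doAssert inDetectionWindow(11, 11, 12)      # the probe epoch itself
  doAssert not inDetectionWindow(10, 11, 12)  # before the probe
  doAssert not inDetectionWindow(12, 11, 12)  # normal broadcasting resumed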
diff --git a/beacon_chain/nimbus_beacon_node.nim b/beacon_chain/nimbus_beacon_node.nim
index c697b8b20..df1a937eb 100644
--- a/beacon_chain/nimbus_beacon_node.nim
+++ b/beacon_chain/nimbus_beacon_node.nim
@@ -322,7 +322,8 @@ proc init*(T: type BeaconNode,
   proc getWallTime(): BeaconTime = res.beaconClock.now()
 
   res.processor = Eth2Processor.new(
-    conf, chainDag, attestationPool, exitPool, quarantine, getWallTime)
+    conf, chainDag, attestationPool, exitPool, newClone(res.attachedValidators),
+    quarantine, getWallTime)
   res.requestManager = RequestManager.init(
     network, res.processor.blocksQueue)
 
@@ -627,6 +628,45 @@ proc removeMessageHandlers(node: BeaconNode) =
   for subnet in 0'u64 ..< ATTESTATION_SUBNET_COUNT:
     node.network.unsubscribe(getAttestationTopic(node.forkDigest, subnet))
 
+proc setupSelfSlashingProtection(node: BeaconNode, slot: Slot) =
+  # When another client is already running, this is very likely to detect
+  # potential duplicate validators, which can trigger slashing. Assuming
+  # the most pessimal case of two validators started simultaneously, the
+  # probability of triggering a slashable condition is up to 1/n, with n
+  # being the number of epochs one waits before proposing or attesting.
+  #
+  # Every missed attestation costs approximately 3*get_base_reward(), which
+  # can be up to around 10,000 Wei. Thus, skipping attestations isn't cheap
+  # and one should gauge the likelihood of this simultaneous launch to tune
+  # the epoch delay to one's perceived risk.
+  #
+  # This approach catches both startup and network outage conditions.
+
+  const duplicateValidatorEpochs = 2
+
+  node.processor.gossipSlashingProtection.broadcastStartEpoch =
+    slot.epoch + duplicateValidatorEpochs
+
+  # randomize() already called; also, never probe on the first epoch in the
+  # guard period, so that existing, running validators can be picked up.
+  # Whilst this reduces entropy for overlapping-start cases, and increases
+  # their collision likelihood, that can be compensated for by increasing
+  # guard epoch periods by 1. As a corollary, 1 guard epoch won't detect
+  # when a duplicate pair overlaps exactly, only the running/starting case.
+  # Even 2 epochs is dangerous because it'll guarantee colliding probes in
+  # the overlapping case.
+
+  # So duplicateValidatorEpochs == 2 -> epoch + 1, always;
+  # duplicateValidatorEpochs == 3 -> epoch + (1 or 2), etc.
+  node.processor.gossipSlashingProtection.probeEpoch =
+    slot.epoch + 1 + rand(duplicateValidatorEpochs.int - 2).uint64
+  doAssert node.processor.gossipSlashingProtection.probeEpoch <
+    node.processor.gossipSlashingProtection.broadcastStartEpoch
+
+  debug "Setting up self-slashing protection",
+    epoch = slot.epoch,
+    probeEpoch = node.processor.gossipSlashingProtection.probeEpoch,
+    broadcastStartEpoch =
+      node.processor.gossipSlashingProtection.broadcastStartEpoch
+
 proc updateGossipStatus(node: BeaconNode, slot: Slot) =
   # Syncing tends to be ~1 block/s, and allow for an epoch of time for libp2p
   # subscribing to spin up. The faster the sync, the more wallSlot - headSlot
@@ -660,6 +700,7 @@ proc updateGossipStatus(node: BeaconNode, slot: Slot) =
       headSlot = node.chainDag.head.slot,
       syncQueueLen
 
+    node.setupSelfSlashingProtection(slot)
     node.addMessageHandlers()
     doAssert node.getTopicSubscriptionEnabled()
   elif
@@ -978,6 +1019,7 @@ proc run*(node: BeaconNode) =
   node.startSyncManager()
 
   if not node.beaconClock.now().toSlot().afterGenesis:
+    node.setupSelfSlashingProtection(curSlot)
     node.addMessageHandlers()
     doAssert node.getTopicSubscriptionEnabled()
 
diff --git a/beacon_chain/spec/datatypes.nim b/beacon_chain/spec/datatypes.nim
index 609a46b03..1f037c408 100644
--- a/beacon_chain/spec/datatypes.nim
+++ b/beacon_chain/spec/datatypes.nim
@@ -534,6 +534,10 @@ type
     current_justified_checkpoint*: Checkpoint
     finalized_checkpoint*: Checkpoint
 
+  DupProtection* = object
+    broadcastStartEpoch*: Epoch
+    probeEpoch*: Epoch
+
 func shortValidatorKey*(state: BeaconState, validatorIdx: int): string =
   ($state.validators[validatorIdx].pubkey)[0..7]
 
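Note: the probe placement arithmetic, worked through: with duplicateValidatorEpochs == 2, broadcastStartEpoch = epoch + 2 and probeEpoch = epoch + 1 + rand(0) = epoch + 1, always; with 3, the probe lands on epoch + 1 or epoch + 2. A minimal runnable sketch of the same arithmetic (`pickProbeEpoch` is a hypothetical wrapper; the patch inlines this in setupSelfSlashingProtection):

  import std/random

  proc pickProbeEpoch(epoch: uint64, duplicateValidatorEpochs: int): uint64 =
    # skip the first guard epoch so already-running validators are seen;
    # randomize over the remaining epochs to reduce probe collisions
    # between two simultaneously started duplicates
    epoch + 1 + rand(duplicateValidatorEpochs - 2).uint64

  randomize()
  let probe = pickProbeEpoch(100, 2)
  doAssert probe == 101        # rand(0) == 0, so always epoch + 1
  doAssert probe < 102'u64     # strictly before broadcastStartEpoch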
diff --git a/beacon_chain/validator_duties.nim b/beacon_chain/validator_duties.nim
index 0d2d31bf6..e54863899 100644
--- a/beacon_chain/validator_duties.nim
+++ b/beacon_chain/validator_duties.nim
@@ -615,6 +615,21 @@ proc handleValidatorDuties*(node: BeaconNode, lastSlot, slot: Slot) {.async.} =
 
   var curSlot = lastSlot + 1
 
+  # The dontcheck option is a deliberately undocumented escape hatch for
+  # local testnets and similar development and testing use cases.
+  doAssert node.config.gossipSlashingProtection == GossipSlashingProtectionMode.dontcheck or (
+    node.processor[].gossipSlashingProtection.probeEpoch <
+    node.processor[].gossipSlashingProtection.broadcastStartEpoch)
+  if curSlot.epoch <
+        node.processor[].gossipSlashingProtection.broadcastStartEpoch and
+      curSlot.epoch != node.processor[].gossipSlashingProtection.probeEpoch and
+      node.config.gossipSlashingProtection == GossipSlashingProtectionMode.stop:
+    notice "Waiting to gossip out to detect potential duplicate validators",
+      broadcastStartEpoch =
+        node.processor[].gossipSlashingProtection.broadcastStartEpoch,
+      probeEpoch = node.processor[].gossipSlashingProtection.probeEpoch
+    return
+
   # Start by checking if there's work we should have done in the past that we
   # can still meaningfully do
   while curSlot < slot:
diff --git a/scripts/launch_local_testnet.sh b/scripts/launch_local_testnet.sh
index 6e72e4665..51986a51e 100755
--- a/scripts/launch_local_testnet.sh
+++ b/scripts/launch_local_testnet.sh
@@ -385,6 +385,7 @@ for NUM_NODE in $(seq 0 $(( NUM_NODES - 1 ))); do
       --metrics \
       --metrics-address="127.0.0.1" \
       --metrics-port="$(( BASE_METRICS_PORT + NUM_NODE ))" \
+      --gossip-slashing-protection=dontcheck \
       ${EXTRA_ARGS} \
       > "${DATA_DIR}/log${NUM_NODE}.txt" 2>&1 &
 
diff --git a/tests/simulation/run_node.sh b/tests/simulation/run_node.sh
index 2f1978901..f38283378 100755
--- a/tests/simulation/run_node.sh
+++ b/tests/simulation/run_node.sh
@@ -9,7 +9,7 @@ shift
 # shellcheck source=/dev/null
 source "$(dirname "$0")/vars.sh"
 
-if [[ ! -z "$1" ]]; then
+if [[ -n "$1" ]]; then
   ADDITIONAL_BEACON_NODE_ARGS=$1
   shift
 else
@@ -18,7 +18,7 @@ fi
 
 BOOTSTRAP_ARG=""
 
-if [[ ! -z "$1" ]]; then
+if [[ -n "$1" ]]; then
   BOOTSTRAP_NODE_ID=$1
   shift
 else
@@ -105,5 +105,6 @@
 $BEACON_NODE_BIN \
   --metrics \
   --metrics-address="127.0.0.1" \
   --metrics-port="$(( $BASE_METRICS_PORT + $NODE_ID ))" \
+  --gossip-slashing-protection=dontcheck \
   ${ADDITIONAL_BEACON_NODE_ARGS} \
   "$@"
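Note: the duty-loop gate above only withholds duties in stop mode; warn mode keeps attesting and merely logs, and the probe epoch is always let through so that a second instance of the same keys has something to observe. A condensed, runnable restatement of the predicate under those assumptions (`shouldWithholdDuties` is a hypothetical name; the patch inlines this in handleValidatorDuties):

  type GossipSlashingProtectionMode = enum
    dontcheck, warn, stop

  # Duties are withheld only in stop mode, only before broadcastStartEpoch,
  # and never during the probe epoch.
  func shouldWithholdDuties(
      epoch, probeEpoch, broadcastStartEpoch: uint64,
      mode: GossipSlashingProtectionMode): bool =
    mode == stop and
      epoch < broadcastStartEpoch and
      epoch != probeEpoch

  doAssert shouldWithholdDuties(100, 101, 102, stop)        # silent epoch
  doAssert not shouldWithholdDuties(101, 101, 102, stop)    # probe epoch
  doAssert not shouldWithholdDuties(100, 101, 102, warn)    # warn never gates

This also explains why the two test harnesses pass --gossip-slashing-protection=dontcheck: on a local testnet every node starts at once, so waiting out the guard epochs would only slow the simulation down.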