pre-emptive duplicate validator detection heuristic

2020-10-27 18:21:35 +01:00 · 2020-10-27 18:21:35 +01:00 · a16f5afcd5
parent 55ecb61c3a
commit a16f5afcd5
8 changed files with 132 additions and 9 deletions
--- a/beacon_chain/attestation_pool.nim
+++ b/beacon_chain/attestation_pool.nim
@ -135,18 +135,18 @@ proc updateCurrent(pool: var AttestationPool, wallSlot: Slot) =

 func addToAggregates(pool: var AttestationPool, attestation: Attestation) =
  # do a lookup for the current slot and get it's associated htrs/attestations
-  var aggreated_attestation = pool.attestationAggregates.mgetOrPut(
+  var aggregated_attestation = pool.attestationAggregates.mgetOrPut(
    attestation.data.slot, Table[Eth2Digest, Attestation]()).
    # do a lookup for the same attestation data htr and get the attestation
    mgetOrPut(attestation.data.hash_tree_root, attestation)
  # if the aggregation bits differ (we didn't just insert it into the table)
  # and only if there is no overlap of the signatures ==> aggregate!
-  if not aggreated_attestation.aggregation_bits.overlaps(attestation.aggregation_bits):
+  if not aggregated_attestation.aggregation_bits.overlaps(attestation.aggregation_bits):
    var agg {.noInit.}: AggregateSignature
-    agg.init(aggreated_attestation.signature)
-    aggreated_attestation.aggregation_bits.combine(attestation.aggregation_bits)
+    agg.init(aggregated_attestation.signature)
+    aggregated_attestation.aggregation_bits.combine(attestation.aggregation_bits)
    agg.aggregate(attestation.signature)
-    aggreated_attestation.signature = agg.finish()
+    aggregated_attestation.signature = agg.finish()

 proc addAttestation*(pool: var AttestationPool,
                     attestation: Attestation,
--- a/beacon_chain/conf.nim
+++ b/beacon_chain/conf.nim
@ -51,6 +51,11 @@ type
    enabled = "Always enabled"
    disabled = "Always disabled"

+  GossipSlashingProtectionMode* {.pure.} = enum
+    dontcheck
+    warn
+    stop
+
  BeaconNodeConf* = object
    logLevel* {.
      defaultValue: "INFO"
@ -255,6 +260,12 @@ type
        desc: "Write SSZ dumps of blocks, attestations and states to data dir"
        name: "dump" }: bool

+      gossipSlashingProtection* {.
+        defaultValue: GossipSlashingProtectionMode.warn
+        desc: "[=warn*|stop] What to do when another validator is detected to be running the same validator keys (default `warn`, will become `stop` in the future)"
+        name: "gossip-slashing-protection"
+      }: GossipSlashingProtectionMode
+
    of createTestnet:
      testnetDepositsFile* {.
        desc: "A LaunchPad deposits file for the genesis state validators"
--- a/beacon_chain/eth2_processor.nim
+++ b/beacon_chain/eth2_processor.nim
@ -13,7 +13,7 @@ import
  chronicles, chronos, metrics,
  ./spec/[crypto, datatypes, digest],
  ./block_pools/[clearance, chain_dag],
-  ./attestation_aggregation, ./exit_pool,
+  ./attestation_aggregation, ./exit_pool, ./validator_pool,
  ./beacon_node_types, ./attestation_pool,
  ./time, ./conf, ./sszdump

@ -31,6 +31,9 @@ declareCounter beacon_proposer_slashings_received,
 declareCounter beacon_voluntary_exits_received,
  "Number of beacon chain voluntary exits received by this peer"

+declareCounter beacon_duplicate_validator_protection_activated,
+  "Number of times duplicate validator protection was activated"
+
 const delayBuckets = [2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, Inf]

 declareHistogram beacon_attestation_delay,
@ -67,6 +70,7 @@ type
    chainDag*: ChainDAGRef
    attestationPool*: ref AttestationPool
    exitPool: ref ExitPool
+    validatorPool: ref ValidatorPool
    quarantine*: QuarantineRef
    blockReceivedDuringSlot*: Future[void]

@ -74,6 +78,8 @@ type
    attestationsQueue*: AsyncQueue[AttestationEntry]
    aggregatesQueue*: AsyncQueue[AggregateEntry]

+    gossipSlashingProtection*: DupProtection
+
 proc updateHead*(self: var Eth2Processor, wallSlot: Slot) =
  ## Trigger fork choice and returns the new head block.
  ## Can return `nil`
@ -298,6 +304,42 @@ proc blockValidator*(

 {.push raises: [Defect].}

+proc checkForPotentialSelfSlashing(
+    self: var Eth2Processor, attestationData: AttestationData,
+    attesterIndices: HashSet[ValidatorIndex], wallSlot: Slot) =
+  # Attestations remain valid for 32 slots, so avoid confusing with one's own
+  # reflections, for a ATTESTATION_PROPAGATION_SLOT_RANGE div SLOTS_PER_EPOCH
+  # period after the attestation slot. For mainnet this can be one additional
+  # epoch, and for minimal, four epochs. Unlike in the attestation validation
+  # checks, use the spec version of the constant here.
+  const
+    # https://github.com/ethereum/eth2.0-specs/blob/v1.0.0/specs/phase0/p2p-interface.md#configuration
+    ATTESTATION_PROPAGATION_SLOT_RANGE = 32
+
+    GUARD_EPOCHS = ATTESTATION_PROPAGATION_SLOT_RANGE div SLOTS_PER_EPOCH
+
+  # If gossipSlashingProtection not dontcheck or stop, it's the default "warn".
+  let epoch = wallSlot.epoch
+  if  epoch < self.gossipSlashingProtection.broadcastStartEpoch and
+      epoch >= self.gossipSlashingProtection.probeEpoch and
+      epoch <= self.gossipSlashingProtection.probeEpoch + GUARD_EPOCHS:
+    let tgtBlck = self.chainDag.getRef(attestationData.target.root)
+    doAssert not tgtBlck.isNil  # because attestation is valid above
+
+    let epochRef = self.chainDag.getEpochRef(
+      tgtBlck, attestationData.target.epoch)
+    for validatorIndex in attesterIndices:
+      let validatorPubkey = epochRef.validator_keys[validatorIndex]
+      if self.validatorPool[].getValidator(validatorPubkey) !=
+          default(AttachedValidator):
+        warn "Duplicate validator detected; would be slashed",
+          validatorIndex,
+          validatorPubkey
+        beacon_duplicate_validator_protection_activated.inc()
+        if self.config.gossipSlashingProtection == GossipSlashingProtectionMode.stop:
+          warn "We believe you are currently running another instance of the same validator. We've disconnected you from the network as this presents a significant slashing risk. Possible next steps are (a) making sure you've disconnected your validator from your old machine before restarting the client; and (b) running the client again with the gossip-slashing-protection option disabled, only if you are absolutely sure this is the only instance of your validator running, and reporting the issue at https://github.com/status-im/nimbus-eth2/issues."
+          quit QuitFailure
+
 proc attestationValidator*(
    self: var Eth2Processor,
    attestation: Attestation,
@ -329,6 +371,8 @@ proc attestationValidator*(
  beacon_attestations_received.inc()
  beacon_attestation_delay.observe(delay.toFloatSeconds())

+  self.checkForPotentialSelfSlashing(attestation.data, v.value, wallSlot)
+
  while self.attestationsQueue.full():
    try:
      notice "Queue full, dropping attestation",
@ -381,6 +425,9 @@ proc aggregateValidator*(
  beacon_aggregates_received.inc()
  beacon_aggregate_delay.observe(delay.toFloatSeconds())

+  self.checkForPotentialSelfSlashing(
+    signedAggregateAndProof.message.aggregate.data, v.value, wallSlot)
+
  while self.aggregatesQueue.full():
    try:
      notice "Queue full, dropping aggregate",
@ -500,6 +547,7 @@ proc new*(T: type Eth2Processor,
          chainDag: ChainDAGRef,
          attestationPool: ref AttestationPool,
          exitPool: ref ExitPool,
+          validatorPool: ref ValidatorPool,
          quarantine: QuarantineRef,
          getWallTime: GetWallTimeFn): ref Eth2Processor =
  (ref Eth2Processor)(
@ -508,6 +556,7 @@ proc new*(T: type Eth2Processor,
    chainDag: chainDag,
    attestationPool: attestationPool,
    exitPool: exitPool,
+    validatorPool: validatorPool,
    quarantine: quarantine,
    blockReceivedDuringSlot: newFuture[void](),
    blocksQueue: newAsyncQueue[BlockEntry](1),
--- a/beacon_chain/nimbus_beacon_node.nim
+++ b/beacon_chain/nimbus_beacon_node.nim
@ -322,7 +322,8 @@ proc init*(T: type BeaconNode,
  proc getWallTime(): BeaconTime = res.beaconClock.now()

  res.processor = Eth2Processor.new(
-    conf, chainDag, attestationPool, exitPool, quarantine, getWallTime)
+    conf, chainDag, attestationPool, exitPool, newClone(res.attachedValidators),
+    quarantine, getWallTime)

  res.requestManager = RequestManager.init(
    network, res.processor.blocksQueue)
@ -627,6 +628,45 @@ proc removeMessageHandlers(node: BeaconNode) =
  for subnet in 0'u64 ..< ATTESTATION_SUBNET_COUNT:
    node.network.unsubscribe(getAttestationTopic(node.forkDigest, subnet))

+proc setupSelfSlashingProtection(node: BeaconNode, slot: Slot) =
+  # When another client's already running, this is very likely to detect
+  # potential duplicate validators, which can trigger slashing. Assuming
+  # the most pessimal case of two validators started simultaneously, the
+  # probability of triggering a slashable condition is up to 1/n, with n
+  # being the number of epochs one waits before proposing or attesting.
+  #
+  # Every missed attestation costs approximately 3*get_base_reward(), which
+  # can be up to around 10,000 Wei. Thus, skipping attestations isn't cheap
+  # and one should gauge the likelihood of this simultaneous launch to tune
+  # the epoch delay to one's perceived risk.
+  #
+  # This approach catches both startup and network outage conditions.
+
+  const duplicateValidatorEpochs = 2
+
+  node.processor.gossipSlashingProtection.broadcastStartEpoch =
+    slot.epoch + duplicateValidatorEpochs
+  # randomize() already called; also, never probe on first epoch in guard
+  # period, so that existing, running validators can be picked up. Whilst
+  # this reduces entropy for overlapping-start cases, and increases their
+  # collision likelihood, that can be compensated for by increasing guard
+  # epoch periods by 1. As a corollary, 1 guard epoch won't detect when a
+  # duplicate pair overlaps exactly, only the running/starting case. Even
+  # 2 epochs is dangerous because it'll guarantee colliding probes in the
+  # overlapping case.
+
+  # So dPE == 2 -> epoch + 1, always; dPE == 3 -> epoch + (1 or 2), etc.
+  node.processor.gossipSlashingProtection.probeEpoch =
+    slot.epoch + 1 + rand(duplicateValidatorEpochs.int - 2).uint64
+  doAssert node.processor.gossipSlashingProtection.probeEpoch <
+    node.processor.gossipSlashingProtection.broadcastStartEpoch
+
+  debug "Setting up self-slashing protection",
+    epoch = slot.epoch,
+    probeEpoch = node.processor.gossipSlashingProtection.probeEpoch,
+    broadcastStartEpoch =
+      node.processor.gossipSlashingProtection.broadcastStartEpoch
+
 proc updateGossipStatus(node: BeaconNode, slot: Slot) =
  # Syncing tends to be ~1 block/s, and allow for an epoch of time for libp2p
  # subscribing to spin up. The faster the sync, the more wallSlot - headSlot
@ -660,6 +700,7 @@ proc updateGossipStatus(node: BeaconNode, slot: Slot) =
      headSlot = node.chainDag.head.slot,
      syncQueueLen

+    node.setupSelfSlashingProtection(slot)
    node.addMessageHandlers()
    doAssert node.getTopicSubscriptionEnabled()
  elif
@ -978,6 +1019,7 @@ proc run*(node: BeaconNode) =
    node.startSyncManager()

    if not node.beaconClock.now().toSlot().afterGenesis:
+      node.setupSelfSlashingProtection(curSlot)
      node.addMessageHandlers()
      doAssert node.getTopicSubscriptionEnabled()

--- a/beacon_chain/spec/datatypes.nim
+++ b/beacon_chain/spec/datatypes.nim
@ -534,6 +534,10 @@ type
    current_justified_checkpoint*: Checkpoint
    finalized_checkpoint*: Checkpoint

+  DupProtection* = object
+    broadcastStartEpoch*: Epoch
+    probeEpoch*: Epoch
+
 func shortValidatorKey*(state: BeaconState, validatorIdx: int): string =
  ($state.validators[validatorIdx].pubkey)[0..7]

--- a/beacon_chain/validator_duties.nim
+++ b/beacon_chain/validator_duties.nim
@ -615,6 +615,21 @@ proc handleValidatorDuties*(node: BeaconNode, lastSlot, slot: Slot) {.async.} =

  var curSlot = lastSlot + 1

+  # The dontcheck option's a deliberately undocumented escape hatch for the
+  # local testnets and similar development and testing use cases.
+  doAssert node.config.gossipSlashingProtection == GossipSlashingProtectionMode.dontcheck or (
+    node.processor[].gossipSlashingProtection.probeEpoch <
+    node.processor[].gossipSlashingProtection.broadcastStartEpoch)
+  if curSlot.epoch <
+        node.processor[].gossipSlashingProtection.broadcastStartEpoch and
+      curSlot.epoch != node.processor[].gossipSlashingProtection.probeEpoch and
+      node.config.gossipSlashingProtection == GossipSlashingProtectionMode.stop:
+    notice "Waiting to gossip out to detect potential duplicate validators",
+      broadcastStartEpoch =
+        node.processor[].gossipSlashingProtection.broadcastStartEpoch,
+      probeEpoch = node.processor[].gossipSlashingProtection.probeEpoch
+    return
+
  # Start by checking if there's work we should have done in the past that we
  # can still meaningfully do
  while curSlot < slot:
--- a/scripts/launch_local_testnet.sh
+++ b/scripts/launch_local_testnet.sh
@ -385,6 +385,7 @@ for NUM_NODE in $(seq 0 $(( NUM_NODES - 1 ))); do
    --metrics \
    --metrics-address="127.0.0.1" \
    --metrics-port="$(( BASE_METRICS_PORT + NUM_NODE ))" \
+    --gossip-slashing-protection=dontcheck \
    ${EXTRA_ARGS} \
    > "${DATA_DIR}/log${NUM_NODE}.txt" 2>&1 &

--- a/tests/simulation/run_node.sh
+++ b/tests/simulation/run_node.sh
@ -9,7 +9,7 @@ shift
 # shellcheck source=/dev/null
 source "$(dirname "$0")/vars.sh"

-if [[ ! -z "$1" ]]; then
+if [[ -n "$1" ]]; then
  ADDITIONAL_BEACON_NODE_ARGS=$1
  shift
 else
@ -18,7 +18,7 @@ fi

 BOOTSTRAP_ARG=""

-if [[ ! -z "$1" ]]; then
+if [[ -n "$1" ]]; then
  BOOTSTRAP_NODE_ID=$1
  shift
 else
@ -105,5 +105,6 @@ $BEACON_NODE_BIN \
  --metrics \
  --metrics-address="127.0.0.1" \
  --metrics-port="$(( $BASE_METRICS_PORT + $NODE_ID ))" \
+  --gossip-slashing-protection=dontcheck \
  ${ADDITIONAL_BEACON_NODE_ARGS} \
  "$@"