mirror of
https://github.com/status-im/nimbus-eth2.git
synced 2025-01-22 12:30:08 +00:00
6b60a774e0
A novel optimisation for attestation and sync committee message validation: when batching, we look for signatures of the same message and aggregate these before batch-validating: this results in up to 60% fewer signature verifications on a busy server, leading to a significant reduction in CPU usage. * increase batch size slightly which helps finding more aggregates * add metrics for batch verification efficiency * use simple `blsVerify` when there is only one signature to verify in the batch, avoiding the RNG
465 lines
18 KiB
Nim
465 lines
18 KiB
Nim
# beacon_chain
|
|
# Copyright (c) 2019-2021 Status Research & Development GmbH
|
|
# Licensed and distributed under either of
|
|
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
|
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
|
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
|
|
|
{.push raises: [Defect].}
|
|
|
|
import
|
|
std/[deques, sequtils],
|
|
metrics,
|
|
# Status
|
|
chronicles, chronos,
|
|
../spec/signatures_batch,
|
|
../consensus_object_pools/[blockchain_dag, spec_cache]
|
|
|
|
export signatures_batch, blockchain_dag
|
|
|
|
logScope:
|
|
topics = "gossip_checks"
|
|
|
|
declareCounter batch_verification_batches,
|
|
"Total number of batches processed"
|
|
declareCounter batch_verification_signatures,
|
|
"Total number of verified signatures before aggregation"
|
|
declareCounter batch_verification_aggregates,
|
|
"Total number of verified signatures after aggregation"
|
|
|
|
# Batched gossip validation
|
|
# ----------------------------------------------------------------
|
|
# Batching in the context of BLS means collecting the signatures of several
|
|
# messages and verifying them all at once - this can be done more efficiently
|
|
# than verifying each message one by one, but the downside is that we get an
|
|
# all-or-nothing response - in case of an invalid signature, we must re-check
|
|
# each message separately.
|
|
#
|
|
# In addition to batching, we also perform lazy aggregation:
|
|
#
|
|
# * batching speeds up the verification of multiple signatures over different
|
|
# messages, by a decent amount
|
|
# * lazy aggregation speeds up the verification of multiple signatures over the
|
|
# same message, by a lot
|
|
#
|
|
# Due to the nature of gossip validation in eth2, it is common for messages
|
|
# to arrive in bursts - because most traffic on the network is valid (honest
|
|
# nodes don't re-broadcast invalid traffic and dishonest nodes quickly get
|
|
# disconnected), valid messages by far make up the bulk of traffic.
|
|
#
|
|
# Further, traffic is divided into topics - on a single topic it will be
|
|
# highly likely that the same message appears over and over again, but with
|
|
# different signatures, as most validators have the same view of the network -
|
|
# at least 2/3 or we're in deep trouble :)
|
|
|
|
type
|
|
BatchResult* {.pure.} = enum
|
|
Invalid # Invalid by default
|
|
Valid
|
|
Timeout
|
|
|
|
Eager = proc(): bool {.gcsafe, raises: [Defect].} ##\
|
|
## Callback that returns true if eager processing should be done to lower
|
|
## latency at the expense of spending more cycles validating things, creating
|
|
## a crude timesharing priority mechanism.
|
|
|
|
BatchItem* = object
|
|
sigset: SignatureSet
|
|
fut: Future[BatchResult]
|
|
|
|
Batch* = object
|
|
created: Moment
|
|
sigsets: seq[SignatureSet]
|
|
items: seq[BatchItem]
|
|
|
|
BatchCrypto* = object
|
|
# Each batch is bounded by BatchedCryptoSize which was chosen:
|
|
# - based on "nimble bench" in nim-blscurve
|
|
# so that low power devices like Raspberry Pi 4 can process
|
|
# that many batched verifications within ~30ms on average
|
|
# - based on the accumulation rate of attestations and aggregates
|
|
# in large instances which were 12000 per slot (12s)
|
|
# hence 1 per ms (but the pattern is bursty around the 4s mark)
|
|
# The number of batches is bounded by time - batch validation is skipped if
|
|
# we can't process them in the time that one slot takes, and we return
|
|
# timeout instead which prevents the gossip layer from forwarding the
|
|
# batch.
|
|
batches: Deque[ref Batch]
|
|
eager: Eager ##\
|
|
## Eager is used to enable eager processing of attestations when it's
|
|
## prudent to do so (instead of leaving the CPU for other, presumably more
|
|
## important work like block processing)
|
|
##
|
|
verifier: BatchVerifier
|
|
|
|
pruneTime: Moment ## last time we had to prune something
|
|
|
|
# `nim-metrics` library is a bit too slow to update on every batch, so
|
|
# we accumulate here instead
|
|
counts: tuple[signatures, batches, aggregates: int64]
|
|
|
|
const
|
|
# We cap waiting for an idle slot in case there's a lot of network traffic
|
|
# taking up all CPU - we don't want to _completely_ stop processing
|
|
# attestations - doing so also allows us to benefit from more batching /
|
|
# larger network reads when under load.
|
|
BatchAttAccumTime = 10.milliseconds
|
|
|
|
# Threshold for immediate trigger of batch verification.
|
|
# A balance between throughput and worst case latency.
|
|
# At least 6 so that the constant factors
|
|
# (RNG for blinding and Final Exponentiation)
|
|
# are amortized, but not too big as we need to redo checks one-by-one if one
|
|
# failed.
|
|
# The current value is based on experiments, where 72 gives an average batch
|
|
# size of ~30 signatures per batch, or 2.5 signatures per aggregate (meaning
|
|
# an average of 12 verifications per batch which on a raspberry should be
|
|
# doable in less than 30ms). In the same experiment, a value of 36 resulted
|
|
# in 17-18 signatures per batch and 1.7-1.9 signatures per aggregate - this
|
|
# node was running on mainnet with `--subscribe-all-subnets` turned on -
|
|
# typical nodes will see smaller batches.
|
|
BatchedCryptoSize = 72
|
|
|
|
proc new*(
|
|
T: type BatchCrypto, rng: ref BrHmacDrbgContext,
|
|
eager: Eager, taskpool: TaskPoolPtr): ref BatchCrypto =
|
|
(ref BatchCrypto)(
|
|
verifier: BatchVerifier(rng: rng, taskpool: taskpool),
|
|
eager: eager,
|
|
pruneTime: Moment.now())
|
|
|
|
func len(batch: Batch): int =
|
|
batch.items.len()
|
|
|
|
func full(batch: Batch): bool =
|
|
batch.len() >= BatchedCryptoSize
|
|
|
|
proc complete(batchItem: var BatchItem, v: BatchResult) =
|
|
batchItem.fut.complete(v)
|
|
batchItem.fut = nil
|
|
|
|
proc complete(batchItem: var BatchItem, ok: bool) =
|
|
batchItem.fut.complete(if ok: BatchResult.Valid else: BatchResult.Invalid)
|
|
batchItem.fut = nil
|
|
|
|
proc skip(batch: var Batch) =
|
|
for res in batch.items.mitems():
|
|
res.complete(BatchResult.Timeout)
|
|
|
|
proc pruneBatchQueue(batchCrypto: ref BatchCrypto) =
|
|
let
|
|
now = Moment.now()
|
|
|
|
# If batches haven't been processed for more than 12 seconds
|
|
while batchCrypto.batches.len() > 0:
|
|
if batchCrypto.batches[0][].created + SECONDS_PER_SLOT.int64.seconds > now:
|
|
break
|
|
if batchCrypto.pruneTime + SECONDS_PER_SLOT.int64.seconds > now:
|
|
notice "Batch queue pruned, skipping attestation validation",
|
|
batches = batchCrypto.batches.len()
|
|
batchCrypto.pruneTime = Moment.now()
|
|
|
|
batchCrypto.batches.popFirst()[].skip()
|
|
|
|
proc combine(a: var Signature, b: Signature) =
|
|
var tmp = AggregateSignature.init(CookedSig(a))
|
|
tmp.aggregate(b)
|
|
a = Signature(tmp.finish())
|
|
|
|
proc combine(a: var PublicKey, b: PublicKey) =
|
|
var tmp = AggregatePublicKey.init(CookedPubKey(a))
|
|
tmp.aggregate(b)
|
|
a = PublicKey(tmp.finish())
|
|
|
|
proc processBatch(batchCrypto: ref BatchCrypto) =
|
|
## Process one batch, if there is any
|
|
|
|
# Pruning the queue here makes sure we catch up with processing if need be
|
|
batchCrypto.pruneBatchQueue() # Skip old batches
|
|
|
|
if batchCrypto[].batches.len() == 0:
|
|
# No more batches left, they might have been eagerly processed or pruned
|
|
return
|
|
|
|
let
|
|
batch = batchCrypto[].batches.popFirst()
|
|
batchSize = batch[].sigsets.len()
|
|
|
|
if batchSize == 0:
|
|
# Nothing to do in this batch, can happen when a batch is created without
|
|
# there being any signatures successfully added to it
|
|
discard
|
|
else:
|
|
trace "batch crypto - starting",
|
|
batchSize
|
|
|
|
let
|
|
startTick = Moment.now()
|
|
ok =
|
|
if batchSize == 1: blsVerify(batch[].sigsets[0])
|
|
else: batchCrypto.verifier.batchVerify(batch[].sigsets)
|
|
|
|
trace "batch crypto - finished",
|
|
batchSize,
|
|
cryptoVerified = ok,
|
|
batchDur = Moment.now() - startTick
|
|
|
|
if ok:
|
|
for res in batch.items.mitems():
|
|
res.complete(BatchResult.Valid)
|
|
else:
|
|
# Batched verification failed meaning that some of the signature checks
|
|
# failed, but we don't know which ones - check each signature separately
|
|
# instead
|
|
debug "batch crypto - failure, falling back",
|
|
items = batch[].items.len()
|
|
|
|
for item in batch[].items.mitems():
|
|
item.complete(blsVerify item.sigset)
|
|
|
|
batchCrypto[].counts.batches += 1
|
|
batchCrypto[].counts.signatures += batch[].items.len()
|
|
batchCrypto[].counts.aggregates += batch[].sigsets.len()
|
|
|
|
if batchCrypto[].counts.batches >= 256:
|
|
# Not too often, so as not to overwhelm our metrics
|
|
batch_verification_batches.inc(batchCrypto[].counts.batches)
|
|
batch_verification_signatures.inc(batchCrypto[].counts.signatures)
|
|
batch_verification_aggregates.inc(batchCrypto[].counts.aggregates)
|
|
|
|
reset(batchCrypto[].counts)
|
|
|
|
proc deferCryptoProcessing(batchCrypto: ref BatchCrypto) {.async.} =
|
|
## Process pending crypto check after some time has passed - the time is
|
|
## chosen such that there's time to fill the batch but not so long that
|
|
## latency across the network is negatively affected
|
|
await sleepAsync(BatchAttAccumTime)
|
|
|
|
# Take the first batch in the queue and process it - if eager processing has
|
|
# stolen it already, that's fine
|
|
batchCrypto.processBatch()
|
|
|
|
proc getBatch(batchCrypto: ref BatchCrypto): (ref Batch, bool) =
|
|
# Get a batch suitable for attestation processing - in particular, attestation
|
|
# batches might be skipped
|
|
batchCrypto.pruneBatchQueue()
|
|
|
|
if batchCrypto.batches.len() == 0 or
|
|
batchCrypto.batches.peekLast[].full():
|
|
# There are no batches in progress - start a new batch and schedule a
|
|
# deferred task to eventually handle it
|
|
let batch = (ref Batch)(created: Moment.now())
|
|
batchCrypto[].batches.addLast(batch)
|
|
(batch, true)
|
|
else:
|
|
let batch = batchCrypto[].batches.peekLast()
|
|
# len will be 0 when the batch was created but nothing added to it
|
|
# because of early failures
|
|
(batch, batch[].len() == 0)
|
|
|
|
proc scheduleBatch(batchCrypto: ref BatchCrypto, fresh: bool) =
|
|
if fresh:
|
|
# Every time we start a new round of batching, we need to launch a deferred
|
|
# task that will compute the result of the batch eventually in case the
|
|
# batch is never filled or eager processing is blocked
|
|
asyncSpawn batchCrypto.deferCryptoProcessing()
|
|
|
|
if batchCrypto.batches.len() > 0 and
|
|
batchCrypto.batches.peekFirst()[].full() and
|
|
batchCrypto.eager():
|
|
# If there's a full batch, process it eagerly assuming the callback allows
|
|
batchCrypto.processBatch()
|
|
|
|
template orReturnErr(v: Option, error: cstring): untyped =
|
|
## Returns with given error string if the option does not have a value
|
|
let tmp = v
|
|
if tmp.isNone:
|
|
return err(error) # this exits the calling scope, as templates are inlined.
|
|
tmp.unsafeGet()
|
|
|
|
template withBatch(
|
|
batchCrypto: ref BatchCrypto, name: cstring,
|
|
body: untyped): Future[BatchResult] =
|
|
block:
|
|
let
|
|
(batch, fresh) = batchCrypto.getBatch()
|
|
|
|
let
|
|
fut = newFuture[BatchResult](name)
|
|
sigset = body
|
|
|
|
var found = false
|
|
# Find existing signature sets with the same message - if we can verify an
|
|
# aggregate instead of several signatures, that is _much_ faster
|
|
for item in batch[].sigsets.mitems():
|
|
if item.message == sigset.message:
|
|
item.signature.combine(sigset.signature)
|
|
item.pubkey.combine(sigset.pubkey)
|
|
found = true
|
|
break
|
|
|
|
if not found:
|
|
batch[].sigsets.add sigset
|
|
|
|
# We need to keep the "original" sigset to allow verifying each signature
|
|
# one by one in the case the combined operation fails
|
|
batch[].items.add(BatchItem(sigset: sigset, fut: fut))
|
|
|
|
batchCrypto.scheduleBatch(fresh)
|
|
fut
|
|
|
|
# See also verify_attestation_signature
|
|
proc scheduleAttestationCheck*(
|
|
batchCrypto: ref BatchCrypto,
|
|
fork: Fork, genesis_validators_root: Eth2Digest,
|
|
attestationData: AttestationData,
|
|
pubkey: CookedPubKey, signature: ValidatorSig
|
|
): Result[tuple[fut: Future[BatchResult], sig: CookedSig], cstring] =
|
|
## Schedule crypto verification of an attestation
|
|
##
|
|
## The buffer is processed:
|
|
## - when eager processing is enabled and the batch is full
|
|
## - otherwise after 10ms (BatchAttAccumTime)
|
|
##
|
|
## This returns an error if crypto sanity checks failed
|
|
## and a future with the deferred attestation check otherwise.
|
|
##
|
|
let
|
|
sig = signature.load().orReturnErr("attestation: cannot load signature")
|
|
fut = batchCrypto.withBatch("batch_validation.scheduleAttestationCheck"):
|
|
attestation_signature_set(
|
|
fork, genesis_validators_root, attestationData, pubkey, sig)
|
|
|
|
ok((fut, sig))
|
|
|
|
proc scheduleAggregateChecks*(
|
|
batchCrypto: ref BatchCrypto,
|
|
fork: Fork, genesis_validators_root: Eth2Digest,
|
|
signedAggregateAndProof: SignedAggregateAndProof,
|
|
epochRef: EpochRef,
|
|
attesting_indices: openArray[ValidatorIndex]
|
|
): Result[tuple[
|
|
aggregatorFut, slotFut, aggregateFut: Future[BatchResult],
|
|
sig: CookedSig], cstring] =
|
|
## Schedule crypto verification of an aggregate
|
|
##
|
|
## This involves 3 checks:
|
|
## - verify_slot_signature
|
|
## - verify_aggregate_and_proof_signature
|
|
## - is_valid_indexed_attestation
|
|
##
|
|
## The buffer is processed:
|
|
## - when eager processing is enabled and the batch is full
|
|
## - otherwise after 10ms (BatchAttAccumTime)
|
|
##
|
|
## This returns None if the signatures could not be loaded.
|
|
## and 3 futures with the deferred aggregate checks otherwise.
|
|
|
|
template aggregate_and_proof: untyped = signedAggregateAndProof.message
|
|
template aggregate: untyped = aggregate_and_proof.aggregate
|
|
|
|
# Do the eager steps first to avoid polluting batches with needlessly
|
|
let
|
|
aggregatorKey =
|
|
epochRef.validatorKey(aggregate_and_proof.aggregator_index).orReturnErr(
|
|
"SignedAggregateAndProof: invalid aggregator index")
|
|
aggregatorSig = signedAggregateAndProof.signature.load().orReturnErr(
|
|
"aggregateAndProof: invalid proof signature")
|
|
slotSig = aggregate_and_proof.selection_proof.load().orReturnErr(
|
|
"aggregateAndProof: invalid selection signature")
|
|
aggregateKey = ? aggregateAll(epochRef.dag, attesting_indices)
|
|
aggregateSig = aggregate.signature.load().orReturnErr(
|
|
"aggregateAndProof: invalid aggregate signature")
|
|
|
|
let
|
|
aggregatorFut = batchCrypto.withBatch("scheduleAggregateChecks.aggregator"):
|
|
aggregate_and_proof_signature_set(
|
|
fork, genesis_validators_root, aggregate_and_proof, aggregatorKey,
|
|
aggregatorSig)
|
|
slotFut = batchCrypto.withBatch("scheduleAggregateChecks.selection_proof"):
|
|
slot_signature_set(
|
|
fork, genesis_validators_root, aggregate.data.slot, aggregatorKey,
|
|
slotSig)
|
|
aggregateFut = batchCrypto.withBatch("scheduleAggregateChecks.aggregate"):
|
|
attestation_signature_set(
|
|
fork, genesis_validators_root, aggregate.data, aggregateKey,
|
|
aggregateSig)
|
|
|
|
ok((aggregatorFut, slotFut, aggregateFut, aggregateSig))
|
|
|
|
proc scheduleSyncCommitteeMessageCheck*(
|
|
batchCrypto: ref BatchCrypto,
|
|
fork: Fork, genesis_validators_root: Eth2Digest,
|
|
slot: Slot, beacon_block_root: Eth2Digest,
|
|
pubkey: CookedPubKey, signature: ValidatorSig
|
|
): Result[tuple[fut: Future[BatchResult], sig: CookedSig], cstring] =
|
|
## Schedule crypto verification of an attestation
|
|
##
|
|
## The buffer is processed:
|
|
## - when eager processing is enabled and the batch is full
|
|
## - otherwise after 10ms (BatchAttAccumTime)
|
|
##
|
|
## This returns an error if crypto sanity checks failed
|
|
## and a future with the deferred attestation check otherwise.
|
|
##
|
|
let
|
|
sig = signature.load().orReturnErr(
|
|
"SyncCommitteMessage: cannot load signature")
|
|
fut = batchCrypto.withBatch("scheduleSyncCommitteeMessageCheck"):
|
|
sync_committee_message_signature_set(
|
|
fork, genesis_validators_root, slot, beacon_block_root, pubkey, sig)
|
|
|
|
ok((fut, sig))
|
|
|
|
proc scheduleContributionChecks*(
|
|
batchCrypto: ref BatchCrypto,
|
|
fork: Fork, genesis_validators_root: Eth2Digest,
|
|
signedContributionAndProof: SignedContributionAndProof,
|
|
subcommitteeIndex: SyncSubcommitteeIndex,
|
|
dag: ChainDAGRef): Result[tuple[
|
|
aggregatorFut, proofFut, contributionFut: Future[BatchResult],
|
|
sig: CookedSig], cstring] =
|
|
## Schedule crypto verification of all signatures in a
|
|
## SignedContributionAndProof message
|
|
##
|
|
## The buffer is processed:
|
|
## - when eager processing is enabled and the batch is full
|
|
## - otherwise after 10ms (BatchAttAccumTime)
|
|
##
|
|
## This returns an error if crypto sanity checks failed
|
|
## and a future with the deferred check otherwise.
|
|
##
|
|
template contribution_and_proof: untyped = signedContributionAndProof.message
|
|
template contribution: untyped = contribution_and_proof.contribution
|
|
|
|
# Do the eager steps first to avoid polluting batches with needlessly
|
|
let
|
|
aggregatorKey =
|
|
dag.validatorKey(contribution_and_proof.aggregator_index).orReturnErr(
|
|
"SignedAggregateAndProof: invalid contributor index")
|
|
aggregatorSig = signedContributionAndProof.signature.load().orReturnErr(
|
|
"SignedContributionAndProof: invalid proof signature")
|
|
proofSig = contribution_and_proof.selection_proof.load().orReturnErr(
|
|
"SignedContributionAndProof: invalid selection signature")
|
|
contributionSig = contribution.signature.load().orReturnErr(
|
|
"SignedContributionAndProof: invalid contribution signature")
|
|
|
|
contributionKey = ? aggregateAll(
|
|
dag, dag.syncCommitteeParticipants(contribution.slot, subcommitteeIndex),
|
|
contribution.aggregation_bits)
|
|
let
|
|
aggregatorFut = batchCrypto.withBatch("scheduleContributionAndProofChecks.aggregator"):
|
|
contribution_and_proof_signature_set(
|
|
fork, genesis_validators_root, contribution_and_proof, aggregatorKey,
|
|
aggregatorSig)
|
|
proofFut = batchCrypto.withBatch("scheduleContributionAndProofChecks.selection_proof"):
|
|
sync_committee_selection_proof_set(
|
|
fork, genesis_validators_root, contribution.slot,
|
|
contribution.subcommittee_index, aggregatorKey, proofSig)
|
|
contributionFut = batchCrypto.withBatch("scheduleContributionAndProofChecks.contribution"):
|
|
sync_committee_message_signature_set(
|
|
fork, genesis_validators_root, contribution.slot,
|
|
contribution.beacon_block_root, contributionKey, contributionSig)
|
|
|
|
ok((aggregatorFut, proofFut, contributionFut, contributionSig))
|