nimbus-eth2/beacon_chain/sync/sync_queue.nim

# beacon_chain
# Copyright (c) 2018-2022 Status Research & Development GmbH
# Licensed and distributed under either of
#   * MIT license (license terms in the root directory or at https://opensource.org/licenses/MIT).
#   * Apache v2 license (license terms in the root directory or at https://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.

{.push raises: [Defect].}

import std/[options, heapqueue, tables, strutils, sequtils, math, algorithm]
import stew/[results, base10], chronos, chronicles
import
  ../spec/datatypes/[base, phase0, altair],
  ../spec/[helpers, forks],
  ../networking/[peer_pool, eth2_network],
  ../gossip_processing/block_processor,
  ../consensus_object_pools/block_pools_types

export base, phase0, altair, merge, chronos, chronicles, results,
       block_pools_types, helpers

logScope:
  topics = "syncqueue"

type
  # Callback returning a slot; stored in ``SyncQueue.getSafeSlot``.
  GetSlotCallback* = proc(): Slot {.gcsafe, raises: [Defect].}
  # Callback without arguments; ``push`` invokes it (when not nil) before it
  # starts verifying the blocks of a ready response.
  ProcessingCallback* = proc() {.gcsafe, raises: [Defect].}
  # Asynchronous verifier invoked by ``push`` for every block of a response.
  BlockVerifier* =
    proc(signedBlock: ForkedSignedBeaconBlock):
      Future[Result[void, BlockError]] {.gcsafe, raises: [Defect].}

  # Direction of a queue and of the requests it produces.
  SyncQueueKind* {.pure.} = enum
    Forward, Backward

  # Request for a range of slots.
  SyncRequest*[T] = object
    kind*: SyncQueueKind
    # Assigned from the queue's counter by ``makePending``; key in
    # ``SyncQueue.pending``.
    index*: uint64
    # First (lowest) slot of the requested range.
    slot*: Slot
    # Number of slots requested.
    count*: uint64
    # Distance between requested slots; the constructors in this file use 1,
    # and ``step == 0`` together with ``count == 0`` marks an empty request.
    step*: uint64
    # Value attached to the request; logged under the ``peer`` key.
    item*: T

  # Request paired with the blocks received for it.
  SyncResult*[T] = object
    request*: SyncRequest[T]
    data*: seq[ref ForkedSignedBeaconBlock]

  # Entry of ``SyncQueue.waiters``.
  SyncWaiter* = ref object
    # Completed by ``wakeupWaiters``.
    future: Future[void]
    # Set to ``true`` when the wake-up was caused by a queue reset.
    reset: bool

  # Last rewind performed by ``getRewindPoint`` for a forward queue.
  RewindPoint = object
    # Slot at which the failure that caused the rewind happened.
    failSlot: Slot
    # Number of epochs the queue was rewound by.
    epochCount: uint64

  SyncQueue*[T] = ref object
    kind*: SyncQueueKind
    # Moved by ``advanceInput``; initialised to the queue's start slot.
    inpSlot*: Slot
    # Moved by ``advanceOutput`` after a response has been processed;
    # initialised to the queue's start slot.
    outSlot*: Slot
    startSlot*: Slot
    # Updated through ``updateLastSlot``.
    finalSlot*: Slot
    # Maximum number of slots in one request (see ``SyncQueue.init``).
    chunkSize*: uint64
    # Values ``<= 0`` disable the backpressure check in ``notInRange``.
    queueSize*: int
    # Source of ``SyncRequest.index`` values, incremented by ``makePending``.
    counter*: uint64
    # Requests handed out and not yet pushed back, keyed by request index.
    pending*: Table[uint64, SyncRequest[T]]
    # Callers currently blocked in ``waitForChanges``.
    waiters: seq[SyncWaiter]
    getSafeSlot*: GetSlotCallback
    # Requests which failed and have to be handed out again.
    debtsQueue: HeapQueue[SyncRequest[T]]
    # Sum of ``count`` over all requests stored in ``debtsQueue``.
    debtsCount: uint64
    # Responses accepted by ``push`` and waiting to be processed in order.
    readyQueue: HeapQueue[SyncResult[T]]
    # Last rewind point, if any (see ``getRewindPoint``).
    rewind: Option[RewindPoint]
    blockVerifier: BlockVerifier
    # Logged as ``sync_ident``.
    ident*: string

  SyncManagerError* = object of CatchableError
  BeaconBlocksRes* = NetRes[seq[ref ForkedSignedBeaconBlock]]

# Log ``SyncQueueKind`` values as their lowercased enum names.
chronicles.formatIt SyncQueueKind: toLowerAscii($it)

template shortLog*[T](req: SyncRequest[T]): string =
  ## Renders ``req`` as ``<slot>:<count>@<index>`` for log output.
  Base10.toString(uint64(req.slot)) & ":" & Base10.toString(req.count) &
    "@" & Base10.toString(req.index)

# A logged ``SyncRequest`` is expanded into three properties: the short form of
# the request under the original property name, the short form of the attached
# item under ``peer`` and the lowercased queue direction under ``direction``.
chronicles.expandIt SyncRequest:
  `it` = shortLog(it)
  peer = shortLog(it.item)
  direction = toLowerAscii($it.kind)

proc getShortMap*[T](req: SyncRequest[T],
                     data: openArray[ref ForkedSignedBeaconBlock]): string =
  ## Returns a placement map of ``data`` over the slots covered by ``req``.
  ##
  ## The result always holds exactly ``req.count`` characters, one for every
  ## requested slot (``req.slot``, ``req.slot + req.step``, ...): ``x`` when
  ## ``data`` holds a block for that slot and ``.`` otherwise.
  var res = newStringOfCap(req.count)
  var slider = req.slot
  # Index of the first entry in ``data`` which has not been matched yet.
  var last = 0
  for i in 0 ..< req.count:
    # Every requested slot gets exactly one marker. The slot is considered
    # empty unless a matching block is found below, this also covers the case
    # when all the remaining blocks are positioned before ``slider``.
    var mark = '.'
    for k in last ..< len(data):
      let blockSlot = data[k][].slot
      if slider == blockSlot:
        mark = 'x'
        last = k + 1
        break
      elif slider < blockSlot:
        # Next block belongs to one of the following slots.
        break
    res.add(mark)
    slider = slider + req.step
  res

proc contains*[T](req: SyncRequest[T], slot: Slot): bool {.inline.} =
  ## Returns ``true`` if ``slot`` is one of the slots requested by ``req``,
  ## i.e. it lies inside of the request's range and on the request's step grid.
  if slot < req.slot:
    return false
  if slot >= req.slot + req.count * req.step:
    return false
  (slot - req.slot) mod req.step == 0

proc cmp*[T](a, b: SyncRequest[T]): int =
  ## Three-way comparison of requests ``a`` and ``b`` by their starting slots.
  cmp(a.slot.uint64, b.slot.uint64)

proc checkResponse*[T](req: SyncRequest[T],
                       data: openArray[ref ForkedSignedBeaconBlock]): bool =
  ## Returns ``true`` if ``data`` could be a response to request ``req``:
  ## it must not hold more blocks than requested and every block must sit,
  ## in order, on one of the requested slots. An empty response is accepted.
  let total = len(data)
  if total == 0:
    # There is nothing to verify in an empty response.
    return true

  if uint64(total) > req.count:
    # Response must not hold more blocks than were requested.
    return false

  var
    expected = req.slot
    steps = 0'u64
    matched = 0

  while (steps < req.count) and (matched < total):
    let blockSlot = data[matched][].slot
    if expected == blockSlot:
      inc(matched)
    elif not(expected < blockSlot):
      # Block is positioned before the slot we are currently expecting.
      return false
    expected = expected + req.step
    steps = steps + 1'u64

  # Every block of the response must have found its slot.
  matched == total

proc getFullMap*[T](req: SyncRequest[T],
                    data: openArray[ForkedSignedBeaconBlock]): string =
  ## Returns the slot numbers of all blocks in ``data`` as a comma-delimited
  ## string.
  let slots = mapIt(data, $it.message.slot)
  slots.join(", ")

proc init[T](t1: typedesc[SyncRequest], kind: SyncQueueKind, start: Slot,
             finish: Slot, t2: typedesc[T]): SyncRequest[T] =
  ## Creates a request of ``kind`` for the slots ``start .. finish`` (both
  ## inclusive) with step ``1`` and a default-initialized item.
  SyncRequest[T](kind: kind, slot: start, step: 1'u64,
                 count: finish - start + 1'u64)

proc init[T](t1: typedesc[SyncRequest], kind: SyncQueueKind, slot: Slot,
             count: uint64, item: T): SyncRequest[T] =
  ## Creates a request of ``kind`` for ``count`` slots starting at ``slot``
  ## with step ``1`` and ``item`` attached to it.
  SyncRequest[T](
    kind: kind,
    slot: slot,
    count: count,
    step: 1'u64,
    item: item
  )

proc init[T](t1: typedesc[SyncRequest], kind: SyncQueueKind, start: Slot,
             finish: Slot, item: T): SyncRequest[T] =
  ## Creates a request of ``kind`` for the slots ``start .. finish`` (both
  ## inclusive) with step ``1`` and ``item`` attached to it.
  SyncRequest[T](kind: kind, slot: start, item: item, step: 1'u64,
                 count: finish - start + 1'u64)

proc empty*[T](t: typedesc[SyncRequest], kind: SyncQueueKind,
               t2: typedesc[T]): SyncRequest[T] {.inline.} =
  ## Creates an empty request of ``kind``: both ``count`` and ``step`` are
  ## zero, which is exactly what ``isEmpty`` checks for.
  SyncRequest[T](kind: kind, count: 0'u64, step: 0'u64)

proc setItem*[T](sr: var SyncRequest[T], item: T) =
  ## Replaces the item attached to request ``sr`` with ``item``.
  sr.item = item

proc isEmpty*[T](sr: SyncRequest[T]): bool {.inline.} =
  ## Returns ``true`` if ``sr`` is an empty request, i.e. both its ``count``
  ## and its ``step`` are zero.
  sr.count == 0'u64 and sr.step == 0'u64

proc init*[T](t1: typedesc[SyncQueue], t2: typedesc[T],
              queueKind: SyncQueueKind,
              start, final: Slot, chunkSize: uint64,
              getSafeSlotCb: GetSlotCallback,
              blockVerifier: BlockVerifier,
              syncQueueSize: int = -1,
              ident: string = "main"): SyncQueue[T] =
  ## Create new synchronization queue with parameters
  ##
  ## ``start`` and ``final`` are starting and finishing Slots.
  ##
  ## ``chunkSize`` maximum number of slots in one request, must not be zero.
  ##
  ## ``syncQueueSize`` maximum queue size for incoming data.
  ## If ``syncQueueSize > 0`` queue will help to keep backpressure under
  ## control. If ``syncQueueSize <= 0`` then queue size is unlimited (default).
  ##
  ## ``ident`` name of the queue, it is attached to log messages.

  # SyncQueue is the core of the sync manager, this data structure distributes
  # requests to peers and manages responses from peers.
  #
  # Because SyncQueue is an async data structure it manages backpressure and
  # the order of incoming responses and it also resolves the "joker's" problem.
  #
  # Joker's problem
  #
  # According to the current Ethereum2 network specification
  # > Clients MUST respond with at least one block, if they have it and it
  # > exists in the range. Clients MAY limit the number of blocks in the
  # > response.
  #
  # Such a rule can lead to very uncertain responses, for example let slots
  # from 10 to 12 be non-empty. A client which follows the specification can
  # answer with any response from this list (X - block, `-` empty space):
  #
  # 1.   X X X
  # 2.   - - X
  # 3.   - X -
  # 4.   - X X
  # 5.   X - -
  # 6.   X - X
  # 7.   X X -
  #
  # If a peer answers with `1` everything will be fine and `block_pool` will be
  # able to process all 3 blocks. In case of `2`, `3`, `4`, `6` - `block_pool`
  # will fail immediately on the chunk and report a "parent is missing" error.
  # But in case of `5` and `7` blocks will be processed by `block_pool` without
  # any problems, however it will start producing problems right from this
  # uncertain last slot. SyncQueue will start producing requests for the next
  # blocks, but all the responses from this point will fail with a "parent is
  # missing" error. Let's call such peers "jokers", because they are joking
  # with responses.
  #
  # To fix the "joker" problem we are going to perform a rollback to the latest
  # finalized epoch's first slot.
  doAssert(chunkSize > 0'u64, "Chunk size should not be zero")
  SyncQueue[T](
    kind: queueKind,
    startSlot: start,
    finalSlot: final,
    chunkSize: chunkSize,
    queueSize: syncQueueSize,
    getSafeSlot: getSafeSlotCb,
    waiters: newSeq[SyncWaiter](),
    counter: 1'u64,
    pending: initTable[uint64, SyncRequest[T]](),
    debtsQueue: initHeapQueue[SyncRequest[T]](),
    inpSlot: start,
    outSlot: start,
    blockVerifier: blockVerifier,
    ident: ident
  )

proc `<`*[T](a, b: SyncRequest[T]): bool =
  ## Ordering of requests of the same kind: forward requests are ordered by
  ## ascending starting slot, backward requests by descending starting slot.
  doAssert(a.kind == b.kind)
  if a.kind == SyncQueueKind.Forward:
    a.slot < b.slot
  else:
    a.slot > b.slot

proc `<`*[T](a, b: SyncResult[T]): bool =
  ## Ordering of results whose requests are of the same kind: forward results
  ## are ordered by ascending starting slot of the request, backward results
  ## by descending starting slot of the request.
  doAssert(a.request.kind == b.request.kind)
  if a.request.kind == SyncQueueKind.Forward:
    a.request.slot < b.request.slot
  else:
    a.request.slot > b.request.slot

proc `==`*[T](a, b: SyncRequest[T]): bool =
  ## Requests are equal when kind, starting slot, count and step are equal;
  ## ``index`` and ``item`` do not take part in the comparison.
  if a.kind != b.kind or a.slot != b.slot:
    return false
  a.count == b.count and a.step == b.step

proc lastSlot*[T](req: SyncRequest[T]): Slot =
  ## Returns the last slot covered by request ``req``, computed as
  ## ``slot + count - 1`` (the request's ``step`` is not taken into account).
  req.slot + (req.count - 1'u64)

proc makePending*[T](sq: SyncQueue[T], req: var SyncRequest[T]) =
  ## Assigns the next free index of queue ``sq`` to ``req`` and registers the
  ## request in the queue's pending table under this index.
  let index = sq.counter
  sq.counter = index + 1'u64
  req.index = index
  sq.pending[index] = req

proc updateLastSlot*[T](sq: SyncQueue[T], last: Slot) {.inline.} =
  ## Replaces the final slot of queue ``sq`` with ``last``.
  ##
  ## The final slot is only allowed to move in the direction of the queue:
  ## upwards for a forward queue and downwards for a backward queue, any other
  ## value fails the assertion.
  case sq.kind
  of SyncQueueKind.Forward:
    doAssert(sq.finalSlot <= last,
             "Last slot could not be lower then stored one " &
             $sq.finalSlot & " <= " & $last)
  of SyncQueueKind.Backward:
    doAssert(sq.finalSlot >= last,
             "Last slot could not be higher then stored one " &
             $sq.finalSlot & " >= " & $last)
  sq.finalSlot = last

proc wakeupWaiters[T](sq: SyncQueue[T], reset = false) =
  ## Wakes up all the waiters blocked on queue ``sq``.
  ##
  ## If ``reset`` is ``true`` every waiter is additionally flagged, so that it
  ## is able to find out that the wake-up was caused by a queue reset.
  for waiter in sq.waiters:
    if reset:
      waiter.reset = true
    let alreadyDone = waiter.future.finished()
    if not(alreadyDone):
      waiter.future.complete()

proc waitForChanges[T](sq: SyncQueue[T]): Future[bool] {.async.} =
  ## Registers a new waiter in queue ``sq`` and blocks until it gets completed
  ## by `wakeupWaiters()`. Returns the waiter's ``reset`` flag.
  let waiter = SyncWaiter(future: newFuture[void]("SyncQueue.waitForChanges"))
  sq.waiters.add(waiter)
  try:
    await waiter.future
    return waiter.reset
  finally:
    # The waiter has to leave the queue in any case.
    let position = sq.waiters.find(waiter)
    sq.waiters.delete(position)

proc wakeupAndWaitWaiters[T](sq: SyncQueue[T]) {.async.} =
  ## Appends its own waiter to the queue, performs `wakeupWaiters(true)` and
  ## blocks until this last waiter has been awakened.
  let ownWaiter = sq.waitForChanges()
  sq.wakeupWaiters(true)
  discard await ownWaiter

proc clearAndWakeup*[T](sq: SyncQueue[T]) =
  ## Drops all pending requests of queue ``sq`` and wakes up all the blocked
  ## waiters with the ``reset`` flag set.
  sq.pending.clear()
  sq.wakeupWaiters(true)

proc resetWait*[T](sq: SyncQueue[T], toSlot: Option[Slot]) {.async.} =
  ## Resets queue ``sq`` and all the waiters blocked on it.
  ##
  ## One more waiter gets appended to the sequence of waiters and
  ## wakeupWaiters(true) is called. Because this waiter is the last one in the
  ## sequence it is resumed only after all the other waiters were awakened,
  ## and this is the moment the procedure completes.

  # Pending list has to be cleared first (before any await), so all requests
  # which are still in flight (downloading, but not yet pushed to the
  # SyncQueue) become expired. Doing it later introduces a race.
  sq.pending.clear()

  # New position of the queue. Without `toSlot` the queue restarts from
  # `outSlot`, the position up to which slots were already added to
  # `block_pool`. With `toSlot` a forward queue restarts from the lowest of
  # `toSlot` and `outSlot`, while a backward queue restarts from `toSlot`
  # itself. `debtsQueue` holds only requests for slots which were not added
  # yet, so it does not have to be taken into account.
  let minSlot =
    if toSlot.isNone():
      sq.outSlot
    else:
      case sq.kind
      of SyncQueueKind.Forward:
        min(toSlot.get(), sq.outSlot)
      of SyncQueueKind.Backward:
        toSlot.get()

  sq.debtsQueue.clear()
  sq.debtsCount = 0
  sq.readyQueue.clear()
  sq.inpSlot = minSlot
  sq.outSlot = minSlot

  # Wake up all the waiters and wait for the last one.
  await sq.wakeupAndWaitWaiters()

proc isEmpty*[T](sr: SyncResult[T]): bool {.inline.} =
  ## Returns ``true`` if result ``sr`` holds no blocks at all, i.e. all the
  ## requested slots were answered as empty.
  sr.data.len == 0

proc hasEndGap*[T](sr: SyncResult[T]): bool {.inline.} =
  ## Returns ``true`` if the last slot of the request was not answered with a
  ## block, an empty response is considered to have a gap too.
  if len(sr.data) == 0:
    return true
  let expectedLast = sr.request.slot + sr.request.count - 1'u64
  sr.data[^1][].slot != expectedLast

proc getLastNonEmptySlot*[T](sr: SyncResult[T]): Slot {.inline.} =
  ## Returns the slot of the last block stored in result ``sr``. For a result
  ## without blocks the starting slot of the original request is returned.
  if len(sr.data) > 0:
    sr.data[^1][].slot
  else:
    sr.request.slot

proc toDebtsQueue[T](sq: SyncQueue[T], sr: SyncRequest[T]) =
  ## Stores request ``sr`` in the debts queue of ``sq`` and accounts its slots
  ## in the debts counter.
  sq.debtsCount = sq.debtsCount + sr.count
  sq.debtsQueue.push(sr)

proc getRewindPoint*[T](sq: SyncQueue[T], failSlot: Slot,
                        safeSlot: Slot): Slot =
  ## Returns the slot the queue ``sq`` should be rewound to after a failure at
  ## slot ``failSlot``, where ``safeSlot`` is the current safe slot.
  ##
  ## Forward queue: the rewind distance is measured in epochs. It starts with
  ## one epoch and is doubled every time the failure repeats at the very same
  ## slot, but it never reaches behind the epoch of ``safeSlot``. The chosen
  ## rewind point is remembered in the queue. When no rewind is possible a
  ## warning is logged and the previous rewind point (or the start of the safe
  ## slot's epoch, if there was none) is returned.
  ##
  ## Backward queue: ``safeSlot`` is always returned.
  logScope:
    sync_ident = sq.ident
    direction = sq.kind
    topics = "syncman"

  case sq.kind
  of SyncQueueKind.Forward:
    # Calculate the latest finalized epoch.
    let finalizedEpoch = epoch(safeSlot)

    # Calculate failure epoch.
    let failEpoch = epoch(failSlot)

    # Calculate exponential rewind point in number of epochs.
    let epochCount =
      if sq.rewind.isSome():
        let rewind = sq.rewind.get()
        if failSlot == rewind.failSlot:
          # `MissingParent` happened at same slot so we increase rewind point by
          # factor of 2.
          if failEpoch > finalizedEpoch:
            let rewindPoint = rewind.epochCount shl 1
            if rewindPoint < rewind.epochCount:
              # If exponential rewind point produces `uint64` overflow we will
              # make rewind to latest finalized epoch.
              failEpoch - finalizedEpoch
            else:
              if (failEpoch < rewindPoint) or
                 (failEpoch - rewindPoint < finalizedEpoch):
                # If exponential rewind point points to position which is far
                # behind latest finalized epoch.
                failEpoch - finalizedEpoch
              else:
                rewindPoint
          else:
            warn "Trying to rewind over the last finalized epoch",
                 finalized_slot = safeSlot, fail_slot = failSlot,
                 finalized_epoch = finalizedEpoch, fail_epoch = failEpoch,
                 rewind_epoch_count = rewind.epochCount
            0'u64
        else:
          # `MissingParent` happened at different slot so we going to rewind for
          # 1 epoch only.
          if (failEpoch < 1'u64) or (failEpoch - 1'u64 < finalizedEpoch):
            warn "Could not rewind further than the last finalized epoch",
                 finalized_slot = safeSlot, fail_slot = failSlot,
                 finalized_epoch = finalizedEpoch, fail_epoch = failEpoch,
                 rewind_epoch_count = rewind.epochCount
            0'u64
          else:
            1'u64
      else:
        # `MissingParent` happened first time.
        if (failEpoch < 1'u64) or (failEpoch - 1'u64 < finalizedEpoch):
          warn "Could not rewind further than the last finalized epoch",
               finalized_slot = safeSlot, fail_slot = failSlot,
               finalized_epoch = finalizedEpoch, fail_epoch = failEpoch
          0'u64
        else:
          1'u64

    if epochCount == 0'u64:
      warn "Unable to continue syncing, please restart the node",
           finalized_slot = safeSlot, fail_slot = failSlot,
           finalized_epoch = finalizedEpoch, fail_epoch = failEpoch
      # Calculate the rewind epoch, which will be equal to last rewind point or
      # finalizedEpoch
      let rewindEpoch =
        if sq.rewind.isNone():
          finalizedEpoch
        else:
          epoch(sq.rewind.get().failSlot) - sq.rewind.get().epochCount
      rewindEpoch.start_slot()
    else:
      # Calculate the rewind epoch, which should not be less than the latest
      # finalized epoch.
      let rewindEpoch = failEpoch - epochCount
      # Update and save new rewind point in SyncQueue.
      sq.rewind = some(RewindPoint(failSlot: failSlot, epochCount: epochCount))
      rewindEpoch.start_slot()
  of SyncQueueKind.Backward:
    # While we perform backward sync, the only possible slot we could rewind is
    # latest stored block.
    if failSlot == safeSlot:
      warn "Unable to continue syncing, please restart the node",
           safe_slot = safeSlot, fail_slot = failSlot
    safeSlot

iterator blocks*[T](sq: SyncQueue[T],
                    sr: SyncResult[T]): ref ForkedSignedBeaconBlock =
  ## Yields the blocks of result ``sr`` in the processing order of queue
  ## ``sq``: in stored order for a forward queue and in reversed order for a
  ## backward queue.
  case sq.kind
  of SyncQueueKind.Forward:
    for idx in 0 ..< len(sr.data):
      yield sr.data[idx]
  of SyncQueueKind.Backward:
    for idx in countdown(high(sr.data), 0):
      yield sr.data[idx]

proc advanceOutput*[T](sq: SyncQueue[T], number: uint64) =
  ## Moves the output slot of queue ``sq`` by ``number`` slots in the
  ## direction of the queue.
  sq.outSlot =
    case sq.kind
    of SyncQueueKind.Forward:
      sq.outSlot + number
    of SyncQueueKind.Backward:
      sq.outSlot - number

proc advanceInput[T](sq: SyncQueue[T], number: uint64) =
  ## Moves the input slot of queue ``sq`` by ``number`` slots in the
  ## direction of the queue.
  sq.inpSlot =
    case sq.kind
    of SyncQueueKind.Forward:
      sq.inpSlot + number
    of SyncQueueKind.Backward:
      sq.inpSlot - number

proc notInRange[T](sq: SyncQueue[T], sr: SyncRequest[T]): bool =
  ## Backpressure check: returns ``true`` if the response for request ``sr``
  ## has to wait because the queue output did not reach it yet. A queue
  ## without a size limit never makes responses wait.
  if sq.queueSize <= 0:
    return false
  case sq.kind
  of SyncQueueKind.Forward:
    sr.slot > sq.outSlot
  of SyncQueueKind.Backward:
    sr.lastSlot < sq.outSlot

func numAlreadyKnownSlots[T](sq: SyncQueue[T], sr: SyncRequest[T]): uint64 =
  ## Returns how many of the slots covered by request ``sr`` are already
  ## behind the output slot of queue ``sq`` and therefore do not contribute
  ## to the sync progress anymore.
  let
    frontier = sq.outSlot
    lo = sr.slot
    hi = sr.lastSlot
  case sq.kind
  of SyncQueueKind.Forward:
    if frontier > hi:
      # Nothing of the request is relevant anymore.
      return sr.count
    if frontier > lo:
      # Only the leading part of the request is known.
      return frontier - lo
  of SyncQueueKind.Backward:
    if lo > frontier:
      # Nothing of the request is relevant anymore.
      return sr.count
    if hi > frontier:
      # Only the trailing part of the request is known.
      return hi - frontier
  # The whole request is still relevant.
  0'u64

proc push*[T](sq: SyncQueue[T], sr: SyncRequest[T],
              data: seq[ref ForkedSignedBeaconBlock],
              processingCb: ProcessingCallback = nil) {.async.} =
  ## Push successful result to queue ``sq``.
  ##
  ## ``sr`` is the request which was answered and ``data`` holds the received
  ## blocks. ``processingCb`` (if not nil) is called right before the blocks
  ## of a ready response are passed to the block verifier.
  ##
  ## Requests which are not in the pending list anymore are silently dropped.
  ## The call blocks while the request is not in range of the queue's output
  ## slot (backpressure). Ready responses are verified block by block, on
  ## success the output slot is advanced, on failure the request is moved to
  ## the debts queue and, if a parent block is missing, the queue is rewound.
  logScope:
    sync_ident = sq.ident
    topics = "syncman"

  mixin updateScore

  if sr.index notin sq.pending:
    # If request `sr` not in our pending list, it only means that
    # SyncQueue.resetWait() happens and all pending requests are expired, so
    # we swallow `old` requests, and in such way sync-workers are able to get
    # proper new requests from SyncQueue.
    return

  sq.pending.del(sr.index)

  # This is backpressure handling algorithm, this algorithm is blocking
  # all pending `push` requests if `request.slot` not in range.
  while true:
    if sq.notInRange(sr):
      let reset = await sq.waitForChanges()
      if reset:
        # SyncQueue reset happens. We are exiting to wake up sync-worker.
        return
    else:
      let syncres = SyncResult[T](request: sr, data: data)
      sq.readyQueue.push(syncres)
      break

  while len(sq.readyQueue) > 0:
    let reqres =
      case sq.kind
      of SyncQueueKind.Forward:
        let minSlot = sq.readyQueue[0].request.slot
        if sq.outSlot < minSlot:
          none[SyncResult[T]]()
        else:
          some(sq.readyQueue.pop())
      of SyncQueueKind.Backward:
        let maxslot = sq.readyQueue[0].request.slot +
                      (sq.readyQueue[0].request.count - 1'u64)
        if sq.outSlot > maxslot:
          none[SyncResult[T]]()
        else:
          some(sq.readyQueue.pop())

    let item =
      if reqres.isSome():
        reqres.get()
      else:
        let rewindSlot = sq.getRewindPoint(sq.outSlot, sq.getSafeSlot())
        warn "Got incorrect sync result in queue, rewind happens",
             blocks_map = getShortMap(sq.readyQueue[0].request,
                                      sq.readyQueue[0].data),
             blocks_count = len(sq.readyQueue[0].data),
             output_slot = sq.outSlot, input_slot = sq.inpSlot,
             rewind_to_slot = rewindSlot, request = sq.readyQueue[0].request
        await sq.resetWait(some(rewindSlot))
        break

    if processingCb != nil:
      processingCb()

    # Validating received blocks one by one
    var
      hasOkBlock = false
      hasInvalidBlock = false
      unviableBlock: Option[(Eth2Digest, Slot)]
      missingParentSlot: Option[Slot]

      # compiler segfault if this is moved into the for loop, at time of writing
      # TODO this does segfault in 1.2 but not 1.6, so remove workaround when 1.2
      # is dropped.
      res: Result[void, BlockError]

    for blk in sq.blocks(item):
      res = await sq.blockVerifier(blk[])
      if res.isOk():
        hasOkBlock = true
      else:
        case res.error()
        of BlockError.MissingParent:
          missingParentSlot = some(blk[].slot)
          break
        of BlockError.Duplicate:
          # Keep going, happens naturally
          discard
        of BlockError.UnviableFork:
          # Keep going so as to register other unviable blocks with the
          # quarantine
          if unviableBlock.isNone:
            # Remember the first unviable block, so we can log it
            unviableBlock = some((blk[].root, blk[].slot))

        of BlockError.Invalid:
          hasInvalidBlock = true

          let req = item.request
          warn "Received invalid sequence of blocks", request = req,
                blocks_count = len(item.data),
                blocks_map = getShortMap(req, item.data)
          req.item.updateScore(PeerScoreBadBlocks)
          break

    # When errors happen while processing blocks, we retry the same request
    # with, hopefully, a different peer
    let retryRequest =
      hasInvalidBlock or unviableBlock.isSome() or missingParentSlot.isSome()
    if not retryRequest:
      # The number of known slots has to be calculated for the request which
      # was just processed (`item.request`). The ready queue could hold
      # responses pushed by other callers, so the processed response does not
      # necessarily belong to the request `sr` of this call.
      let numSlotsAdvanced =
        item.request.count - sq.numAlreadyKnownSlots(item.request)
      sq.advanceOutput(numSlotsAdvanced)

      if hasOkBlock:
        # If there no error and response was not empty we should reward peer
        # with some bonus score - not for duplicate blocks though.
        item.request.item.updateScore(PeerScoreGoodBlocks)

      if numSlotsAdvanced > 0:
        sq.wakeupWaiters()
    else:
      debug "Block pool rejected peer's response", request = item.request,
            blocks_map = getShortMap(item.request, item.data),
            blocks_count = len(item.data),
            ok = hasOkBlock,
            unviable = unviableBlock.isSome(),
            missing_parent = missingParentSlot.isSome()
      # We need to move failed response to the debts queue.
      sq.toDebtsQueue(item.request)

      if unviableBlock.isSome:
        let req = item.request
        notice "Received blocks from an unviable fork", request = req,
              blockRoot = unviableBlock.get()[0],
              blockSlot = unviableBlock.get()[1],
              blocks_count = len(item.data),
              blocks_map = getShortMap(req, item.data)
        req.item.updateScore(PeerScoreUnviableFork)

      if missingParentSlot.isSome:
        var
          resetSlot: Option[Slot]
          failSlot = missingParentSlot.get()

        # If we got `BlockError.MissingParent` it means that peer returns chain
        # of blocks with holes or `block_pool` is in incomplete state. We going
        # to rewind to the first slot at latest finalized epoch.
        let
          req = item.request
          safeSlot = sq.getSafeSlot()
        case sq.kind
        of SyncQueueKind.Forward:
          if safeSlot < req.slot:
            let rewindSlot = sq.getRewindPoint(failSlot, safeSlot)
            warn "Unexpected missing parent, rewind happens",
                 request = req, rewind_to_slot = rewindSlot,
                 rewind_point = sq.rewind, finalized_slot = safeSlot,
                 blocks_count = len(item.data),
                 blocks_map = getShortMap(req, item.data)
            resetSlot = some(rewindSlot)
            req.item.updateScore(PeerScoreMissingBlocks)
          else:
            error "Unexpected missing parent at finalized epoch slot",
                  request = req, rewind_to_slot = safeSlot,
                  blocks_count = len(item.data),
                  blocks_map = getShortMap(req, item.data)
            req.item.updateScore(PeerScoreBadBlocks)
        of SyncQueueKind.Backward:
          if safeSlot > req.slot:
            let rewindSlot = sq.getRewindPoint(failSlot, safeSlot)
            # It's quite common peers give us fewer blocks than we ask for
            info "Gap in block range response, rewinding", request = req,
                 rewind_to_slot = rewindSlot, rewind_fail_slot = failSlot,
                 finalized_slot = safeSlot, blocks_count = len(item.data),
                 blocks_map = getShortMap(req, item.data)
            resetSlot = some(rewindSlot)
            req.item.updateScore(PeerScoreMissingBlocks)
          else:
            error "Unexpected missing parent at safe slot", request = req,
                  to_slot = safeSlot, blocks_count = len(item.data),
                  blocks_map = getShortMap(req, item.data)
            req.item.updateScore(PeerScoreBadBlocks)

        if resetSlot.isSome():
          await sq.resetWait(resetSlot)
          case sq.kind
          of SyncQueueKind.Forward:
            debug "Rewind to slot has happened", reset_slot = resetSlot.get(),
                  queue_input_slot = sq.inpSlot, queue_output_slot = sq.outSlot,
                  rewind_point = sq.rewind, direction = sq.kind
          of SyncQueueKind.Backward:
            debug "Rewind to slot has happened", reset_slot = resetSlot.get(),
                  queue_input_slot = sq.inpSlot, queue_output_slot = sq.outSlot,
                  direction = sq.kind

      break

proc push*[T](sq: SyncQueue[T], sr: SyncRequest[T]) =
  ## Push failed request back to queue.
  ##
  ## A request which is still pending is moved to the debts queue. A request
  ## which is not in the pending list was expired by SyncQueue.resetWait(), so
  ## it is silently dropped and the sync-worker is able to obtain a proper new
  ## request from the SyncQueue.
  if sr.index in sq.pending:
    sq.pending.del(sr.index)
    sq.toDebtsQueue(sr)

proc handlePotentialSafeSlotAdvancement[T](sq: SyncQueue[T]) =
  # The safe slot may get ahead of the queue, either because a response
  # started with good blocks and failed only late, or because of progress made
  # through an out-of-band mechanism, e.g., VC / REST.
  # In this case the queue is moved to the new safe slot, so data which is
  # considered immutable and no longer relevant is not requested again.
  let safeSlot = sq.getSafeSlot()

  func lag(slot: Slot): uint64 =
    # Number of slots by which `slot` is behind `safeSlot` in the direction
    # of the queue, zero if it is not behind.
    case sq.kind
    of SyncQueueKind.Forward:
      if safeSlot > slot: safeSlot - slot else: 0
    of SyncQueueKind.Backward:
      if slot > safeSlot: slot - safeSlot else: 0

  let
    outLag = lag(sq.outSlot)
    inpLag =
      if (sq.kind == SyncQueueKind.Backward) and
         (sq.inpSlot == 0xFFFF_FFFF_FFFF_FFFF'u64):
        # Input slot of a backward queue holding the maximum value is not
        # moved.
        0'u64
      else:
        lag(sq.inpSlot)

  if outLag == 0 and inpLag == 0:
    return

  debug "Sync progress advanced out-of-band",
    safeSlot, outSlot = sq.outSlot, inpSlot = sq.inpSlot
  if outLag != 0:
    sq.advanceOutput(outLag)
  if inpLag != 0:
    sq.advanceInput(inpLag)
  sq.wakeupWaiters()

func updateRequestForNewSafeSlot[T](sq: SyncQueue[T], sr: var SyncRequest[T]) =
  ## Trim `sr` so that it does not cover slots `sq.outSlot` already passed.
  ##
  ## Requests may have originated before the latest `safeSlot` advancement.
  ## Depending on where `sq.outSlot` lies relative to the request's range
  ## (`sr.slot` .. `sr.lastSlot`) the request is
  ## * left untouched, when it is still entirely relevant;
  ## * shortened by a whole number of steps, when it is partially relevant;
  ## * emptied (`step` and `count` set to 0), when nothing of it is relevant.
  let
    progressSlot = sq.outSlot
    firstSlot = sr.slot
    finalSlot = sr.lastSlot
  case sq.kind
  of SyncQueueKind.Forward:
    # Nothing to do while the output position has not passed the first slot.
    if progressSlot > firstSlot:
      if progressSlot <= finalSlot:
        # Drop the already processed head of the range, rounding the number
        # of skipped slots up to a multiple of the request's step.
        let stepsDone =
          (progressSlot - firstSlot + sr.step - 1) div sr.step
        sr.slot += stepsDone * sr.step
        sr.count -= stepsDone
      else:
        # Output position is beyond the whole range.
        sr.step = 0
        sr.count = 0
  of SyncQueueKind.Backward:
    # Nothing to do while the output position is at or above the last slot.
    if progressSlot < finalSlot:
      if progressSlot >= firstSlot:
        # Drop the already processed tail of the range; the starting slot
        # stays the same, only the number of steps shrinks.
        let stepsDone =
          (finalSlot - progressSlot + sr.step - 1) div sr.step
        sr.count -= stepsDone
      else:
        # Output position is below the whole range.
        sr.step = 0
        sr.count = 0

proc pop*[T](sq: SyncQueue[T], maxslot: Slot, item: T): SyncRequest[T] =
  ## Produce the next request for a peer whose latest slot is `maxslot`.
  ##
  ## The queue is first aligned with the current safe slot. Requests waiting
  ## in the debts queue are then served before any new range is created; a
  ## debt that became empty after being trimmed to the current output slot is
  ## discarded. When no debt is left, a new chunk of at most `sq.chunkSize`
  ## slots is carved out at the input position. `item` is attached to the
  ## returned request, which is also registered as pending.
  ##
  ## An empty request is returned when the peer cannot serve the next range
  ## or when the queue has no more slots to hand out.
  sq.handlePotentialSafeSlotAdvancement()

  while len(sq.debtsQueue) > 0:
    if maxslot < sq.debtsQueue[0].slot or
        maxslot < sq.debtsQueue[0].lastSlot():
      # Peer's latest slot is less than the starting or the finishing slot
      # of the first request in the debts queue.
      return SyncRequest.empty(sq.kind, T)
    var retry = sq.debtsQueue.pop()
    sq.debtsCount -= retry.count
    sq.updateRequestForNewSafeSlot(retry)
    if not retry.isEmpty:
      retry.setItem(item)
      sq.makePending(retry)
      return retry

  case sq.kind
  of SyncQueueKind.Forward:
    if maxslot < sq.inpSlot or sq.inpSlot > sq.finalSlot:
      # Either peer's latest slot is less than queue's input slot, or queue's
      # input slot is bigger than queue's final slot.
      return SyncRequest.empty(sq.kind, T)
    let
      upperSlot = min(maxslot, sq.finalSlot)
      slotCount = min(sq.chunkSize, upperSlot + 1'u64 - sq.inpSlot)
    var fresh = SyncRequest.init(sq.kind, sq.inpSlot, slotCount, item)
    sq.advanceInput(slotCount)
    sq.makePending(fresh)
    return fresh
  of SyncQueueKind.Backward:
    if sq.inpSlot == 0xFFFF_FFFF_FFFF_FFFF'u64 or sq.inpSlot < sq.finalSlot:
      # Input position is either the all-ones value or already below queue's
      # final slot.
      return SyncRequest.empty(sq.kind, T)
    let
      # One past the highest slot of the new chunk.
      upperBound = sq.inpSlot + 1'u64
      # Never reach below `finalSlot`: the last chunk may be shorter.
      slotCount = min(uint64(upperBound - sq.finalSlot), sq.chunkSize)
      firstSlot = upperBound - slotCount
    if (maxslot + 1'u64) < firstSlot + slotCount:
      # Peer's latest slot is less than queue's input slot.
      return SyncRequest.empty(sq.kind, T)
    var fresh = SyncRequest.init(sq.kind, firstSlot, slotCount, item)
    sq.advanceInput(slotCount)
    sq.makePending(fresh)
    return fresh

proc debtLen*[T](sq: SyncQueue[T]): uint64 =
  ## Returns the current value of the debts counter of queue ``sq``, i.e. the
  ## sum of `count` of the requests accounted for in the debts queue.
  result = sq.debtsCount

proc pendingLen*[T](sq: SyncQueue[T]): uint64 =
  ## Returns the distance, in slots, between the input and the output
  ## positions of queue ``sq``.
  # When moving forward `outSlot` is expected to be <= `inpSlot`, when moving
  # backward it is expected to be >= `inpSlot`, so the operands are ordered
  # by direction before subtracting.
  let (leading, trailing) =
    case sq.kind
    of SyncQueueKind.Forward: (sq.inpSlot, sq.outSlot)
    of SyncQueueKind.Backward: (sq.outSlot, sq.inpSlot)
  leading - trailing

proc len*[T](sq: SyncQueue[T]): uint64 {.inline.} =
  ## Returns number of slots left in queue ``sq``: the inclusive distance
  ## between the output position and the final slot, measured in the
  ## direction of the sync.
  let (target, current) =
    case sq.kind
    of SyncQueueKind.Forward: (sq.finalSlot, sq.outSlot)
    of SyncQueueKind.Backward: (sq.outSlot, sq.finalSlot)
  target + 1'u64 - current

proc total*[T](sq: SyncQueue[T]): uint64 {.inline.} =
  ## Returns total number of slots in queue ``sq``: the inclusive distance
  ## between the start slot and the final slot, measured in the direction of
  ## the sync.
  let (upper, lower) =
    case sq.kind
    of SyncQueueKind.Forward: (sq.finalSlot, sq.startSlot)
    of SyncQueueKind.Backward: (sq.startSlot, sq.finalSlot)
  upper + 1'u64 - lower

proc progress*[T](sq: SyncQueue[T]): uint64 =
  ## How many slots we've synced so far: the distance between the start slot
  ## and the output position, measured in the direction of the sync.
  let (reached, origin) =
    case sq.kind
    of SyncQueueKind.Forward: (sq.outSlot, sq.startSlot)
    of SyncQueueKind.Backward: (sq.startSlot, sq.outSlot)
  reached - origin
-												Backward sync support for SyncManager. (#3131)

* Unbundle SyncQueue from sync_manager.nim.
Unbundle Peer scores constants to peer_scores.nim.
Add Forward/Backward enum.

* Further improvements and tests.

* Adopt getRewindPoint() and fix MissingParent handler.

* Remove unused procedures.
Refactor `result` usage.
Fix resetWait().

* Add all the tests and fix the issue with rewind point.

* Fix get() issue.

* Fix flaky tests.

* test fixes

Co-authored-by: Jacek Sieka <jacek@status.im>
											
										
										
											2021-12-08 21:15:29 +00:00
+								# beacon_chain
-												complete switch to beacon_chain/specs/datatypes/bellatrix (#3295)


											
										
										
											2022-01-18 13:36:52 +00:00
+								# Copyright (c) 2018-2022 Status Research & Development GmbH
-												Backward sync support for SyncManager. (#3131)

* Unbundle SyncQueue from sync_manager.nim.
Unbundle Peer scores constants to peer_scores.nim.
Add Forward/Backward enum.

* Further improvements and tests.

* Adopt getRewindPoint() and fix MissingParent handler.

* Remove unused procedures.
Refactor `result` usage.
Fix resetWait().

* Add all the tests and fix the issue with rewind point.

* Fix get() issue.

* Fix flaky tests.

* test fixes

Co-authored-by: Jacek Sieka <jacek@status.im>
											
										
										
											2021-12-08 21:15:29 +00:00
+								# Licensed and distributed under either of
 								#   * MIT license (license terms in the root directory or at https://opensource.org/licenses/MIT).
 								#   * Apache v2 license (license terms in the root directory or at https://www.apache.org/licenses/LICENSE-2.0).
 								# at your option. This file may not be copied, modified, or distributed except according to those terms.
 								{.push raises: [Defect].}
 								import std/[options, heapqueue, tables, strutils, sequtils, math, algorithm]
-												Refactor and optimize sync logs. (#3451)

* Refactor and optimize logs.

* Introduce shortLog(SyncRequest).

* Address review comment.

* make sync queue logs more consistent

Adds a few minor logging improvements:
- Fixes a typo (`was happened` -> `has happened`)
- Avoids passing `reset_slot` argument to log statement multiple times
- Uses same `rewind_to_slot` label when logging in both sync directions
- Consistent rewind point logging

Co-authored-by: cheatfate <eugene.kabanov@status.im>
											
										
										
											2022-03-03 08:05:33 +00:00
+								import stew/[results, base10], chronos, chronicles
-												Backward sync support for SyncManager. (#3131)

* Unbundle SyncQueue from sync_manager.nim.
Unbundle Peer scores constants to peer_scores.nim.
Add Forward/Backward enum.

* Further improvements and tests.

* Adopt getRewindPoint() and fix MissingParent handler.

* Remove unused procedures.
Refactor `result` usage.
Fix resetWait().

* Add all the tests and fix the issue with rewind point.

* Fix get() issue.

* Fix flaky tests.

* test fixes

Co-authored-by: Jacek Sieka <jacek@status.im>
											
										
										
											2021-12-08 21:15:29 +00:00
+								import
-												complete switch to beacon_chain/specs/datatypes/bellatrix (#3295)


											
										
										
											2022-01-18 13:36:52 +00:00
+								  ../spec/datatypes/[base, phase0, altair],
-												Backward sync support for SyncManager. (#3131)

* Unbundle SyncQueue from sync_manager.nim.
Unbundle Peer scores constants to peer_scores.nim.
Add Forward/Backward enum.

* Further improvements and tests.

* Adopt getRewindPoint() and fix MissingParent handler.

* Remove unused procedures.
Refactor `result` usage.
Fix resetWait().

* Add all the tests and fix the issue with rewind point.

* Fix get() issue.

* Fix flaky tests.

* test fixes

Co-authored-by: Jacek Sieka <jacek@status.im>
											
										
										
											2021-12-08 21:15:29 +00:00
+								  ../spec/[helpers, forks],
 								  ../networking/[peer_pool, eth2_network],
 								  ../gossip_processing/block_processor,
-												Harden handling of unviable forks (#3312)

* Harden handling of unviable forks

In our current handling of unviable forks, we allow peers to send us
blocks that come from a different fork - this is not necessarily an
error as it can happen naturally, but it does open up the client to a
case where the same unviable fork keeps getting requested - rather than
allowing this to happen, we'll now give these peers a small negative
score - if it keeps happening, we'll disconnect them.

* keep track of unviable forks in quarantine, to avoid filling it with
known junk
* collect peer scores in single module
* descore peers when they send unviable blocks during sync
* don't give score for duplicate blocks
* increase quarantine size to a level that allows finality to happen
under optimal conditions - this helps avoid downloading the same blocks
over and over in case of an unviable fork
* increase initial score for new peers to make room for one more failure
before disconnection
* log and score invalid/unviable blocks in requestmanager too
* avoid ChainDAG dependency in quarantine
* reject gossip blocks with unviable parent
* continue processing unviable sync blocks in order to build unviable
dag

* docs

* Update beacon_chain/consensus_object_pools/block_pools_types.nim

* add unviable queue test
											
										
										
											2022-01-26 12:20:08 +00:00
+								  ../consensus_object_pools/block_pools_types
-												Backward sync support for SyncManager. (#3131)

* Unbundle SyncQueue from sync_manager.nim.
Unbundle Peer scores constants to peer_scores.nim.
Add Forward/Backward enum.

* Further improvements and tests.

* Adopt getRewindPoint() and fix MissingParent handler.

* Remove unused procedures.
Refactor `result` usage.
Fix resetWait().

* Add all the tests and fix the issue with rewind point.

* Fix get() issue.

* Fix flaky tests.

* test fixes

Co-authored-by: Jacek Sieka <jacek@status.im>
											
										
										
											2021-12-08 21:15:29 +00:00
 								export base, phase0, altair, merge, chronos, chronicles, results,
-												Harden handling of unviable forks (#3312)

* Harden handling of unviable forks

In our current handling of unviable forks, we allow peers to send us
blocks that come from a different fork - this is not necessarily an
error as it can happen naturally, but it does open up the client to a
case where the same unviable fork keeps getting requested - rather than
allowing this to happen, we'll now give these peers a small negative
score - if it keeps happening, we'll disconnect them.

* keep track of unviable forks in quarantine, to avoid filling it with
known junk
* collect peer scores in single module
* descore peers when they send unviable blocks during sync
* don't give score for duplicate blocks
* increase quarantine size to a level that allows finality to happen
under optimal conditions - this helps avoid downloading the same blocks
over and over in case of an unviable fork
* increase initial score for new peers to make room for one more failure
before disconnection
* log and score invalid/unviable blocks in requestmanager too
* avoid ChainDAG dependency in quarantine
* reject gossip blocks with unviable parent
* continue processing unviable sync blocks in order to build unviable
dag

* docs

* Update beacon_chain/consensus_object_pools/block_pools_types.nim

* add unviable queue test
											
										
										
											2022-01-26 12:20:08 +00:00
+								       block_pools_types, helpers
-												Backward sync support for SyncManager. (#3131)

* Unbundle SyncQueue from sync_manager.nim.
Unbundle Peer scores constants to peer_scores.nim.
Add Forward/Backward enum.

* Further improvements and tests.

* Adopt getRewindPoint() and fix MissingParent handler.

* Remove unused procedures.
Refactor `result` usage.
Fix resetWait().

* Add all the tests and fix the issue with rewind point.

* Fix get() issue.

* Fix flaky tests.

* test fixes

Co-authored-by: Jacek Sieka <jacek@status.im>
											
										
										
											2021-12-08 21:15:29 +00:00
 								logScope:
 								  topics = "syncqueue"
 								type
 								  GetSlotCallback* = proc(): Slot {.gcsafe, raises: [Defect].}
-												SyncManager cleanups for backfill support (#3189)

* SyncManager cleanups for backfill support

Cleanups, fixes and simplifications, in anticipation of backfill support
for the `SyncManager`:

* reformat sync progress indicator to show time left and % done more
prominently:
  * old: `sync="sPssPsssss:2:2.4229:00h57m (2706898)"`
  * new: `sync="14d12h31m (0.52%) 1.1378slots/s (wQQQQQDDQQ:1287520)"`
* reset average speed when going out of sync
* pass all block errors to sync manager, including duplicate/unviable
* penalize peers for reporting a head block that is outside of our
expected wall clock time (they're likely on a different network or
trying to disrupt sync)
* remove `SyncFailureKind` (unused)
* remove `inRange` (unused)
* add `Q` for sync queue requests that are in the `SyncQueue` but not
yet in the `BlockProcessor` queue
* update last slot in `SyncQueue` after getting peer status
* fix race condition between `wakeupWaiters` and `resetWait`, where
workers would not be correctly reset if block verification returned a
completed future without event loop
* log syncmanager direction

* Fix ordering issue.
Some of the requests size of which are not equal to `chunkSize` could be processed in wrong order which could lead to sync process freezes.

Co-authored-by: cheatfate <eugene.kabanov@status.im>
											
										
										
											2021-12-16 14:57:16 +00:00
+								  ProcessingCallback* = proc() {.gcsafe, raises: [Defect].}
 								  BlockVerifier* =
 								    proc(signedBlock: ForkedSignedBeaconBlock):
 								      Future[Result[void, BlockError]] {.gcsafe, raises: [Defect].}
-												Backward sync support for SyncManager. (#3131)

* Unbundle SyncQueue from sync_manager.nim.
Unbundle Peer scores constants to peer_scores.nim.
Add Forward/Backward enum.

* Further improvements and tests.

* Adopt getRewindPoint() and fix MissingParent handler.

* Remove unused procedures.
Refactor `result` usage.
Fix resetWait().

* Add all the tests and fix the issue with rewind point.

* Fix get() issue.

* Fix flaky tests.

* test fixes

Co-authored-by: Jacek Sieka <jacek@status.im>
											
										
										
											2021-12-08 21:15:29 +00:00
 								  SyncQueueKind* {.pure.} = enum
 								    Forward, Backward
 								  SyncRequest*[T] = object
-												Refactor and optimize sync logs. (#3451)

* Refactor and optimize logs.

* Introduce shortLog(SyncRequest).

* Address review comment.

* make sync queue logs more consistent

Adds a few minor logging improvements:
- Fixes a typo (`was happened` -> `has happened`)
- Avoids passing `reset_slot` argument to log statement multiple times
- Uses same `rewind_to_slot` label when logging in both sync directions
- Consistent rewind point logging

Co-authored-by: cheatfate <eugene.kabanov@status.im>
											
										
										
											2022-03-03 08:05:33 +00:00
+								    kind*: SyncQueueKind
-												Backward sync support for SyncManager. (#3131)

* Unbundle SyncQueue from sync_manager.nim.
Unbundle Peer scores constants to peer_scores.nim.
Add Forward/Backward enum.

* Further improvements and tests.

* Adopt getRewindPoint() and fix MissingParent handler.

* Remove unused procedures.
Refactor `result` usage.
Fix resetWait().

* Add all the tests and fix the issue with rewind point.

* Fix get() issue.

* Fix flaky tests.

* test fixes

Co-authored-by: Jacek Sieka <jacek@status.im>
											
										
										
											2021-12-08 21:15:29 +00:00
+								    index*: uint64
 								    slot*: Slot
 								    count*: uint64
 								    step*: uint64
 								    item*: T
 								  SyncResult*[T] = object
 								    request*: SyncRequest[T]
-												harden and speed up block sync (#3358)

* harden and speed up block sync

The `GetBlockBy*` server implementation currently reads SSZ bytes from
database, deserializes them into a Nim object then serializes them right
back to SSZ - here, we eliminate the deser/ser steps and send the bytes
straight to the network. Unfortunately, the snappy recoding must still
be done because of differences in framing.

Also, the quota system makes one giant request for quota right before
sending all blocks - this means that a 1024 block request will be
"paused" for a long time, then all blocks will be sent at once causing a
spike in database reads which potentially will see the reading client
time out before any block is sent.

Finally, on the reading side we make several copies of blocks as they
travel through various queues - this was not noticeable before but
becomes a problem in two cases: bellatrix blocks are up to 10mb (instead
of .. 30-40kb) and when backfilling, we process a lot more of them a lot
faster.

* fix status comparisons for nodes syncing from genesis (#3327 was a bit
too hard)
* don't hit database at all for post-altair slots in GetBlock v1
requests
											
										
										
											2022-02-07 17:20:10 +00:00
+								    data*: seq[ref ForkedSignedBeaconBlock]
-												Backward sync support for SyncManager. (#3131)

* Unbundle SyncQueue from sync_manager.nim.
Unbundle Peer scores constants to peer_scores.nim.
Add Forward/Backward enum.

* Further improvements and tests.

* Adopt getRewindPoint() and fix MissingParent handler.

* Remove unused procedures.
Refactor `result` usage.
Fix resetWait().

* Add all the tests and fix the issue with rewind point.

* Fix get() issue.

* Fix flaky tests.

* test fixes

Co-authored-by: Jacek Sieka <jacek@status.im>
											
										
										
											2021-12-08 21:15:29 +00:00
-												SyncManager cleanups for backfill support (#3189)

* SyncManager cleanups for backfill support

Cleanups, fixes and simplifications, in anticipation of backfill support
for the `SyncManager`:

* reformat sync progress indicator to show time left and % done more
prominently:
  * old: `sync="sPssPsssss:2:2.4229:00h57m (2706898)"`
  * new: `sync="14d12h31m (0.52%) 1.1378slots/s (wQQQQQDDQQ:1287520)"`
* reset average speed when going out of sync
* pass all block errors to sync manager, including duplicate/unviable
* penalize peers for reporting a head block that is outside of our
expected wall clock time (they're likely on a different network or
trying to disrupt sync)
* remove `SyncFailureKind` (unused)
* remove `inRange` (unused)
* add `Q` for sync queue requests that are in the `SyncQueue` but not
yet in the `BlockProcessor` queue
* update last slot in `SyncQueue` after getting peer status
* fix race condition between `wakeupWaiters` and `resetWait`, where
workers would not be correctly reset if block verification returned a
completed future without event loop
* log syncmanager direction

* Fix ordering issue.
Some of the requests size of which are not equal to `chunkSize` could be processed in wrong order which could lead to sync process freezes.

Co-authored-by: cheatfate <eugene.kabanov@status.im>
											
										
										
											2021-12-16 14:57:16 +00:00
+								  SyncWaiter* = ref object
 								    future: Future[void]
 								    reset: bool
-												Backward sync support for SyncManager. (#3131)

* Unbundle SyncQueue from sync_manager.nim.
Unbundle Peer scores constants to peer_scores.nim.
Add Forward/Backward enum.

* Further improvements and tests.

* Adopt getRewindPoint() and fix MissingParent handler.

* Remove unused procedures.
Refactor `result` usage.
Fix resetWait().

* Add all the tests and fix the issue with rewind point.

* Fix get() issue.

* Fix flaky tests.

* test fixes

Co-authored-by: Jacek Sieka <jacek@status.im>
											
										
										
											2021-12-08 21:15:29 +00:00
 								  RewindPoint = object
 								    failSlot: Slot
 								    epochCount: uint64
 								  SyncQueue*[T] = ref object
 								    kind*: SyncQueueKind
 								    inpSlot*: Slot
 								    outSlot*: Slot
 								    startSlot*: Slot
 								    finalSlot*: Slot
 								    chunkSize*: uint64
 								    queueSize*: int
 								    counter*: uint64
 								    pending*: Table[uint64, SyncRequest[T]]
-												SyncManager cleanups for backfill support (#3189)

* SyncManager cleanups for backfill support

Cleanups, fixes and simplifications, in anticipation of backfill support
for the `SyncManager`:

* reformat sync progress indicator to show time left and % done more
prominently:
  * old: `sync="sPssPsssss:2:2.4229:00h57m (2706898)"`
  * new: `sync="14d12h31m (0.52%) 1.1378slots/s (wQQQQQDDQQ:1287520)"`
* reset average speed when going out of sync
* pass all block errors to sync manager, including duplicate/unviable
* penalize peers for reporting a head block that is outside of our
expected wall clock time (they're likely on a different network or
trying to disrupt sync)
* remove `SyncFailureKind` (unused)
* remove `inRange` (unused)
* add `Q` for sync queue requests that are in the `SyncQueue` but not
yet in the `BlockProcessor` queue
* update last slot in `SyncQueue` after getting peer status
* fix race condition between `wakeupWaiters` and `resetWait`, where
workers would not be correctly reset if block verification returned a
completed future without event loop
* log syncmanager direction

* Fix ordering issue.
Some of the requests size of which are not equal to `chunkSize` could be processed in wrong order which could lead to sync process freezes.

Co-authored-by: cheatfate <eugene.kabanov@status.im>
											
										
										
											2021-12-16 14:57:16 +00:00
+								    waiters: seq[SyncWaiter]
-												Backward sync support for SyncManager. (#3131)

* Unbundle SyncQueue from sync_manager.nim.
Unbundle Peer scores constants to peer_scores.nim.
Add Forward/Backward enum.

* Further improvements and tests.

* Adopt getRewindPoint() and fix MissingParent handler.

* Remove unused procedures.
Refactor `result` usage.
Fix resetWait().

* Add all the tests and fix the issue with rewind point.

* Fix get() issue.

* Fix flaky tests.

* test fixes

Co-authored-by: Jacek Sieka <jacek@status.im>
											
										
										
											2021-12-08 21:15:29 +00:00
+								    getSafeSlot*: GetSlotCallback
 								    debtsQueue: HeapQueue[SyncRequest[T]]
 								    debtsCount: uint64
 								    readyQueue: HeapQueue[SyncResult[T]]
 								    rewind: Option[RewindPoint]
-												SyncManager cleanups for backfill support (#3189)

* SyncManager cleanups for backfill support

Cleanups, fixes and simplifications, in anticipation of backfill support
for the `SyncManager`:

* reformat sync progress indicator to show time left and % done more
prominently:
  * old: `sync="sPssPsssss:2:2.4229:00h57m (2706898)"`
  * new: `sync="14d12h31m (0.52%) 1.1378slots/s (wQQQQQDDQQ:1287520)"`
* reset average speed when going out of sync
* pass all block errors to sync manager, including duplicate/unviable
* penalize peers for reporting a head block that is outside of our
expected wall clock time (they're likely on a different network or
trying to disrupt sync)
* remove `SyncFailureKind` (unused)
* remove `inRange` (unused)
* add `Q` for sync queue requests that are in the `SyncQueue` but not
yet in the `BlockProcessor` queue
* update last slot in `SyncQueue` after getting peer status
* fix race condition between `wakeupWaiters` and `resetWait`, where
workers would not be correctly reset if block verification returned a
completed future without event loop
* log syncmanager direction

* Fix ordering issue.
Some of the requests size of which are not equal to `chunkSize` could be processed in wrong order which could lead to sync process freezes.

Co-authored-by: cheatfate <eugene.kabanov@status.im>
											
										
										
											2021-12-16 14:57:16 +00:00
+								    blockVerifier: BlockVerifier
-												Refactor and optimize sync logs. (#3451)

* Refactor and optimize logs.

* Introduce shortLog(SyncRequest).

* Address review comment.

* make sync queue logs more consistent

Adds a few minor logging improvements:
- Fixes a typo (`was happened` -> `has happened`)
- Avoids passing `reset_slot` argument to log statement multiple times
- Uses same `rewind_to_slot` label when logging in both sync directions
- Consistent rewind point logging

Co-authored-by: cheatfate <eugene.kabanov@status.im>
											
										
										
											2022-03-03 08:05:33 +00:00
+								    ident*: string
-												Backward sync support for SyncManager. (#3131)

* Unbundle SyncQueue from sync_manager.nim.
Unbundle Peer scores constants to peer_scores.nim.
Add Forward/Backward enum.

* Further improvements and tests.

* Adopt getRewindPoint() and fix MissingParent handler.

* Remove unused procedures.
Refactor `result` usage.
Fix resetWait().

* Add all the tests and fix the issue with rewind point.

* Fix get() issue.

* Fix flaky tests.

* test fixes

Co-authored-by: Jacek Sieka <jacek@status.im>
											
										
										
											2021-12-08 21:15:29 +00:00
 								  SyncManagerError* = object of CatchableError
-												harden and speed up block sync (#3358)

* harden and speed up block sync

The `GetBlockBy*` server implementation currently reads SSZ bytes from
database, deserializes them into a Nim object then serializes them right
back to SSZ - here, we eliminate the deser/ser steps and send the bytes
straight to the network. Unfortunately, the snappy recoding must still
be done because of differences in framing.

Also, the quota system makes one giant request for quota right before
sending all blocks - this means that a 1024 block request will be
"paused" for a long time, then all blocks will be sent at once causing a
spike in database reads which potentially will see the reading client
time out before any block is sent.

Finally, on the reading side we make several copies of blocks as they
travel through various queues - this was not noticeable before but
becomes a problem in two cases: bellatrix blocks are up to 10mb (instead
of .. 30-40kb) and when backfilling, we process a lot more of them a lot
faster.

* fix status comparisons for nodes syncing from genesis (#3327 was a bit
too hard)
* don't hit database at all for post-altair slots in GetBlock v1
requests
											
										
										
											2022-02-07 17:20:10 +00:00
+								  BeaconBlocksRes* = NetRes[seq[ref ForkedSignedBeaconBlock]]
-												Backward sync support for SyncManager. (#3131)

* Unbundle SyncQueue from sync_manager.nim.
Unbundle Peer scores constants to peer_scores.nim.
Add Forward/Backward enum.

* Further improvements and tests.

* Adopt getRewindPoint() and fix MissingParent handler.

* Remove unused procedures.
Refactor `result` usage.
Fix resetWait().

* Add all the tests and fix the issue with rewind point.

* Fix get() issue.

* Fix flaky tests.

* test fixes

Co-authored-by: Jacek Sieka <jacek@status.im>
											
										
										
											2021-12-08 21:15:29 +00:00
-												Refactor and optimize sync logs. (#3451)

* Refactor and optimize logs.

* Introduce shortLog(SyncRequest).

* Address review comment.

* make sync queue logs more consistent

Adds a few minor logging improvements:
- Fixes a typo (`was happened` -> `has happened`)
- Avoids passing `reset_slot` argument to log statement multiple times
- Uses same `rewind_to_slot` label when logging in both sync directions
- Consistent rewind point logging

Co-authored-by: cheatfate <eugene.kabanov@status.im>
											
										
										
											2022-03-03 08:05:33 +00:00
+								chronicles.formatIt SyncQueueKind: toLowerAscii($it)
 								template shortLog*[T](req: SyncRequest[T]): string =
 								  Base10.toString(uint64(req.slot)) & ":" &
 								  Base10.toString(req.count) & "@" &
 								  Base10.toString(req.index)
 								chronicles.expandIt SyncRequest:
 								  `it` = shortLog(it)
 								  peer = shortLog(it.item)
 								  direction = toLowerAscii($it.kind)
-												Backfiller (#3263)

Backfilling is the process of downloading historical blocks via P2P that
are required to fulfill `GetBlocksByRange` duties - this happens during
both trusted node and finalized checkpoint syncs.

In particular, backfilling happens after syncing to head, such that
attestation work can start as soon as possible.

* Fix SyncQueue initialization procedure.
Remove usage of `awaitne`.
Add cancellation support.
Remove unneeded `sleepAsync()` if peer's head is older than needed.
Add `direction` field to all logs.
Fix syncmanager wedge issue.
Add proper resource cleaning procedure on backward sync finish.

Co-authored-by: cheatfate <eugene.kabanov@status.im>
											
										
										
											2022-01-20 07:25:45 +00:00
-												Backward sync support for SyncManager. (#3131)

* Unbundle SyncQueue from sync_manager.nim.
Unbundle Peer scores constants to peer_scores.nim.
Add Forward/Backward enum.

* Further improvements and tests.

* Adopt getRewindPoint() and fix MissingParent handler.

* Remove unused procedures.
Refactor `result` usage.
Fix resetWait().

* Add all the tests and fix the issue with rewind point.

* Fix get() issue.

* Fix flaky tests.

* test fixes

Co-authored-by: Jacek Sieka <jacek@status.im>
											
										
										
											2021-12-08 21:15:29 +00:00
+								proc getShortMap*[T](req: SyncRequest[T],
-												harden and speed up block sync (#3358)

* harden and speed up block sync

The `GetBlockBy*` server implementation currently reads SSZ bytes from
database, deserializes them into a Nim object then serializes them right
back to SSZ - here, we eliminate the deser/ser steps and send the bytes
straight to the network. Unfortunately, the snappy recoding must still
be done because of differences in framing.

Also, the quota system makes one giant request for quota right before
sending all blocks - this means that a 1024 block request will be
"paused" for a long time, then all blocks will be sent at once causing a
spike in database reads which potentially will see the reading client
time out before any block is sent.

Finally, on the reading side we make several copies of blocks as they
travel through various queues - this was not noticeable before but
becomes a problem in two cases: bellatrix blocks are up to 10mb (instead
of .. 30-40kb) and when backfilling, we process a lot more of them a lot
faster.

* fix status comparisons for nodes syncing from genesis (#3327 was a bit
too hard)
* don't hit database at all for post-altair slots in GetBlock v1
requests
											
										
										
											2022-02-07 17:20:10 +00:00
+								                     data: openArray[ref ForkedSignedBeaconBlock]): string =
-												Backward sync support for SyncManager. (#3131)

* Unbundle SyncQueue from sync_manager.nim.
Unbundle Peer scores constants to peer_scores.nim.
Add Forward/Backward enum.

* Further improvements and tests.

* Adopt getRewindPoint() and fix MissingParent handler.

* Remove unused procedures.
Refactor `result` usage.
Fix resetWait().

* Add all the tests and fix the issue with rewind point.

* Fix get() issue.

* Fix flaky tests.

* test fixes

Co-authored-by: Jacek Sieka <jacek@status.im>
											
										
										
											2021-12-08 21:15:29 +00:00
+								  ## Returns all slot numbers in ``data`` as placement map.
 								  var res = newStringOfCap(req.count)
 								  var slider = req.slot
 								  var last = 0
 								  for i in 0 ..< req.count:
 								    if last < len(data):
 								      for k in last ..< len(data):
-												harden and speed up block sync (#3358)

* harden and speed up block sync

The `GetBlockBy*` server implementation currently reads SSZ bytes from
database, deserializes them into a Nim object then serializes them right
back to SSZ - here, we eliminate the deser/ser steps and send the bytes
straight to the network. Unfortunately, the snappy recoding must still
be done because of differences in framing.

Also, the quota system makes one giant request for quota right before
sending all blocks - this means that a 1024 block request will be
"paused" for a long time, then all blocks will be sent at once causing a
spike in database reads which potentially will see the reading client
time out before any block is sent.

Finally, on the reading side we make several copies of blocks as they
travel through various queues - this was not noticeable before but
becomes a problem in two cases: bellatrix blocks are up to 10mb (instead
of .. 30-40kb) and when backfilling, we process a lot more of them a lot
faster.

* fix status comparisons for nodes syncing from genesis (#3327 was a bit
too hard)
* don't hit database at all for post-altair slots in GetBlock v1
requests
											
										
										
											2022-02-07 17:20:10 +00:00
+								        if slider == data[k][].slot:
-												Backward sync support for SyncManager. (#3131)

* Unbundle SyncQueue from sync_manager.nim.
Unbundle Peer scores constants to peer_scores.nim.
Add Forward/Backward enum.

* Further improvements and tests.

* Adopt getRewindPoint() and fix MissingParent handler.

* Remove unused procedures.
Refactor `result` usage.
Fix resetWait().

* Add all the tests and fix the issue with rewind point.

* Fix get() issue.

* Fix flaky tests.

* test fixes

Co-authored-by: Jacek Sieka <jacek@status.im>
											
										
										
											2021-12-08 21:15:29 +00:00
+								          res.add('x')
 								          last = k + 1
 								          break
-												harden and speed up block sync (#3358)

* harden and speed up block sync

The `GetBlockBy*` server implementation currently reads SSZ bytes from
database, deserializes them into a Nim object then serializes them right
back to SSZ - here, we eliminate the deser/ser steps and send the bytes
straight to the network. Unfortunately, the snappy recoding must still
be done because of differences in framing.

Also, the quota system makes one giant request for quota right before
sending all blocks - this means that a 1024 block request will be
"paused" for a long time, then all blocks will be sent at once causing a
spike in database reads which potentially will see the reading client
time out before any block is sent.

Finally, on the reading side we make several copies of blocks as they
travel through various queues - this was not noticeable before but
becomes a problem in two cases: bellatrix blocks are up to 10mb (instead
of .. 30-40kb) and when backfilling, we process a lot more of them a lot
faster.

* fix status comparisons for nodes syncing from genesis (#3327 was a bit
too hard)
* don't hit database at all for post-altair slots in GetBlock v1
requests
											
										
										
											2022-02-07 17:20:10 +00:00
+								        elif slider < data[k][].slot:
-												Backward sync support for SyncManager. (#3131)

* Unbundle SyncQueue from sync_manager.nim.
Unbundle Peer scores constants to peer_scores.nim.
Add Forward/Backward enum.

* Further improvements and tests.

* Adopt getRewindPoint() and fix MissingParent handler.

* Remove unused procedures.
Refactor `result` usage.
Fix resetWait().

* Add all the tests and fix the issue with rewind point.

* Fix get() issue.

* Fix flaky tests.

* test fixes

Co-authored-by: Jacek Sieka <jacek@status.im>
											
										
										
											2021-12-08 21:15:29 +00:00
+								          res.add('.')
 								          break
 								    else:
 								      res.add('.')
 								    slider = slider + req.step
 								  res
 								proc contains*[T](req: SyncRequest[T], slot: Slot): bool {.inline.} =
 								  ## Returns ``true`` when ``slot`` is one of the slots addressed by ``req``:
 								  ## it must lie in ``[req.slot, req.slot + req.count * req.step)`` and be a
 								  ## whole number of ``req.step`` strides away from ``req.slot``.
 								  if slot < req.slot:
 								    return false
 								  let upperBound = req.slot + req.count * req.step
 								  if slot >= upperBound:
 								    return false
 								  # Reaching this point implies ``req.count * req.step`` is non-zero, so
 								  # ``req.step`` cannot be zero in the modulo below.
 								  (slot - req.slot) mod req.step == 0
 								proc cmp*[T](a, b: SyncRequest[T]): int =
 								  ## Three-way comparison of two requests by the numeric value of their
 								  ## starting slot only; no other field is looked at.
 								  let
 								    lhs = uint64(a.slot)
 								    rhs = uint64(b.slot)
 								  cmp(lhs, rhs)
 								proc checkResponse*[T](req: SyncRequest[T],
-												harden and speed up block sync (#3358)

* harden and speed up block sync

The `GetBlockBy*` server implementation currently reads SSZ bytes from
database, deserializes them into a Nim object then serializes them right
back to SSZ - here, we eliminate the deser/ser steps and send the bytes
straight to the network. Unfortunately, the snappy recoding must still
be done because of differences in framing.

Also, the quota system makes one giant request for quota right before
sending all blocks - this means that a 1024 block request will be
"paused" for a long time, then all blocks will be sent at once causing a
spike in database reads which potentially will see the reading client
time out before any block is sent.

Finally, on the reading side we make several copies of blocks as they
travel through various queues - this was not noticeable before but
becomes a problem in two cases: bellatrix blocks are up to 10mb (instead
of .. 30-40kb) and when backfilling, we process a lot more of them a lot
faster.

* fix status comparisons for nodes syncing from genesis (#3327 was a bit
too hard)
* don't hit database at all for post-altair slots in GetBlock v1
requests
											
										
										
											2022-02-07 17:20:10 +00:00
+								                       data: openArray[ref ForkedSignedBeaconBlock]): bool =
-												Backward sync support for SyncManager. (#3131)

* Unbundle SyncQueue from sync_manager.nim.
Unbundle Peer scores constants to peer_scores.nim.
Add Forward/Backward enum.

* Further improvements and tests.

* Adopt getRewindPoint() and fix MissingParent handler.

* Remove unused procedures.
Refactor `result` usage.
Fix resetWait().

* Add all the tests and fix the issue with rewind point.

* Fix get() issue.

* Fix flaky tests.

* test fixes

Co-authored-by: Jacek Sieka <jacek@status.im>
											
										
										
											2021-12-08 21:15:29 +00:00
+								  if len(data) == 0:
 								    # Impossible to verify empty response.
 								    return true
 								  if uint64(len(data)) > req.count:
 								    # Number of blocks in response should be less or equal to number of
 								    # requested blocks.
 								    return false
 								  var slot = req.slot
 								  var rindex = 0'u64
 								  var dindex = 0
 								  while (rindex < req.count) and (dindex < len(data)):
-												harden and speed up block sync (#3358)

* harden and speed up block sync

The `GetBlockBy*` server implementation currently reads SSZ bytes from
database, deserializes them into a Nim object then serializes them right
back to SSZ - here, we eliminate the deser/ser steps and send the bytes
straight to the network. Unfortunately, the snappy recoding must still
be done because of differences in framing.

Also, the quota system makes one giant request for quota right before
sending all blocks - this means that a 1024 block request will be
"paused" for a long time, then all blocks will be sent at once causing a
spike in database reads which potentially will see the reading client
time out before any block is sent.

Finally, on the reading side we make several copies of blocks as they
travel through various queues - this was not noticeable before but
becomes a problem in two cases: bellatrix blocks are up to 10mb (instead
of .. 30-40kb) and when backfilling, we process a lot more of them a lot
faster.

* fix status comparisons for nodes syncing from genesis (#3327 was a bit
too hard)
* don't hit database at all for post-altair slots in GetBlock v1
requests
											
										
										
											2022-02-07 17:20:10 +00:00
+								    if slot < data[dindex][].slot:
-												Backward sync support for SyncManager. (#3131)

* Unbundle SyncQueue from sync_manager.nim.
Unbundle Peer scores constants to peer_scores.nim.
Add Forward/Backward enum.

* Further improvements and tests.

* Adopt getRewindPoint() and fix MissingParent handler.

* Remove unused procedures.
Refactor `result` usage.
Fix resetWait().

* Add all the tests and fix the issue with rewind point.

* Fix get() issue.

* Fix flaky tests.

* test fixes

Co-authored-by: Jacek Sieka <jacek@status.im>
											
										
										
											2021-12-08 21:15:29 +00:00
+								      discard
-												harden and speed up block sync (#3358)

* harden and speed up block sync

The `GetBlockBy*` server implementation currently reads SSZ bytes from
database, deserializes them into a Nim object then serializes them right
back to SSZ - here, we eliminate the deser/ser steps and send the bytes
straight to the network. Unfortunately, the snappy recoding must still
be done because of differences in framing.

Also, the quota system makes one giant request for quota right before
sending all blocks - this means that a 1024 block request will be
"paused" for a long time, then all blocks will be sent at once causing a
spike in database reads which potentially will see the reading client
time out before any block is sent.

Finally, on the reading side we make several copies of blocks as they
travel through various queues - this was not noticeable before but
becomes a problem in two cases: bellatrix blocks are up to 10mb (instead
of .. 30-40kb) and when backfilling, we process a lot more of them a lot
faster.

* fix status comparisons for nodes syncing from genesis (#3327 was a bit
too hard)
* don't hit database at all for post-altair slots in GetBlock v1
requests
											
										
										
											2022-02-07 17:20:10 +00:00
+								    elif slot == data[dindex][].slot:
-												Backward sync support for SyncManager. (#3131)

* Unbundle SyncQueue from sync_manager.nim.
Unbundle Peer scores constants to peer_scores.nim.
Add Forward/Backward enum.

* Further improvements and tests.

* Adopt getRewindPoint() and fix MissingParent handler.

* Remove unused procedures.
Refactor `result` usage.
Fix resetWait().

* Add all the tests and fix the issue with rewind point.

* Fix get() issue.

* Fix flaky tests.

* test fixes

Co-authored-by: Jacek Sieka <jacek@status.im>
											
										
										
											2021-12-08 21:15:29 +00:00
+								      inc(dindex)
 								    else:
 								      return false
 								    slot = slot + req.step
 								    rindex = rindex + 1'u64
 								  if dindex == len(data):
 								    return true
 								  else:
 								    return false
 								proc getFullMap*[T](req: SyncRequest[T],
 								                    data: openArray[ForkedSignedBeaconBlock]): string =
 								  ## Returns the slot numbers of all blocks in ``data`` as a comma-delimited
 								  ## string (separator is ``", "``). ``req`` is not inspected.
 								  # The slot is read through the ``slot`` accessor, the same way the other
 								  # procedures in this file read it from a forked block.
 								  mapIt(data, $(it.slot)).join(", ")
 								proc init[T](t1: typedesc[SyncRequest], kind: SyncQueueKind, start: Slot,
 								             finish: Slot, t2: typedesc[T]): SyncRequest[T] =
 								  ## Builds a request of the given ``kind`` that starts at ``start``, uses a
 								  ## step of one and has ``finish - start + 1`` slots. ``item`` is not set.
 								  SyncRequest[T](
 								    kind: kind,
 								    slot: start,
 								    count: finish - start + 1'u64,
 								    step: 1'u64
 								  )
 								proc init[T](t1: typedesc[SyncRequest], kind: SyncQueueKind, slot: Slot,
 								             count: uint64, item: T): SyncRequest[T] =
 								  ## Builds a request of the given ``kind`` for ``count`` slots beginning at
 								  ## ``slot`` with a step of one, carrying ``item``.
 								  SyncRequest[T](
 								    kind: kind,
 								    slot: slot,
 								    count: count,
 								    step: 1'u64,
 								    item: item
 								  )
 								proc init[T](t1: typedesc[SyncRequest], kind: SyncQueueKind, start: Slot,
 								             finish: Slot, item: T): SyncRequest[T] =
 								  ## Builds a request of the given ``kind`` that starts at ``start``, uses a
 								  ## step of one, has ``finish - start + 1`` slots and carries ``item``.
 								  SyncRequest[T](
 								    kind: kind,
 								    slot: start,
 								    count: finish - start + 1'u64,
 								    step: 1'u64,
 								    item: item
 								  )
 								proc empty*[T](t: typedesc[SyncRequest], kind: SyncQueueKind,
 								               t2: typedesc[T]): SyncRequest[T] {.inline.} =
 								  ## Returns a request of the given ``kind`` whose ``count`` and ``step`` are
 								  ## both zero; all remaining fields keep their default values.
 								  SyncRequest[T](
 								    kind: kind,
 								    count: 0'u64,
 								    step: 0'u64
 								  )
 								proc setItem*[T](sr: var SyncRequest[T], item: T) =
 								  ## Stores ``item`` in the ``item`` field of ``sr``, replacing whatever
 								  ## value was there before.
 								  sr.item = item
 								proc isEmpty*[T](sr: SyncRequest[T]): bool {.inline.} =
 								  ## ``true`` only when both ``step`` and ``count`` of ``sr`` are zero.
 								  not((sr.step != 0'u64) or (sr.count != 0'u64))
 								proc init*[T](t1: typedesc[SyncQueue], t2: typedesc[T],
 								              queueKind: SyncQueueKind,
 								              start, final: Slot, chunkSize: uint64,
 								              getSafeSlotCb: GetSlotCallback,
-												SyncManager cleanups for backfill support (#3189)

* SyncManager cleanups for backfill support

Cleanups, fixes and simplifications, in anticipation of backfill support
for the `SyncManager`:

* reformat sync progress indicator to show time left and % done more
prominently:
  * old: `sync="sPssPsssss:2:2.4229:00h57m (2706898)"`
  * new: `sync="14d12h31m (0.52%) 1.1378slots/s (wQQQQQDDQQ:1287520)"`
* reset average speed when going out of sync
* pass all block errors to sync manager, including duplicate/unviable
* penalize peers for reporting a head block that is outside of our
expected wall clock time (they're likely on a different network or
trying to disrupt sync)
* remove `SyncFailureKind` (unused)
* remove `inRange` (unused)
* add `Q` for sync queue requests that are in the `SyncQueue` but not
yet in the `BlockProcessor` queue
* update last slot in `SyncQueue` after getting peer status
* fix race condition between `wakeupWaiters` and `resetWait`, where
workers would not be correctly reset if block verification returned a
completed future without event loop
* log syncmanager direction

* Fix ordering issue.
Some of the requests size of which are not equal to `chunkSize` could be processed in wrong order which could lead to sync process freezes.

Co-authored-by: cheatfate <eugene.kabanov@status.im>
											
										
										
											2021-12-16 14:57:16 +00:00
+								              blockVerifier: BlockVerifier,
-												Refactor and optimize sync logs. (#3451)

* Refactor and optimize logs.

* Introduce shortLog(SyncRequest).

* Address review comment.

* make sync queue logs more consistent

Adds a few minor logging improvements:
- Fixes a typo (`was happened` -> `has happened`)
- Avoids passing `reset_slot` argument to log statement multiple times
- Uses same `rewind_to_slot` label when logging in both sync directions
- Consistent rewind point logging

Co-authored-by: cheatfate <eugene.kabanov@status.im>
											
										
										
											2022-03-03 08:05:33 +00:00
+								              syncQueueSize: int = -1,
 								              ident: string = "main"): SyncQueue[T] =
-												Backward sync support for SyncManager. (#3131)

* Unbundle SyncQueue from sync_manager.nim.
Unbundle Peer scores constants to peer_scores.nim.
Add Forward/Backward enum.

* Further improvements and tests.

* Adopt getRewindPoint() and fix MissingParent handler.

* Remove unused procedures.
Refactor `result` usage.
Fix resetWait().

* Add all the tests and fix the issue with rewind point.

* Fix get() issue.

* Fix flaky tests.

* test fixes

Co-authored-by: Jacek Sieka <jacek@status.im>
											
										
										
											2021-12-08 21:15:29 +00:00
+								  ## Create new synchronization queue with parameters
 								  ##
 								  ## ``start`` and ``final`` are starting and finishing Slots.
 								  ##
 								  ## ``chunkSize`` maximum number of slots in one request.
 								  ##
 								  ## ``syncQueueSize`` maximum queue size for incoming data.
 								  ## If ``syncQueueSize > 0`` queue will help to keep backpressure under
 								  ## control. If ``syncQueueSize <= 0`` then queue size is unlimited (default).
 								  # SyncQueue is the core of sync manager, this data structure distributes
 								  # requests to peers and manages responses from peers.
 								  #
 								  # Because SyncQueue is async data structure it manages backpressure and
 								  # order of incoming responses and it also resolves "joker's" problem.
 								  #
 								  # Joker's problem
 								  #
 								  # According to current Ethereum2 network specification
 								  # > Clients MUST respond with at least one block, if they have it and it
 								  # > exists in the range. Clients MAY limit the number of blocks in the
 								  # > response.
 								  #
 								  # Such a rule can lead to very uncertain responses. For example, suppose
 								  # slots 10 to 12 are all non-empty. A client which follows the specification
 								  # can answer with any response from this list (X - block, `-` empty space):
 								  #
 								  # 1.   X X X
 								  # 2.   - - X
 								  # 3.   - X -
 								  # 4.   - X X
 								  # 5.   X - -
 								  # 6.   X - X
 								  # 7.   X X -
 								  #
 								  # If peer answers with `1` everything will be fine and `block_pool` will be
 								  # able to process all 3 blocks. In case of `2`, `3`, `4`, `6` - `block_pool`
 								  # will fail immediately with chunk and report "parent is missing" error.
 								  # But in case of `5` and `7` blocks will be processed by `block_pool` without
 								  # any problems, however it will start producing problems right from this
 								  # uncertain last slot. SyncQueue will start producing requests for next
 								  # blocks, but all the responses from this point will fail with "parent is
 								  # missing" error. Lets call such peers "jokers", because they are joking
 								  # with responses.
 								  #
 								  # To fix the "joker" problem we are going to perform a rollback to the
 								  # latest finalized epoch's first slot.
 								  doAssert(chunkSize > 0'u64, "Chunk size should not be zero")
 								  SyncQueue[T](
 								    kind: queueKind,
 								    startSlot: start,
 								    finalSlot: final,
 								    chunkSize: chunkSize,
 								    queueSize: syncQueueSize,
 								    getSafeSlot: getSafeSlotCb,
-												SyncManager cleanups for backfill support (#3189)

* SyncManager cleanups for backfill support

Cleanups, fixes and simplifications, in anticipation of backfill support
for the `SyncManager`:

* reformat sync progress indicator to show time left and % done more
prominently:
  * old: `sync="sPssPsssss:2:2.4229:00h57m (2706898)"`
  * new: `sync="14d12h31m (0.52%) 1.1378slots/s (wQQQQQDDQQ:1287520)"`
* reset average speed when going out of sync
* pass all block errors to sync manager, including duplicate/unviable
* penalize peers for reporting a head block that is outside of our
expected wall clock time (they're likely on a different network or
trying to disrupt sync)
* remove `SyncFailureKind` (unused)
* remove `inRange` (unused)
* add `Q` for sync queue requests that are in the `SyncQueue` but not
yet in the `BlockProcessor` queue
* update last slot in `SyncQueue` after getting peer status
* fix race condition between `wakeupWaiters` and `resetWait`, where
workers would not be correctly reset if block verification returned a
completed future without event loop
* log syncmanager direction

* Fix ordering issue.
Some of the requests size of which are not equal to `chunkSize` could be processed in wrong order which could lead to sync process freezes.

Co-authored-by: cheatfate <eugene.kabanov@status.im>
											
										
										
											2021-12-16 14:57:16 +00:00
+								    waiters: newSeq[SyncWaiter](),
-												Backward sync support for SyncManager. (#3131)

* Unbundle SyncQueue from sync_manager.nim.
Unbundle Peer scores constants to peer_scores.nim.
Add Forward/Backward enum.

* Further improvements and tests.

* Adopt getRewindPoint() and fix MissingParent handler.

* Remove unused procedures.
Refactor `result` usage.
Fix resetWait().

* Add all the tests and fix the issue with rewind point.

* Fix get() issue.

* Fix flaky tests.

* test fixes

Co-authored-by: Jacek Sieka <jacek@status.im>
											
										
										
											2021-12-08 21:15:29 +00:00
+								    counter: 1'u64,
 								    pending: initTable[uint64, SyncRequest[T]](),
 								    debtsQueue: initHeapQueue[SyncRequest[T]](),
 								    inpSlot: start,
 								    outSlot: start,
-												Refactor and optimize sync logs. (#3451)

* Refactor and optimize logs.

* Introduce shortLog(SyncRequest).

* Address review comment.

* make sync queue logs more consistent

Adds a few minor logging improvements:
- Fixes a typo (`was happened` -> `has happened`)
- Avoids passing `reset_slot` argument to log statement multiple times
- Uses same `rewind_to_slot` label when logging in both sync directions
- Consistent rewind point logging

Co-authored-by: cheatfate <eugene.kabanov@status.im>
											
										
										
											2022-03-03 08:05:33 +00:00
+								    blockVerifier: blockVerifier,
 								    ident: ident
-												Backward sync support for SyncManager. (#3131)

* Unbundle SyncQueue from sync_manager.nim.
Unbundle Peer scores constants to peer_scores.nim.
Add Forward/Backward enum.

* Further improvements and tests.

* Adopt getRewindPoint() and fix MissingParent handler.

* Remove unused procedures.
Refactor `result` usage.
Fix resetWait().

* Add all the tests and fix the issue with rewind point.

* Fix get() issue.

* Fix flaky tests.

* test fixes

Co-authored-by: Jacek Sieka <jacek@status.im>
											
										
										
											2021-12-08 21:15:29 +00:00
+								  )
 								proc `<`*[T](a, b: SyncRequest[T]): bool =
 								  ## Orders two requests of the same ``kind`` (asserted) by starting slot:
 								  ## for ``Forward`` the lower slot sorts first, for ``Backward`` the higher
 								  ## slot sorts first.
 								  doAssert(a.kind == b.kind)
 								  let (lhs, rhs) = (a.slot, b.slot)
 								  case a.kind
 								  of SyncQueueKind.Backward:
 								    rhs < lhs
 								  of SyncQueueKind.Forward:
 								    lhs < rhs
 								proc `<`*[T](a, b: SyncResult[T]): bool =
 								  ## Orders two results by the starting slot of their requests, which must
 								  ## be of the same ``kind`` (asserted): ascending for ``Forward``,
 								  ## descending for ``Backward``.
 								  doAssert(a.request.kind == b.request.kind)
 								  let (lhs, rhs) = (a.request.slot, b.request.slot)
 								  case a.request.kind
 								  of SyncQueueKind.Backward:
 								    rhs < lhs
 								  of SyncQueueKind.Forward:
 								    lhs < rhs
 								proc `==`*[T](a, b: SyncRequest[T]): bool =
 								  ## Two requests are equal when their ``kind``, ``slot``, ``count`` and
 								  ## ``step`` all match; ``index`` and ``item`` are not compared.
 								  if a.kind != b.kind:
 								    return false
 								  if a.slot != b.slot:
 								    return false
 								  if a.count != b.count:
 								    return false
 								  a.step == b.step
 								proc lastSlot*[T](req: SyncRequest[T]): Slot =
 								  ## Returns ``req.slot + req.count - 1``, the last slot of request ``req``.
 								  # NOTE(review): ``req.step`` is not used here, so this presumably assumes
 								  # a step of one - confirm against callers.
 								  let pastEnd = req.slot + req.count
 								  pastEnd - 1'u64
 								proc makePending*[T](sq: SyncQueue[T], req: var SyncRequest[T]) =
 								  ## Assigns the queue's current ``counter`` value to ``req.index``, records
 								  ## ``req`` in ``sq.pending`` under that index and advances the counter by
 								  ## one.
 								  let assigned = sq.counter
 								  req.index = assigned
 								  sq.pending[assigned] = req
 								  sq.counter = assigned + 1'u64
 								proc updateLastSlot*[T](sq: SyncQueue[T], last: Slot) {.inline.} =
 								  ## Replaces ``sq.finalSlot`` with ``last``.
 								  ##
 								  ## The new value is asserted to move in the queue's direction: it must
 								  ## not be lower than the stored one for ``Forward`` queues and not higher
 								  ## for ``Backward`` queues.
 								  case sq.kind
 								  of SyncQueueKind.Backward:
 								    doAssert(sq.finalSlot >= last,
 								             "Last slot could not be higher then stored one " &
 								             $sq.finalSlot & " >= " & $last)
 								  of SyncQueueKind.Forward:
 								    doAssert(sq.finalSlot <= last,
 								             "Last slot could not be lower then stored one " &
 								             $sq.finalSlot & " <= " & $last)
 								  # Both directions store the value once their assertion has passed.
 								  sq.finalSlot = last
-												SyncManager cleanups for backfill support (#3189)

* SyncManager cleanups for backfill support

Cleanups, fixes and simplifications, in anticipation of backfill support
for the `SyncManager`:

* reformat sync progress indicator to show time left and % done more
prominently:
  * old: `sync="sPssPsssss:2:2.4229:00h57m (2706898)"`
  * new: `sync="14d12h31m (0.52%) 1.1378slots/s (wQQQQQDDQQ:1287520)"`
* reset average speed when going out of sync
* pass all block errors to sync manager, including duplicate/unviable
* penalize peers for reporting a head block that is outside of our
expected wall clock time (they're likely on a different network or
trying to disrupt sync)
* remove `SyncFailureKind` (unused)
* remove `inRange` (unused)
* add `Q` for sync queue requests that are in the `SyncQueue` but not
yet in the `BlockProcessor` queue
* update last slot in `SyncQueue` after getting peer status
* fix race condition between `wakeupWaiters` and `resetWait`, where
workers would not be correctly reset if block verification returned a
completed future without event loop
* log syncmanager direction

* Fix ordering issue.
Some of the requests size of which are not equal to `chunkSize` could be processed in wrong order which could lead to sync process freezes.

Co-authored-by: cheatfate <eugene.kabanov@status.im>
											
										
										
											2021-12-16 14:57:16 +00:00
+								proc wakeupWaiters[T](sq: SyncQueue[T], reset = false) =
-												Backward sync support for SyncManager. (#3131)

* Unbundle SyncQueue from sync_manager.nim.
Unbundle Peer scores constants to peer_scores.nim.
Add Forward/Backward enum.

* Further improvements and tests.

* Adopt getRewindPoint() and fix MissingParent handler.

* Remove unused procedures.
Refactor `result` usage.
Fix resetWait().

* Add all the tests and fix the issue with rewind point.

* Fix get() issue.

* Fix flaky tests.

* test fixes

Co-authored-by: Jacek Sieka <jacek@status.im>
											
										
										
											2021-12-08 21:15:29 +00:00
+								  ## Wakeup one or all blocked waiters.
 								  for item in sq.waiters:
-												SyncManager cleanups for backfill support (#3189)

* SyncManager cleanups for backfill support

Cleanups, fixes and simplifications, in anticipation of backfill support
for the `SyncManager`:

* reformat sync progress indicator to show time left and % done more
prominently:
  * old: `sync="sPssPsssss:2:2.4229:00h57m (2706898)"`
  * new: `sync="14d12h31m (0.52%) 1.1378slots/s (wQQQQQDDQQ:1287520)"`
* reset average speed when going out of sync
* pass all block errors to sync manager, including duplicate/unviable
* penalize peers for reporting a head block that is outside of our
expected wall clock time (they're likely on a different network or
trying to disrupt sync)
* remove `SyncFailureKind` (unused)
* remove `inRange` (unused)
* add `Q` for sync queue requests that are in the `SyncQueue` but not
yet in the `BlockProcessor` queue
* update last slot in `SyncQueue` after getting peer status
* fix race condition between `wakeupWaiters` and `resetWait`, where
workers would not be correctly reset if block verification returned a
completed future without event loop
* log syncmanager direction

* Fix ordering issue.
Some of the requests size of which are not equal to `chunkSize` could be processed in wrong order which could lead to sync process freezes.

Co-authored-by: cheatfate <eugene.kabanov@status.im>
											
										
										
											2021-12-16 14:57:16 +00:00
+								    if reset:
 								      item.reset = true
-												Backward sync support for SyncManager. (#3131)

* Unbundle SyncQueue from sync_manager.nim.
Unbundle Peer scores constants to peer_scores.nim.
Add Forward/Backward enum.

* Further improvements and tests.

* Adopt getRewindPoint() and fix MissingParent handler.

* Remove unused procedures.
Refactor `result` usage.
Fix resetWait().

* Add all the tests and fix the issue with rewind point.

* Fix get() issue.

* Fix flaky tests.

* test fixes

Co-authored-by: Jacek Sieka <jacek@status.im>
											
										
										
											2021-12-08 21:15:29 +00:00
+								    if not(item.future.finished()):
-												SyncManager cleanups for backfill support (#3189)

* SyncManager cleanups for backfill support

Cleanups, fixes and simplifications, in anticipation of backfill support
for the `SyncManager`:

* reformat sync progress indicator to show time left and % done more
prominently:
  * old: `sync="sPssPsssss:2:2.4229:00h57m (2706898)"`
  * new: `sync="14d12h31m (0.52%) 1.1378slots/s (wQQQQQDDQQ:1287520)"`
* reset average speed when going out of sync
* pass all block errors to sync manager, including duplicate/unviable
* penalize peers for reporting a head block that is outside of our
expected wall clock time (they're likely on a different network or
trying to disrupt sync)
* remove `SyncFailureKind` (unused)
* remove `inRange` (unused)
* add `Q` for sync queue requests that are in the `SyncQueue` but not
yet in the `BlockProcessor` queue
* update last slot in `SyncQueue` after getting peer status
* fix race condition between `wakeupWaiters` and `resetWait`, where
workers would not be correctly reset if block verification returned a
completed future without event loop
* log syncmanager direction

* Fix ordering issue.
Some of the requests size of which are not equal to `chunkSize` could be processed in wrong order which could lead to sync process freezes.

Co-authored-by: cheatfate <eugene.kabanov@status.im>
											
										
										
											2021-12-16 14:57:16 +00:00
+								      item.future.complete()
-												Backward sync support for SyncManager. (#3131)

* Unbundle SyncQueue from sync_manager.nim.
Unbundle Peer scores constants to peer_scores.nim.
Add Forward/Backward enum.

* Further improvements and tests.

* Adopt getRewindPoint() and fix MissingParent handler.

* Remove unused procedures.
Refactor `result` usage.
Fix resetWait().

* Add all the tests and fix the issue with rewind point.

* Fix get() issue.

* Fix flaky tests.

* test fixes

Co-authored-by: Jacek Sieka <jacek@status.im>
											
										
										
											2021-12-08 21:15:29 +00:00
-												SyncManager cleanups for backfill support (#3189)

* SyncManager cleanups for backfill support

Cleanups, fixes and simplifications, in anticipation of backfill support
for the `SyncManager`:

* reformat sync progress indicator to show time left and % done more
prominently:
  * old: `sync="sPssPsssss:2:2.4229:00h57m (2706898)"`
  * new: `sync="14d12h31m (0.52%) 1.1378slots/s (wQQQQQDDQQ:1287520)"`
* reset average speed when going out of sync
* pass all block errors to sync manager, including duplicate/unviable
* penalize peers for reporting a head block that is outside of our
expected wall clock time (they're likely on a different network or
trying to disrupt sync)
* remove `SyncFailureKind` (unused)
* remove `inRange` (unused)
* add `Q` for sync queue requests that are in the `SyncQueue` but not
yet in the `BlockProcessor` queue
* update last slot in `SyncQueue` after getting peer status
* fix race condition between `wakeupWaiters` and `resetWait`, where
workers would not be correctly reset if block verification returned a
completed future without event loop
* log syncmanager direction

* Fix ordering issue.
Some of the requests size of which are not equal to `chunkSize` could be processed in wrong order which could lead to sync process freezes.

Co-authored-by: cheatfate <eugene.kabanov@status.im>
											
										
										
											2021-12-16 14:57:16 +00:00
+								proc waitForChanges[T](sq: SyncQueue[T]): Future[bool] {.async.} =
-												Backward sync support for SyncManager. (#3131)

* Unbundle SyncQueue from sync_manager.nim.
Unbundle Peer scores constants to peer_scores.nim.
Add Forward/Backward enum.

* Further improvements and tests.

* Adopt getRewindPoint() and fix MissingParent handler.

* Remove unused procedures.
Refactor `result` usage.
Fix resetWait().

* Add all the tests and fix the issue with rewind point.

* Fix get() issue.

* Fix flaky tests.

* test fixes

Co-authored-by: Jacek Sieka <jacek@status.im>
											
										
										
											2021-12-08 21:15:29 +00:00
+								  ## Create new waiter and wait for completion from `wakeupWaiters()`.
-												SyncManager cleanups for backfill support (#3189)

* SyncManager cleanups for backfill support

Cleanups, fixes and simplifications, in anticipation of backfill support
for the `SyncManager`:

* reformat sync progress indicator to show time left and % done more
prominently:
  * old: `sync="sPssPsssss:2:2.4229:00h57m (2706898)"`
  * new: `sync="14d12h31m (0.52%) 1.1378slots/s (wQQQQQDDQQ:1287520)"`
* reset average speed when going out of sync
* pass all block errors to sync manager, including duplicate/unviable
* penalize peers for reporting a head block that is outside of our
expected wall clock time (they're likely on a different network or
trying to disrupt sync)
* remove `SyncFailureKind` (unused)
* remove `inRange` (unused)
* add `Q` for sync queue requests that are in the `SyncQueue` but not
yet in the `BlockProcessor` queue
* update last slot in `SyncQueue` after getting peer status
* fix race condition between `wakeupWaiters` and `resetWait`, where
workers would not be correctly reset if block verification returned a
completed future without event loop
* log syncmanager direction

* Fix ordering issue.
Requests whose size is not equal to `chunkSize` could be processed in the wrong order, which could cause the sync process to freeze.

Co-authored-by: cheatfate <eugene.kabanov@status.im>
											
										
										
											2021-12-16 14:57:16 +00:00
+								  var waitfut = newFuture[void]("SyncQueue.waitForChanges")
 								  let waititem = SyncWaiter(future: waitfut)
-												Backward sync support for SyncManager. (#3131)

* Unbundle SyncQueue from sync_manager.nim.
Unbundle Peer scores constants to peer_scores.nim.
Add Forward/Backward enum.

* Further improvements and tests.

* Adopt getRewindPoint() and fix MissingParent handler.

* Remove unused procedures.
Refactor `result` usage.
Fix resetWait().

* Add all the tests and fix the issue with rewind point.

* Fix get() issue.

* Fix flaky tests.

* test fixes

Co-authored-by: Jacek Sieka <jacek@status.im>
											
										
										
											2021-12-08 21:15:29 +00:00
+								  sq.waiters.add(waititem)
 								  try:
-												SyncManager cleanups for backfill support (#3189)

* SyncManager cleanups for backfill support

Cleanups, fixes and simplifications, in anticipation of backfill support
for the `SyncManager`:

* reformat sync progress indicator to show time left and % done more
prominently:
  * old: `sync="sPssPsssss:2:2.4229:00h57m (2706898)"`
  * new: `sync="14d12h31m (0.52%) 1.1378slots/s (wQQQQQDDQQ:1287520)"`
* reset average speed when going out of sync
* pass all block errors to sync manager, including duplicate/unviable
* penalize peers for reporting a head block that is outside of our
expected wall clock time (they're likely on a different network or
trying to disrupt sync)
* remove `SyncFailureKind` (unused)
* remove `inRange` (unused)
* add `Q` for sync queue requests that are in the `SyncQueue` but not
yet in the `BlockProcessor` queue
* update last slot in `SyncQueue` after getting peer status
* fix race condition between `wakeupWaiters` and `resetWait`, where
workers would not be correctly reset if block verification returned a
completed future without event loop
* log syncmanager direction

* Fix ordering issue.
Requests whose size is not equal to `chunkSize` could be processed in the wrong order, which could cause the sync process to freeze.

Co-authored-by: cheatfate <eugene.kabanov@status.im>
											
										
										
											2021-12-16 14:57:16 +00:00
+								    await waitfut
 								    return waititem.reset
-												Backward sync support for SyncManager. (#3131)

* Unbundle SyncQueue from sync_manager.nim.
Unbundle Peer scores constants to peer_scores.nim.
Add Forward/Backward enum.

* Further improvements and tests.

* Adopt getRewindPoint() and fix MissingParent handler.

* Remove unused procedures.
Refactor `result` usage.
Fix resetWait().

* Add all the tests and fix the issue with rewind point.

* Fix get() issue.

* Fix flaky tests.

* test fixes

Co-authored-by: Jacek Sieka <jacek@status.im>
											
										
										
											2021-12-08 21:15:29 +00:00
+								  finally:
 								    sq.waiters.delete(sq.waiters.find(waititem))
 								proc wakeupAndWaitWaiters[T](sq: SyncQueue[T]) {.async.} =
 								  ## This procedure will perform wakeupWaiters(true) and block until the
 								  ## last waiter has been awakened.
-												SyncManager cleanups for backfill support (#3189)

* SyncManager cleanups for backfill support

Cleanups, fixes and simplifications, in anticipation of backfill support
for the `SyncManager`:

* reformat sync progress indicator to show time left and % done more
prominently:
  * old: `sync="sPssPsssss:2:2.4229:00h57m (2706898)"`
  * new: `sync="14d12h31m (0.52%) 1.1378slots/s (wQQQQQDDQQ:1287520)"`
* reset average speed when going out of sync
* pass all block errors to sync manager, including duplicate/unviable
* penalize peers for reporting a head block that is outside of our
expected wall clock time (they're likely on a different network or
trying to disrupt sync)
* remove `SyncFailureKind` (unused)
* remove `inRange` (unused)
* add `Q` for sync queue requests that are in the `SyncQueue` but not
yet in the `BlockProcessor` queue
* update last slot in `SyncQueue` after getting peer status
* fix race condition between `wakeupWaiters` and `resetWait`, where
workers would not be correctly reset if block verification returned a
completed future without event loop
* log syncmanager direction

* Fix ordering issue.
Requests whose size is not equal to `chunkSize` could be processed in the wrong order, which could cause the sync process to freeze.

Co-authored-by: cheatfate <eugene.kabanov@status.im>
											
										
										
											2021-12-16 14:57:16 +00:00
+								  var waitChanges = sq.waitForChanges()
 								  sq.wakeupWaiters(true)
-												Backward sync support for SyncManager. (#3131)

* Unbundle SyncQueue from sync_manager.nim.
Unbundle Peer scores constants to peer_scores.nim.
Add Forward/Backward enum.

* Further improvements and tests.

* Adopt getRewindPoint() and fix MissingParent handler.

* Remove unused procedures.
Refactor `result` usage.
Fix resetWait().

* Add all the tests and fix the issue with rewind point.

* Fix get() issue.

* Fix flaky tests.

* test fixes

Co-authored-by: Jacek Sieka <jacek@status.im>
											
										
										
											2021-12-08 21:15:29 +00:00
+								  discard await waitChanges
-												Backfiller (#3263)

Backfilling is the process of downloading historical blocks via P2P that
are required to fulfill `GetBlocksByRange` duties - this happens during
both trusted node and finalized checkpoint syncs.

In particular, backfilling happens after syncing to head, such that
attestation work can start as soon as possible.

* Fix SyncQueue initialization procedure.
Remove usage of `awaitne`.
Add cancellation support.
Remove unneeded `sleepAsync()` if peer's head is older than needed.
Add `direction` field to all logs.
Fix syncmanager wedge issue.
Add proper resource cleaning procedure on backward sync finish.

Co-authored-by: cheatfate <eugene.kabanov@status.im>
											
										
										
											2022-01-20 07:25:45 +00:00
+								proc clearAndWakeup*[T](sq: SyncQueue[T]) =
 								  ## Removes every entry from `sq.pending` and then calls
 								  ## `wakeupWaiters(true)`.
 								  ##
 								  ## NOTE(review): the `true` argument is presumably the "reset" flag that
 								  ## `waitForChanges()` hands back to its caller - confirm against
 								  ## `wakeupWaiters`.
 								  sq.pending.clear()
 								  sq.wakeupWaiters(true)
-												Backward sync support for SyncManager. (#3131)

* Unbundle SyncQueue from sync_manager.nim.
Unbundle Peer scores constants to peer_scores.nim.
Add Forward/Backward enum.

* Further improvements and tests.

* Adopt getRewindPoint() and fix MissingParent handler.

* Remove unused procedures.
Refactor `result` usage.
Fix resetWait().

* Add all the tests and fix the issue with rewind point.

* Fix get() issue.

* Fix flaky tests.

* test fixes

Co-authored-by: Jacek Sieka <jacek@status.im>
											
										
										
											2021-12-08 21:15:29 +00:00
+								proc resetWait*[T](sq: SyncQueue[T], toSlot: Option[Slot]) {.async.} =
 								  ## Perform reset of all the blocked waiters in SyncQueue.
 								  ##
 								  ## We add one more waiter to the waiters sequence and
-												SyncManager cleanups for backfill support (#3189)

* SyncManager cleanups for backfill support

Cleanups, fixes and simplifications, in anticipation of backfill support
for the `SyncManager`:

* reformat sync progress indicator to show time left and % done more
prominently:
  * old: `sync="sPssPsssss:2:2.4229:00h57m (2706898)"`
  * new: `sync="14d12h31m (0.52%) 1.1378slots/s (wQQQQQDDQQ:1287520)"`
* reset average speed when going out of sync
* pass all block errors to sync manager, including duplicate/unviable
* penalize peers for reporting a head block that is outside of our
expected wall clock time (they're likely on a different network or
trying to disrupt sync)
* remove `SyncFailureKind` (unused)
* remove `inRange` (unused)
* add `Q` for sync queue requests that are in the `SyncQueue` but not
yet in the `BlockProcessor` queue
* update last slot in `SyncQueue` after getting peer status
* fix race condition between `wakeupWaiters` and `resetWait`, where
workers would not be correctly reset if block verification returned a
completed future without event loop
* log syncmanager direction

* Fix ordering issue.
Requests whose size is not equal to `chunkSize` could be processed in the wrong order, which could cause the sync process to freeze.

Co-authored-by: cheatfate <eugene.kabanov@status.im>
											
										
										
											2021-12-16 14:57:16 +00:00
+								  ## call wakeupWaiters(true). Because our waiter is last in sequence of
-												Backward sync support for SyncManager. (#3131)

* Unbundle SyncQueue from sync_manager.nim.
Unbundle Peer scores constants to peer_scores.nim.
Add Forward/Backward enum.

* Further improvements and tests.

* Adopt getRewindPoint() and fix MissingParent handler.

* Remove unused procedures.
Refactor `result` usage.
Fix resetWait().

* Add all the tests and fix the issue with rewind point.

* Fix get() issue.

* Fix flaky tests.

* test fixes

Co-authored-by: Jacek Sieka <jacek@status.im>
											
										
										
											2021-12-08 21:15:29 +00:00
+								  ## waiters it will be resumed only after all waiters will be awakened and
 								  ## finished.
 								  # We clear the pending list so that all requests that are still running
 								  # around (still downloading, but not yet pushed to the SyncQueue) will be
 								  # expired. It's important to perform this call first (before await);
 								  # otherwise you can introduce a race condition.
 								  sq.pending.clear()
 								  # We calculate the minimal slot number to which we are able to reset
 								  # without missing any blocks. There are 3 sources:
 								  # 1. Debts queue.
 								  # 2. Processing queue (`inpSlot`, `outSlot`).
 								  # 3. Requested slot `toSlot`.
 								  #
 								  # Queue's `outSlot` is the lowest slot we added to `block_pool`, but
 								  # the `toSlot` slot can be less than `outSlot`. `debtsQueue` holds only
 								  # slot requests that were not added yet, so it can't be bigger than the
 								  # `outSlot` value.
 								  let minSlot =
 								    case sq.kind
 								    of SyncQueueKind.Forward:
 								      if toSlot.isSome():
 								        min(toSlot.get(), sq.outSlot)
 								      else:
 								        sq.outSlot
 								    of SyncQueueKind.Backward:
 								      if toSlot.isSome():
 								        toSlot.get()
 								      else:
 								        sq.outSlot
 								  sq.debtsQueue.clear()
 								  sq.debtsCount = 0
 								  sq.readyQueue.clear()
 								  sq.inpSlot = minSlot
 								  sq.outSlot = minSlot
 								  # We are going to wakeup all the waiters and wait for last one.
 								  await sq.wakeupAndWaitWaiters()
 								proc isEmpty*[T](sr: SyncResult[T]): bool {.inline.} =
 								  ## Reports whether the response carries no blocks at all, i.e. `sr.data`
 								  ## holds zero entries (every requested slot came back empty).
 								  sr.data.len == 0
 								proc hasEndGap*[T](sr: SyncResult[T]): bool {.inline.} =
 								  ## Returns ``true`` if response chain of blocks has gap at the end.
 								  let lastslot = sr.request.slot + sr.request.count - 1'u64
 								  if len(sr.data) == 0:
 								    return true
-												harden and speed up block sync (#3358)

* harden and speed up block sync

The `GetBlockBy*` server implementation currently reads SSZ bytes from
database, deserializes them into a Nim object then serializes them right
back to SSZ - here, we eliminate the deser/ser steps and send the bytes
straight to the network. Unfortunately, the snappy recoding must still
be done because of differences in framing.

Also, the quota system makes one giant request for quota right before
sending all blocks - this means that a 1024 block request will be
"paused" for a long time, then all blocks will be sent at once causing a
spike in database reads which potentially will see the reading client
time out before any block is sent.

Finally, on the reading side we make several copies of blocks as they
travel through various queues - this was not noticeable before but
becomes a problem in two cases: bellatrix blocks are up to 10mb (instead
of .. 30-40kb) and when backfilling, we process a lot more of them a lot
faster.

* fix status comparisons for nodes syncing from genesis (#3327 was a bit
too hard)
* don't hit database at all for post-altair slots in GetBlock v1
requests
											
										
										
											2022-02-07 17:20:10 +00:00
+								  if sr.data[^1][].slot != lastslot:
-												Backward sync support for SyncManager. (#3131)

* Unbundle SyncQueue from sync_manager.nim.
Unbundle Peer scores constants to peer_scores.nim.
Add Forward/Backward enum.

* Further improvements and tests.

* Adopt getRewindPoint() and fix MissingParent handler.

* Remove unused procedures.
Refactor `result` usage.
Fix resetWait().

* Add all the tests and fix the issue with rewind point.

* Fix get() issue.

* Fix flaky tests.

* test fixes

Co-authored-by: Jacek Sieka <jacek@status.im>
											
										
										
											2021-12-08 21:15:29 +00:00
+								    return true
 								  return false
 								proc getLastNonEmptySlot*[T](sr: SyncResult[T]): Slot {.inline.} =
 								  ## Returns last non-empty slot from result ``sr``. If response has only
 								  ## empty slots, original request slot will be returned.
 								  if len(sr.data) == 0:
 								    # If response has only empty slots we are going to use the original request slot
 								    sr.request.slot
 								  else:
-												harden and speed up block sync (#3358)

* harden and speed up block sync

The `GetBlockBy*` server implementation currently reads SSZ bytes from
database, deserializes them into a Nim object then serializes them right
back to SSZ - here, we eliminate the deser/ser steps and send the bytes
straight to the network. Unfortunately, the snappy recoding must still
be done because of differences in framing.

Also, the quota system makes one giant request for quota right before
sending all blocks - this means that a 1024 block request will be
"paused" for a long time, then all blocks will be sent at once causing a
spike in database reads which potentially will see the reading client
time out before any block is sent.

Finally, on the reading side we make several copies of blocks as they
travel through various queues - this was not noticeable before but
becomes a problem in two cases: bellatrix blocks are up to 10mb (instead
of .. 30-40kb) and when backfilling, we process a lot more of them a lot
faster.

* fix status comparisons for nodes syncing from genesis (#3327 was a bit
too hard)
* don't hit database at all for post-altair slots in GetBlock v1
requests
											
										
										
											2022-02-07 17:20:10 +00:00
+								    sr.data[^1][].slot
-												Backward sync support for SyncManager. (#3131)

* Unbundle SyncQueue from sync_manager.nim.
Unbundle Peer scores constants to peer_scores.nim.
Add Forward/Backward enum.

* Further improvements and tests.

* Adopt getRewindPoint() and fix MissingParent handler.

* Remove unused procedures.
Refactor `result` usage.
Fix resetWait().

* Add all the tests and fix the issue with rewind point.

* Fix get() issue.

* Fix flaky tests.

* test fixes

Co-authored-by: Jacek Sieka <jacek@status.im>
											
										
										
											2021-12-08 21:15:29 +00:00
 								proc toDebtsQueue[T](sq: SyncQueue[T], sr: SyncRequest[T]) =
 								  ## Pushes request `sr` onto the `debtsQueue` heap and grows the running
 								  ## `debtsCount` total by `sr.count`.
 								  sq.debtsQueue.push(sr)
 								  sq.debtsCount += sr.count
 								proc getRewindPoint*[T](sq: SyncQueue[T], failSlot: Slot,
 								                        safeSlot: Slot): Slot =
-												Refactor and optimize sync logs. (#3451)

* Refactor and optimize logs.

* Introduce shortLog(SyncRequest).

* Address review comment.

* make sync queue logs more consistent

Adds a few minor logging improvements:
- Fixes a typo (`was happened` -> `has happened`)
- Avoids passing `reset_slot` argument to log statement multiple times
- Uses same `rewind_to_slot` label when logging in both sync directions
- Consistent rewind point logging

Co-authored-by: cheatfate <eugene.kabanov@status.im>
											
										
										
											2022-03-03 08:05:33 +00:00
+								  logScope:
 								    sync_ident = sq.ident
 								    direction = sq.kind
 								    topics = "syncman"
-												Backward sync support for SyncManager. (#3131)

* Unbundle SyncQueue from sync_manager.nim.
Unbundle Peer scores constants to peer_scores.nim.
Add Forward/Backward enum.

* Further improvements and tests.

* Adopt getRewindPoint() and fix MissingParent handler.

* Remove unused procedures.
Refactor `result` usage.
Fix resetWait().

* Add all the tests and fix the issue with rewind point.

* Fix get() issue.

* Fix flaky tests.

* test fixes

Co-authored-by: Jacek Sieka <jacek@status.im>
											
										
										
											2021-12-08 21:15:29 +00:00
+								  case sq.kind
 								  of SyncQueueKind.Forward:
 								    # Calculate the latest finalized epoch.
-												time: spring cleaning (#3262)

Time in the beacon chain is expressed relative to the genesis time -
this PR creates a `beacon_time` module that collects helpers and
utilities for dealing with the time units - the new module does not deal with
actual wall time (that remains in `beacon_clock`).

Collecting the time related stuff in one place makes it easier to find,
avoids some circular imports and makes it easier to identify the code that
actually needs wall time to operate.

* move genesis-time-related functionality into `spec/beacon_time`
* avoid using `chronos.Duration` for time differences - it does not
support negative values (such as when something happens earlier than it
should)
* saturate conversions between `FAR_FUTURE_XXX`, so as to avoid
overflows
* fix delay reporting in validator client so it uses the expected
deadline of the slot, not "closest wall slot"
* simplify looping over the slots of an epoch
* `compute_start_slot_at_epoch` -> `start_slot`
* `compute_epoch_at_slot` -> `epoch`

A follow-up PR will (likely) introduce saturating arithmetic for the
time units - this is merely code moves, renames and fixing of small
bugs.
											
										
										
											2022-01-11 10:01:54 +00:00
+								    let finalizedEpoch = epoch(safeSlot)
-												Backward sync support for SyncManager. (#3131)

* Unbundle SyncQueue from sync_manager.nim.
Unbundle Peer scores constants to peer_scores.nim.
Add Forward/Backward enum.

* Further improvements and tests.

* Adopt getRewindPoint() and fix MissingParent handler.

* Remove unused procedures.
Refactor `result` usage.
Fix resetWait().

* Add all the tests and fix the issue with rewind point.

* Fix get() issue.

* Fix flaky tests.

* test fixes

Co-authored-by: Jacek Sieka <jacek@status.im>
											
										
										
											2021-12-08 21:15:29 +00:00
 								    # Calculate failure epoch.
-												time: spring cleaning (#3262)

Time in the beacon chain is expressed relative to the genesis time -
this PR creates a `beacon_time` module that collects helpers and
utilities for dealing with the time units - the new module does not deal with
actual wall time (that remains in `beacon_clock`).

Collecting the time related stuff in one place makes it easier to find,
avoids some circular imports and makes it easier to identify the code that
actually needs wall time to operate.

* move genesis-time-related functionality into `spec/beacon_time`
* avoid using `chronos.Duration` for time differences - it does not
support negative values (such as when something happens earlier than it
should)
* saturate conversions between `FAR_FUTURE_XXX`, so as to avoid
overflows
* fix delay reporting in validator client so it uses the expected
deadline of the slot, not "closest wall slot"
* simplify looping over the slots of an epoch
* `compute_start_slot_at_epoch` -> `start_slot`
* `compute_epoch_at_slot` -> `epoch`

A follow-up PR will (likely) introduce saturating arithmetic for the
time units - this is merely code moves, renames and fixing of small
bugs.
											
										
										
											2022-01-11 10:01:54 +00:00
+								    let failEpoch = epoch(failSlot)
-												Backward sync support for SyncManager. (#3131)

* Unbundle SyncQueue from sync_manager.nim.
Unbundle Peer scores constants to peer_scores.nim.
Add Forward/Backward enum.

* Further improvements and tests.

* Adopt getRewindPoint() and fix MissingParent handler.

* Remove unused procedures.
Refactor `result` usage.
Fix resetWait().

* Add all the tests and fix the issue with rewind point.

* Fix get() issue.

* Fix flaky tests.

* test fixes

Co-authored-by: Jacek Sieka <jacek@status.im>
											
										
										
											2021-12-08 21:15:29 +00:00
 								    # Calculate exponential rewind point in number of epochs.
 								    let epochCount =
 								      if sq.rewind.isSome():
 								        let rewind = sq.rewind.get()
 								        if failSlot == rewind.failSlot:
 								          # `MissingParent` happened at the same slot, so we increase the rewind
 								          # point by a factor of 2.
 								          if failEpoch > finalizedEpoch:
 								            let rewindPoint = rewind.epochCount shl 1
 								            if rewindPoint < rewind.epochCount:
 								              # If exponential rewind point produces `uint64` overflow we will
 								              # make rewind to latest finalized epoch.
 								              failEpoch - finalizedEpoch
 								            else:
 								              if (failEpoch < rewindPoint) or
 								                 (failEpoch - rewindPoint < finalizedEpoch):
 								                # If exponential rewind point points to position which is far
 								                # behind latest finalized epoch.
 								                failEpoch - finalizedEpoch
 								              else:
 								                rewindPoint
 								          else:
 								            warn "Trying to rewind over the last finalized epoch",
 								                 finalized_slot = safeSlot, fail_slot = failSlot,
 								                 finalized_epoch = finalizedEpoch, fail_epoch = failEpoch,
 								                 rewind_epoch_count = rewind.epochCount,
-												Refactor and optimize sync logs. (#3451)

* Refactor and optimize logs.

* Introduce shortLog(SyncRequest).

* Address review comment.

* make sync queue logs more consistent

Adds a few minor logging improvements:
- Fixes a typo (`was happened` -> `has happened`)
- Avoids passing `reset_slot` argument to log statement multiple times
- Uses same `rewind_to_slot` label when logging in both sync directions
- Consistent rewind point logging

Co-authored-by: cheatfate <eugene.kabanov@status.im>
											
										
										
											2022-03-03 08:05:33 +00:00
+								                 finalized_epoch = finalizedEpoch
-												Backward sync support for SyncManager. (#3131)

* Unbundle SyncQueue from sync_manager.nim.
Unbundle Peer scores constants to peer_scores.nim.
Add Forward/Backward enum.

* Further improvements and tests.

* Adopt getRewindPoint() and fix MissingParent handler.

* Remove unused procedures.
Refactor `result` usage.
Fix resetWait().

* Add all the tests and fix the issue with rewind point.

* Fix get() issue.

* Fix flaky tests.

* test fixes

Co-authored-by: Jacek Sieka <jacek@status.im>
											
										
										
											2021-12-08 21:15:29 +00:00
+'u64
 								        else:
 								          # `MissingParent` happened at a different slot, so we are going to
 								          # rewind by 1 epoch only.
 								          if (failEpoch < 1'u64) or (failEpoch - 1'u64 < finalizedEpoch):
 								            warn "Сould not rewind further than the last finalized epoch",
 								                 finalized_slot = safeSlot, fail_slot = failSlot,
 								                 finalized_epoch = finalizedEpoch, fail_epoch = failEpoch,
 								                 rewind_epoch_count = rewind.epochCount,
-												Refactor and optimize sync logs. (#3451)

* Refactor and optimize logs.

* Introduce shortLog(SyncRequest).

* Address review comment.

* make sync queue logs more consistent

Adds a few minor logging improvements:
- Fixes a typo (`was happened` -> `has happened`)
- Avoids passing `reset_slot` argument to log statement multiple times
- Uses same `rewind_to_slot` label when logging in both sync directions
- Consistent rewind point logging

Co-authored-by: cheatfate <eugene.kabanov@status.im>
											
										
										
											2022-03-03 08:05:33 +00:00
+								                 finalized_epoch = finalizedEpoch
-												Backward sync support for SyncManager. (#3131)

* Unbundle SyncQueue from sync_manager.nim.
Unbundle Peer scores constants to peer_scores.nim.
Add Forward/Backward enum.

* Further improvements and tests.

* Adopt getRewindPoint() and fix MissingParent handler.

* Remove unused procedures.
Refactor `result` usage.
Fix resetWait().

* Add all the tests and fix the issue with rewind point.

* Fix get() issue.

* Fix flaky tests.

* test fixes

Co-authored-by: Jacek Sieka <jacek@status.im>
											
										
										
											2021-12-08 21:15:29 +00:00
+'u64
 								          else:
 'u64
 								      else:
 								        # `MissingParent` happened first time.
 								        if (failEpoch < 1'u64) or (failEpoch - 1'u64 < finalizedEpoch):
 								          warn "Сould not rewind further than the last finalized epoch",
 								               finalized_slot = safeSlot, fail_slot = failSlot,
 								               finalized_epoch = finalizedEpoch, fail_epoch = failEpoch,
-												Refactor and optimize sync logs. (#3451)

* Refactor and optimize logs.

* Introduce shortLog(SyncRequest).

* Address review comment.

* make sync queue logs more consistent

Adds a few minor logging improvements:
- Fixes a typo (`was happened` -> `has happened`)
- Avoids passing `reset_slot` argument to log statement multiple times
- Uses same `rewind_to_slot` label when logging in both sync directions
- Consistent rewind point logging

Co-authored-by: cheatfate <eugene.kabanov@status.im>
											
										
										
											2022-03-03 08:05:33 +00:00
+								               finalized_epoch = finalizedEpoch
-												Backward sync support for SyncManager. (#3131)

* Unbundle SyncQueue from sync_manager.nim.
Unbundle Peer scores constants to peer_scores.nim.
Add Forward/Backward enum.

* Further improvements and tests.

* Adopt getRewindPoint() and fix MissingParent handler.

* Remove unused procedures.
Refactor `result` usage.
Fix resetWait().

* Add all the tests and fix the issue with rewind point.

* Fix get() issue.

* Fix flaky tests.

* test fixes

Co-authored-by: Jacek Sieka <jacek@status.im>
											
										
										
											2021-12-08 21:15:29 +00:00
+'u64
 								        else:
 'u64
 								    if epochCount == 0'u64:
 								      warn "Unable to continue syncing, please restart the node",
 								           finalized_slot = safeSlot, fail_slot = failSlot,
 								           finalized_epoch = finalizedEpoch, fail_epoch = failEpoch,
-												Refactor and optimize sync logs. (#3451)

* Refactor and optimize logs.

* Introduce shortLog(SyncRequest).

* Address review comment.

* make sync queue logs more consistent

Adds a few minor logging improvements:
- Fixes a typo (`was happened` -> `has happened`)
- Avoids passing `reset_slot` argument to log statement multiple times
- Uses same `rewind_to_slot` label when logging in both sync directions
- Consistent rewind point logging

Co-authored-by: cheatfate <eugene.kabanov@status.im>
											
										
										
											2022-03-03 08:05:33 +00:00
+								           finalized_epoch = finalizedEpoch
-												Backward sync support for SyncManager. (#3131)

* Unbundle SyncQueue from sync_manager.nim.
Unbundle Peer scores constants to peer_scores.nim.
Add Forward/Backward enum.

* Further improvements and tests.

* Adopt getRewindPoint() and fix MissingParent handler.

* Remove unused procedures.
Refactor `result` usage.
Fix resetWait().

* Add all the tests and fix the issue with rewind point.

* Fix get() issue.

* Fix flaky tests.

* test fixes

Co-authored-by: Jacek Sieka <jacek@status.im>
											
										
										
											2021-12-08 21:15:29 +00:00
+								      # Calculate the rewind epoch, which will be equal to last rewind point or
 								      # finalizedEpoch
 								      let rewindEpoch =
 								        if sq.rewind.isNone():
 								          finalizedEpoch
 								        else:
-												time: spring cleaning (#3262)

Time in the beacon chain is expressed relative to the genesis time -
this PR creates a `beacon_time` module that collects helpers and
utilities for dealing with the time units - the new module does not deal with
actual wall time (that remains in `beacon_clock`).

Collecting the time related stuff in one place makes it easier to find,
avoids some circular imports and makes it easier to identify the code that
actually needs wall time to operate.

* move genesis-time-related functionality into `spec/beacon_time`
* avoid using `chronos.Duration` for time differences - it does not
support negative values (such as when something happens earlier than it
should)
* saturate conversions between `FAR_FUTURE_XXX`, so as to avoid
overflows
* fix delay reporting in validator client so it uses the expected
deadline of the slot, not "closest wall slot"
* simplify looping over the slots of an epoch
* `compute_start_slot_at_epoch` -> `start_slot`
* `compute_epoch_at_slot` -> `epoch`

A follow-up PR will (likely) introduce saturating arithmetic for the
time units - this is merely code moves, renames and fixing of small
bugs.
											
										
										
											2022-01-11 10:01:54 +00:00
+								          epoch(sq.rewind.get().failSlot) - sq.rewind.get().epochCount
 								      rewindEpoch.start_slot()
-												Backward sync support for SyncManager. (#3131)

* Unbundle SyncQueue from sync_manager.nim.
Unbundle Peer scores constants to peer_scores.nim.
Add Forward/Backward enum.

* Further improvements and tests.

* Adopt getRewindPoint() and fix MissingParent handler.

* Remove unused procedures.
Refactor `result` usage.
Fix resetWait().

* Add all the tests and fix the issue with rewind point.

* Fix get() issue.

* Fix flaky tests.

* test fixes

Co-authored-by: Jacek Sieka <jacek@status.im>
											
										
										
											2021-12-08 21:15:29 +00:00
+								    else:
 								      # Calculate the rewind epoch, which should not be less than the latest
 								      # finalized epoch.
 								      let rewindEpoch = failEpoch - epochCount
 								      # Update and save new rewind point in SyncQueue.
 								      sq.rewind = some(RewindPoint(failSlot: failSlot, epochCount: epochCount))
-												time: spring cleaning (#3262)

Time in the beacon chain is expressed relative to the genesis time -
this PR creates a `beacon_time` module that collects helpers and
utilities for dealing with the time units - the new module does not deal with
actual wall time (that remains in `beacon_clock`).

Collecting the time-related stuff in one place makes it easier to find,
avoids some circular imports and allows more easily identifying the code
that actually needs wall time to operate.

* move genesis-time-related functionality into `spec/beacon_time`
* avoid using `chronos.Duration` for time differences - it does not
support negative values (such as when something happens earlier than it
should)
* saturate conversions between `FAR_FUTURE_XXX`, so as to avoid
overflows
* fix delay reporting in validator client so it uses the expected
deadline of the slot, not "closest wall slot"
* simplify looping over the slots of an epoch
* `compute_start_slot_at_epoch` -> `start_slot`
* `compute_epoch_at_slot` -> `epoch`

A follow-up PR will (likely) introduce saturating arithmetic for the
time units - this is merely code moves, renames and fixing of small
bugs.
											
										
										
											2022-01-11 10:01:54 +00:00
+								      rewindEpoch.start_slot()
-												Backward sync support for SyncManager. (#3131)

* Unbundle SyncQueue from sync_manager.nim.
Unbundle Peer scores constants to peer_scores.nim.
Add Forward/Backward enum.

* Further improvements and tests.

* Adopt getRewindPoint() and fix MissingParent handler.

* Remove unused procedures.
Refactor `result` usage.
Fix resetWait().

* Add all the tests and fix the issue with rewind point.

* Fix get() issue.

* Fix flaky tests.

* test fixes

Co-authored-by: Jacek Sieka <jacek@status.im>
											
										
										
											2021-12-08 21:15:29 +00:00
+								  of SyncQueueKind.Backward:
 								    # While we perform backward sync, the only possible slot we could rewind is
 								    # latest stored block.
 								    if failSlot == safeSlot:
 								      warn "Unable to continue syncing, please restart the node",
-												Refactor and optimize sync logs. (#3451)

* Refactor and optimize logs.

* Introduce shortLog(SyncRequest).

* Address review comment.

* make sync queue logs more consistent

Adds a few minor logging improvements:
- Fixes a typo (`was happened` -> `has happened`)
- Avoids passing `reset_slot` argument to log statement multiple times
- Uses same `rewind_to_slot` label when logging in both sync directions
- Consistent rewind point logging

Co-authored-by: cheatfate <eugene.kabanov@status.im>
											
										
										
											2022-03-03 08:05:33 +00:00
+								           safe_slot = safeSlot, fail_slot = failSlot
-												Backward sync support for SyncManager. (#3131)

* Unbundle SyncQueue from sync_manager.nim.
Unbundle Peer scores constants to peer_scores.nim.
Add Forward/Backward enum.

* Further improvements and tests.

* Adopt getRewindPoint() and fix MissingParent handler.

* Remove unused procedures.
Refactor `result` usage.
Fix resetWait().

* Add all the tests and fix the issue with rewind point.

* Fix get() issue.

* Fix flaky tests.

* test fixes

Co-authored-by: Jacek Sieka <jacek@status.im>
											
										
										
											2021-12-08 21:15:29 +00:00
+								    safeSlot
 								iterator blocks*[T](sq: SyncQueue[T],
-												harden and speed up block sync (#3358)

* harden and speed up block sync

The `GetBlockBy*` server implementation currently reads SSZ bytes from
database, deserializes them into a Nim object then serializes them right
back to SSZ - here, we eliminate the deser/ser steps and send the bytes
straight to the network. Unfortunately, the snappy recoding must still
be done because of differences in framing.

Also, the quota system makes one giant request for quota right before
sending all blocks - this means that a 1024 block request will be
"paused" for a long time, then all blocks will be sent at once causing a
spike in database reads which potentially will see the reading client
time out before any block is sent.

Finally, on the reading side we make several copies of blocks as they
travel through various queues - this was not noticeable before but
becomes a problem in two cases: bellatrix blocks are up to 10mb (instead
of .. 30-40kb) and when backfilling, we process a lot more of them a lot
faster.

* fix status comparisons for nodes syncing from genesis (#3327 was a bit
too hard)
* don't hit database at all for post-altair slots in GetBlock v1
requests
											
										
										
											2022-02-07 17:20:10 +00:00
+								                    sr: SyncResult[T]): ref ForkedSignedBeaconBlock =
-												Backward sync support for SyncManager. (#3131)

* Unbundle SyncQueue from sync_manager.nim.
Unbundle Peer scores constants to peer_scores.nim.
Add Forward/Backward enum.

* Further improvements and tests.

* Adopt getRewindPoint() and fix MissingParent handler.

* Remove unused procedures.
Refactor `result` usage.
Fix resetWait().

* Add all the tests and fix the issue with rewind point.

* Fix get() issue.

* Fix flaky tests.

* test fixes

Co-authored-by: Jacek Sieka <jacek@status.im>
											
										
										
											2021-12-08 21:15:29 +00:00
+								  case sq.kind
 								  of SyncQueueKind.Forward:
 								    for i in countup(0, len(sr.data) - 1):
 								      yield sr.data[i]
 								  of SyncQueueKind.Backward:
 								    for i in countdown(len(sr.data) - 1, 0):
 								      yield sr.data[i]
 								proc advanceOutput*[T](sq: SyncQueue[T], number: uint64) =
 								  ## Move the queue's output slot (`outSlot`) by `number` slots in the
 								  ## direction of the sync: added for forward sync, subtracted for
 								  ## backward sync.
 								  sq.outSlot =
 								    case sq.kind
 								    of SyncQueueKind.Forward: sq.outSlot + number
 								    of SyncQueueKind.Backward: sq.outSlot - number
 								proc advanceInput[T](sq: SyncQueue[T], number: uint64) =
 								  ## Move the queue's input slot (`inpSlot`) by `number` slots in the
 								  ## direction of the sync: added for forward sync, subtracted for
 								  ## backward sync.
 								  sq.inpSlot =
 								    case sq.kind
 								    of SyncQueueKind.Forward: sq.inpSlot + number
 								    of SyncQueueKind.Backward: sq.inpSlot - number
-												SyncManager cleanups for backfill support (#3189)

* SyncManager cleanups for backfill support

Cleanups, fixes and simplifications, in anticipation of backfill support
for the `SyncManager`:

* reformat sync progress indicator to show time left and % done more
prominently:
  * old: `sync="sPssPsssss:2:2.4229:00h57m (2706898)"`
  * new: `sync="14d12h31m (0.52%) 1.1378slots/s (wQQQQQDDQQ:1287520)"`
* reset average speed when going out of sync
* pass all block errors to sync manager, including duplicate/unviable
* penalize peers for reporting a head block that is outside of our
expected wall clock time (they're likely on a different network or
trying to disrupt sync)
* remove `SyncFailureKind` (unused)
* remove `inRange` (unused)
* add `Q` for sync queue requests that are in the `SyncQueue` but not
yet in the `BlockProcessor` queue
* update last slot in `SyncQueue` after getting peer status
* fix race condition between `wakeupWaiters` and `resetWait`, where
workers would not be correctly reset if block verification returned a
completed future without event loop
* log syncmanager direction

* Fix ordering issue.
Requests whose size is not equal to `chunkSize` could be processed in the wrong order, which could cause the sync process to freeze.

Co-authored-by: cheatfate <eugene.kabanov@status.im>
											
										
										
											2021-12-16 14:57:16 +00:00
+								proc notInRange[T](sq: SyncQueue[T], sr: SyncRequest[T]): bool =
-												Backward sync support for SyncManager. (#3131)

* Unbundle SyncQueue from sync_manager.nim.
Unbundle Peer scores constants to peer_scores.nim.
Add Forward/Backward enum.

* Further improvements and tests.

* Adopt getRewindPoint() and fix MissingParent handler.

* Remove unused procedures.
Refactor `result` usage.
Fix resetWait().

* Add all the tests and fix the issue with rewind point.

* Fix get() issue.

* Fix flaky tests.

* test fixes

Co-authored-by: Jacek Sieka <jacek@status.im>
											
										
										
											2021-12-08 21:15:29 +00:00
+								  case sq.kind
 								  of SyncQueueKind.Forward:
-												avoid re-requesting finalized blocks during sync (#3461)

When a `beaconBlocksByRange` response advances the `safeSlot`, but later
has errors, the sync queue keeps repeating that same request until it is
fulfilled without errors. Data up through `safeSlot` is considered to be
immutable, i.e., finalized, so re-requesting that data is not useful.
By advancing the sync progress in that scenario, those redundant query
portions can be avoided. Note, the finalized block _itself_ is always
requested, even in the initial request. This behaviour is kept the same.
											
										
										
											2022-03-15 17:56:56 +00:00
+								    (sq.queueSize > 0) and (sr.slot > sq.outSlot)
-												Backward sync support for SyncManager. (#3131)

* Unbundle SyncQueue from sync_manager.nim.
Unbundle Peer scores constants to peer_scores.nim.
Add Forward/Backward enum.

* Further improvements and tests.

* Adopt getRewindPoint() and fix MissingParent handler.

* Remove unused procedures.
Refactor `result` usage.
Fix resetWait().

* Add all the tests and fix the issue with rewind point.

* Fix get() issue.

* Fix flaky tests.

* test fixes

Co-authored-by: Jacek Sieka <jacek@status.im>
											
										
										
											2021-12-08 21:15:29 +00:00
+								  of SyncQueueKind.Backward:
-												avoid re-requesting finalized blocks during sync (#3461)

When a `beaconBlocksByRange` response advances the `safeSlot`, but later
has errors, the sync queue keeps repeating that same request until it is
fulfilled without errors. Data up through `safeSlot` is considered to be
immutable, i.e., finalized, so re-requesting that data is not useful.
By advancing the sync progress in that scenario, those redundant query
portions can be avoided. Note, the finalized block _itself_ is always
requested, even in the initial request. This behaviour is kept the same.
											
										
										
											2022-03-15 17:56:56 +00:00
+								    (sq.queueSize > 0) and (sr.lastSlot < sq.outSlot)
 								func numAlreadyKnownSlots[T](sq: SyncQueue[T], sr: SyncRequest[T]): uint64 =
 								  ## Compute the number of slots covered by a given `SyncRequest` that are
 								  ## already known and, hence, no longer relevant for sync progression.
 								  ##
 								  ## The result is `sr.count` when the whole request lies behind
 								  ## `sq.outSlot` (in sync direction), the size of the overlap when only
 								  ## part of it does, and `0` when the entire request is still relevant.
 								  let
 								    outSlot = sq.outSlot
 								    lowSlot = sr.slot
 								    highSlot = sr.lastSlot
 								  case sq.kind
 								  of SyncQueueKind.Forward:
 								    if outSlot > highSlot:
 								      # Entire request is no longer relevant.
 								      sr.count
 								    elif outSlot > lowSlot:
 								      # Request is only partially relevant.
 								      outSlot - lowSlot
 								    else:
 								      # Entire request is still relevant.
 								      0'u64
 								  of SyncQueueKind.Backward:
 								    if lowSlot > outSlot:
 								      # Entire request is no longer relevant.
 								      sr.count
 								    elif highSlot > outSlot:
 								      # Request is only partially relevant.
 								      highSlot - outSlot
 								    else:
 								      # Entire request is still relevant.
 								      0'u64
-												Backward sync support for SyncManager. (#3131)

* Unbundle SyncQueue from sync_manager.nim.
Unbundle Peer scores constants to peer_scores.nim.
Add Forward/Backward enum.

* Further improvements and tests.

* Adopt getRewindPoint() and fix MissingParent handler.

* Remove unused procedures.
Refactor `result` usage.
Fix resetWait().

* Add all the tests and fix the issue with rewind point.

* Fix get() issue.

* Fix flaky tests.

* test fixes

Co-authored-by: Jacek Sieka <jacek@status.im>
											
										
										
											2021-12-08 21:15:29 +00:00
 								proc push*[T](sq: SyncQueue[T], sr: SyncRequest[T],
-												harden and speed up block sync (#3358)

* harden and speed up block sync

The `GetBlockBy*` server implementation currently reads SSZ bytes from
database, deserializes them into a Nim object then serializes them right
back to SSZ - here, we eliminate the deser/ser steps and send the bytes
straight to the network. Unfortunately, the snappy recoding must still
be done because of differences in framing.

Also, the quota system makes one giant request for quota right before
sending all blocks - this means that a 1024 block request will be
"paused" for a long time, then all blocks will be sent at once causing a
spike in database reads which potentially will see the reading client
time out before any block is sent.

Finally, on the reading side we make several copies of blocks as they
travel through various queues - this was not noticeable before but
becomes a problem in two cases: bellatrix blocks are up to 10mb (instead
of .. 30-40kb) and when backfilling, we process a lot more of them a lot
faster.

* fix status comparisons for nodes syncing from genesis (#3327 was a bit
too hard)
* don't hit database at all for post-altair slots in GetBlock v1
requests
											
										
										
											2022-02-07 17:20:10 +00:00
+								              data: seq[ref ForkedSignedBeaconBlock],
-												SyncManager cleanups for backfill support (#3189)

* SyncManager cleanups for backfill support

Cleanups, fixes and simplifications, in anticipation of backfill support
for the `SyncManager`:

* reformat sync progress indicator to show time left and % done more
prominently:
  * old: `sync="sPssPsssss:2:2.4229:00h57m (2706898)"`
  * new: `sync="14d12h31m (0.52%) 1.1378slots/s (wQQQQQDDQQ:1287520)"`
* reset average speed when going out of sync
* pass all block errors to sync manager, including duplicate/unviable
* penalize peers for reporting a head block that is outside of our
expected wall clock time (they're likely on a different network or
trying to disrupt sync)
* remove `SyncFailureKind` (unused)
* remove `inRange` (unused)
* add `Q` for sync queue requests that are in the `SyncQueue` but not
yet in the `BlockProcessor` queue
* update last slot in `SyncQueue` after getting peer status
* fix race condition between `wakeupWaiters` and `resetWait`, where
workers would not be correctly reset if block verification returned a
completed future without event loop
* log syncmanager direction

* Fix ordering issue.
Requests whose size is not equal to `chunkSize` could be processed in the wrong order, which could cause the sync process to freeze.

Co-authored-by: cheatfate <eugene.kabanov@status.im>
											
										
										
											2021-12-16 14:57:16 +00:00
+								              processingCb: ProcessingCallback = nil) {.async.} =
-												Refactor and optimize sync logs. (#3451)

* Refactor and optimize logs.

* Introduce shortLog(SyncRequest).

* Address review comment.

* make sync queue logs more consistent

Adds a few minor logging improvements:
- Fixes a typo (`was happened` -> `has happened`)
- Avoids passing `reset_slot` argument to log statement multiple times
- Uses same `rewind_to_slot` label when logging in both sync directions
- Consistent rewind point logging

Co-authored-by: cheatfate <eugene.kabanov@status.im>
											
										
										
											2022-03-03 08:05:33 +00:00
+								  logScope:
 								    sync_ident = sq.ident
 								    topics = "syncman"
-												Backward sync support for SyncManager. (#3131)

* Unbundle SyncQueue from sync_manager.nim.
Unbundle Peer scores constants to peer_scores.nim.
Add Forward/Backward enum.

* Further improvements and tests.

* Adopt getRewindPoint() and fix MissingParent handler.

* Remove unused procedures.
Refactor `result` usage.
Fix resetWait().

* Add all the tests and fix the issue with rewind point.

* Fix get() issue.

* Fix flaky tests.

* test fixes

Co-authored-by: Jacek Sieka <jacek@status.im>
											
										
										
											2021-12-08 21:15:29 +00:00
+								  ## Push successful result to queue ``sq``.
 								  mixin updateScore
 								  if sr.index notin sq.pending:
 								    # If request `sr` not in our pending list, it only means that
 								    # SyncQueue.resetWait() happens and all pending requests are expired, so
 								    # we swallow `old` requests, and in such way sync-workers are able to get
 								    # proper new requests from SyncQueue.
 								    return
 								  sq.pending.del(sr.index)
 								  # This is backpressure handling algorithm, this algorithm is blocking
-												SyncManager cleanups for backfill support (#3189)

* SyncManager cleanups for backfill support

Cleanups, fixes and simplifications, in anticipation of backfill support
for the `SyncManager`:

* reformat sync progress indicator to show time left and % done more
prominently:
  * old: `sync="sPssPsssss:2:2.4229:00h57m (2706898)"`
  * new: `sync="14d12h31m (0.52%) 1.1378slots/s (wQQQQQDDQQ:1287520)"`
* reset average speed when going out of sync
* pass all block errors to sync manager, including duplicate/unviable
* penalize peers for reporting a head block that is outside of our
expected wall clock time (they're likely on a different network or
trying to disrupt sync)
* remove `SyncFailureKind` (unused)
* remove `inRange` (unused)
* add `Q` for sync queue requests that are in the `SyncQueue` but not
yet in the `BlockProcessor` queue
* update last slot in `SyncQueue` after getting peer status
* fix race condition between `wakeupWaiters` and `resetWait`, where
workers would not be correctly reset if block verification returned a
completed future without event loop
* log syncmanager direction

* Fix ordering issue.
Requests whose size is not equal to `chunkSize` could be processed in the wrong order, which could cause the sync process to freeze.

Co-authored-by: cheatfate <eugene.kabanov@status.im>
											
										
										
											2021-12-16 14:57:16 +00:00
+								  # all pending `push` requests if `request.slot` not in range.
-												Backward sync support for SyncManager. (#3131)

* Unbundle SyncQueue from sync_manager.nim.
Unbundle Peer scores constants to peer_scores.nim.
Add Forward/Backward enum.

* Further improvements and tests.

* Adopt getRewindPoint() and fix MissingParent handler.

* Remove unused procedures.
Refactor `result` usage.
Fix resetWait().

* Add all the tests and fix the issue with rewind point.

* Fix get() issue.

* Fix flaky tests.

* test fixes

Co-authored-by: Jacek Sieka <jacek@status.im>
											
										
										
											2021-12-08 21:15:29 +00:00
+								  while true:
-												SyncManager cleanups for backfill support (#3189)

* SyncManager cleanups for backfill support

Cleanups, fixes and simplifications, in anticipation of backfill support
for the `SyncManager`:

* reformat sync progress indicator to show time left and % done more
prominently:
  * old: `sync="sPssPsssss:2:2.4229:00h57m (2706898)"`
  * new: `sync="14d12h31m (0.52%) 1.1378slots/s (wQQQQQDDQQ:1287520)"`
* reset average speed when going out of sync
* pass all block errors to sync manager, including duplicate/unviable
* penalize peers for reporting a head block that is outside of our
expected wall clock time (they're likely on a different network or
trying to disrupt sync)
* remove `SyncFailureKind` (unused)
* remove `inRange` (unused)
* add `Q` for sync queue requests that are in the `SyncQueue` but not
yet in the `BlockProcessor` queue
* update last slot in `SyncQueue` after getting peer status
* fix race condition between `wakeupWaiters` and `resetWait`, where
workers would not be correctly reset if block verification returned a
completed future without event loop
* log syncmanager direction

* Fix ordering issue.
Requests whose size is not equal to `chunkSize` could be processed in the wrong order, which could cause the sync process to freeze.

Co-authored-by: cheatfate <eugene.kabanov@status.im>
											
										
										
											2021-12-16 14:57:16 +00:00
+								    if sq.notInRange(sr):
 								      let reset = await sq.waitForChanges()
 								      if reset:
-												Backward sync support for SyncManager. (#3131)

* Unbundle SyncQueue from sync_manager.nim.
Unbundle Peer scores constants to peer_scores.nim.
Add Forward/Backward enum.

* Further improvements and tests.

* Adopt getRewindPoint() and fix MissingParent handler.

* Remove unused procedures.
Refactor `result` usage.
Fix resetWait().

* Add all the tests and fix the issue with rewind point.

* Fix get() issue.

* Fix flaky tests.

* test fixes

Co-authored-by: Jacek Sieka <jacek@status.im>
											
										
										
											2021-12-08 21:15:29 +00:00
+								        # SyncQueue reset happens. We are exiting to wake up sync-worker.
-												SyncManager cleanups for backfill support (#3189)

* SyncManager cleanups for backfill support

Cleanups, fixes and simplifications, in anticipation of backfill support
for the `SyncManager`:

* reformat sync progress indicator to show time left and % done more
prominently:
  * old: `sync="sPssPsssss:2:2.4229:00h57m (2706898)"`
  * new: `sync="14d12h31m (0.52%) 1.1378slots/s (wQQQQQDDQQ:1287520)"`
* reset average speed when going out of sync
* pass all block errors to sync manager, including duplicate/unviable
* penalize peers for reporting a head block that is outside of our
expected wall clock time (they're likely on a different network or
trying to disrupt sync)
* remove `SyncFailureKind` (unused)
* remove `inRange` (unused)
* add `Q` for sync queue requests that are in the `SyncQueue` but not
yet in the `BlockProcessor` queue
* update last slot in `SyncQueue` after getting peer status
* fix race condition between `wakeupWaiters` and `resetWait`, where
workers would not be correctly reset if block verification returned a
completed future without event loop
* log syncmanager direction

* Fix ordering issue.
Requests whose size is not equal to `chunkSize` could be processed in the wrong order, which could cause the sync process to freeze.

Co-authored-by: cheatfate <eugene.kabanov@status.im>
											
										
										
											2021-12-16 14:57:16 +00:00
+								        return
 								    else:
 								      let syncres = SyncResult[T](request: sr, data: data)
 								      sq.readyQueue.push(syncres)
 								      break
-												Backward sync support for SyncManager. (#3131)

* Unbundle SyncQueue from sync_manager.nim.
Unbundle Peer scores constants to peer_scores.nim.
Add Forward/Backward enum.

* Further improvements and tests.

* Adopt getRewindPoint() and fix MissingParent handler.

* Remove unused procedures.
Refactor `result` usage.
Fix resetWait().

* Add all the tests and fix the issue with rewind point.

* Fix get() issue.

* Fix flaky tests.

* test fixes

Co-authored-by: Jacek Sieka <jacek@status.im>
											
										
										
											2021-12-08 21:15:29 +00:00
 								  while len(sq.readyQueue) > 0:
 								    let reqres =
 								      case sq.kind
 								      of SyncQueueKind.Forward:
 								        let minSlot = sq.readyQueue[0].request.slot
-												avoid re-requesting finalized blocks during sync (#3461)

When a `beaconBlocksByRange` response advances the `safeSlot`, but later
has errors, the sync queue keeps repeating that same request until it is
fulfilled without errors. Data up through `safeSlot` is considered to be
immutable, i.e., finalized, so re-requesting that data is not useful.
By advancing the sync progress in that scenario, those redundant query
portions can be avoided. Note, the finalized block _itself_ is always
requested, even in the initial request. This behaviour is kept the same.
											
										
										
											2022-03-15 17:56:56 +00:00
+								        if sq.outSlot < minSlot:
-												Backward sync support for SyncManager. (#3131)

* Unbundle SyncQueue from sync_manager.nim.
Unbundle Peer scores constants to peer_scores.nim.
Add Forward/Backward enum.

* Further improvements and tests.

* Adopt getRewindPoint() and fix MissingParent handler.

* Remove unused procedures.
Refactor `result` usage.
Fix resetWait().

* Add all the tests and fix the issue with rewind point.

* Fix get() issue.

* Fix flaky tests.

* test fixes

Co-authored-by: Jacek Sieka <jacek@status.im>
											
										
										
											2021-12-08 21:15:29 +00:00
+								          none[SyncResult[T]]()
 								        else:
 								          some(sq.readyQueue.pop())
 								      of SyncQueueKind.Backward:
-												enable `styleCheck:usages` (#3573)

Some upstream repos still need fixes, but this gets us close enough that
style hints can be enabled by default.

In general, "canonical" spellings are preferred even if they violate
nep-1 - this applies in particular to spec-related stuff like
`genesis_validators_root` which appears throughout the codebase.

											
										
										
											2022-04-08 16:22:49 +00:00
+								        let maxslot = sq.readyQueue[0].request.slot +
-												Backward sync support for SyncManager. (#3131)

* Unbundle SyncQueue from sync_manager.nim.
Unbundle Peer scores constants to peer_scores.nim.
Add Forward/Backward enum.

* Further improvements and tests.

* Adopt getRewindPoint() and fix MissingParent handler.

* Remove unused procedures.
Refactor `result` usage.
Fix resetWait().

* Add all the tests and fix the issue with rewind point.

* Fix get() issue.

* Fix flaky tests.

* test fixes

Co-authored-by: Jacek Sieka <jacek@status.im>
											
										
										
											2021-12-08 21:15:29 +00:00
+								                      (sq.readyQueue[0].request.count - 1'u64)
-												enable `styleCheck:usages` (#3573)

Some upstream repos still need fixes, but this gets us close enough that
style hints can be enabled by default.

In general, "canonical" spellings are preferred even if they violate
nep-1 - this applies in particular to spec-related stuff like
`genesis_validators_root` which appears throughout the codebase.

											
										
										
											2022-04-08 16:22:49 +00:00
+								        if sq.outSlot > maxslot:
-												Backward sync support for SyncManager. (#3131)

* Unbundle SyncQueue from sync_manager.nim.
Unbundle Peer scores constants to peer_scores.nim.
Add Forward/Backward enum.

* Further improvements and tests.

* Adopt getRewindPoint() and fix MissingParent handler.

* Remove unused procedures.
Refactor `result` usage.
Fix resetWait().

* Add all the tests and fix the issue with rewind point.

* Fix get() issue.

* Fix flaky tests.

* test fixes

Co-authored-by: Jacek Sieka <jacek@status.im>
											
										
										
											2021-12-08 21:15:29 +00:00
+								          none[SyncResult[T]]()
 								        else:
 								          some(sq.readyQueue.pop())
 								    let item =
 								      if reqres.isSome():
 								        reqres.get()
 								      else:
 								        let rewindSlot = sq.getRewindPoint(sq.outSlot, sq.getSafeSlot())
 								        warn "Got incorrect sync result in queue, rewind happens",
 								             blocks_map = getShortMap(sq.readyQueue[0].request,
 								                                      sq.readyQueue[0].data),
 								             blocks_count = len(sq.readyQueue[0].data),
 								             output_slot = sq.outSlot, input_slot = sq.inpSlot,
-												Refactor and optimize sync logs. (#3451)

* Refactor and optimize logs.

* Introduce shortLog(SyncRequest).

* Address review comment.

* make sync queue logs more consistent

Adds a few minor logging improvements:
- Fixes a typo (`was happened` -> `has happened`)
- Avoids passing `reset_slot` argument to log statement multiple times
- Uses same `rewind_to_slot` label when logging in both sync directions
- Consistent rewind point logging

Co-authored-by: cheatfate <eugene.kabanov@status.im>
											
										
										
											2022-03-03 08:05:33 +00:00
+								             rewind_to_slot = rewindSlot, request = sq.readyQueue[0].request
-												Backward sync support for SyncManager. (#3131)

* Unbundle SyncQueue from sync_manager.nim.
Unbundle Peer scores constants to peer_scores.nim.
Add Forward/Backward enum.

* Further improvements and tests.

* Adopt getRewindPoint() and fix MissingParent handler.

* Remove unused procedures.
Refactor `result` usage.
Fix resetWait().

* Add all the tests and fix the issue with rewind point.

* Fix get() issue.

* Fix flaky tests.

* test fixes

Co-authored-by: Jacek Sieka <jacek@status.im>
											
										
										
											2021-12-08 21:15:29 +00:00
+								        await sq.resetWait(some(rewindSlot))
 								        break
-												SyncManager cleanups for backfill support (#3189)

* SyncManager cleanups for backfill support

Cleanups, fixes and simplifications, in anticipation of backfill support
for the `SyncManager`:

* reformat sync progress indicator to show time left and % done more
prominently:
  * old: `sync="sPssPsssss:2:2.4229:00h57m (2706898)"`
  * new: `sync="14d12h31m (0.52%) 1.1378slots/s (wQQQQQDDQQ:1287520)"`
* reset average speed when going out of sync
* pass all block errors to sync manager, including duplicate/unviable
* penalize peers for reporting a head block that is outside of our
expected wall clock time (they're likely on a different network or
trying to disrupt sync)
* remove `SyncFailureKind` (unused)
* remove `inRange` (unused)
* add `Q` for sync queue requests that are in the `SyncQueue` but not
yet in the `BlockProcessor` queue
* update last slot in `SyncQueue` after getting peer status
* fix race condition between `wakeupWaiters` and `resetWait`, where
workers would not be correctly reset if block verification returned a
completed future without event loop
* log syncmanager direction

* Fix ordering issue.
Requests whose size is not equal to `chunkSize` could be processed in the wrong order, which could cause the sync process to freeze.

Co-authored-by: cheatfate <eugene.kabanov@status.im>
											
										
										
											2021-12-16 14:57:16 +00:00
+								    if processingCb != nil:
 								      processingCb()
-												Backward sync support for SyncManager. (#3131)

* Unbundle SyncQueue from sync_manager.nim.
Unbundle Peer scores constants to peer_scores.nim.
Add Forward/Backward enum.

* Further improvements and tests.

* Adopt getRewindPoint() and fix MissingParent handler.

* Remove unused procedures.
Refactor `result` usage.
Fix resetWait().

* Add all the tests and fix the issue with rewind point.

* Fix get() issue.

* Fix flaky tests.

* test fixes

Co-authored-by: Jacek Sieka <jacek@status.im>
											
										
										
											2021-12-08 21:15:29 +00:00
+								    # Validating received blocks one by one
-												Harden handling of unviable forks (#3312)

* Harden handling of unviable forks

In our current handling of unviable forks, we allow peers to send us
blocks that come from a different fork - this is not necessarily an
error as it can happen naturally, but it does open up the client to a
case where the same unviable fork keeps getting requested - rather than
allowing this to happen, we'll now give these peers a small negative
score - if it keeps happening, we'll disconnect them.

* keep track of unviable forks in quarantine, to avoid filling it with
known junk
* collect peer scores in single module
* descore peers when they send unviable blocks during sync
* don't give score for duplicate blocks
* increase quarantine size to a level that allows finality to happen
under optimal conditions - this helps avoid downloading the same blocks
over and over in case of an unviable fork
* increase initial score for new peers to make room for one more failure
before disconnection
* log and score invalid/unviable blocks in requestmanager too
* avoid ChainDAG dependency in quarantine
* reject gossip blocks with unviable parent
* continue processing unviable sync blocks in order to build unviable
dag

* docs

* Update beacon_chain/consensus_object_pools/block_pools_types.nim

* add unviable queue test
											
										
										
											2022-01-26 12:20:08 +00:00
+								    var
 								      hasOkBlock = false
 								      hasInvalidBlock = false
 								      unviableBlock: Option[(Eth2Digest, Slot)]
 								      missingParentSlot: Option[Slot]
 								      # compiler segfault if this is moved into the for loop, at time of writing
-												disallow non-(genesis, far-future) equal transition epochs (#3691)


											
										
										
											2022-06-03 09:37:03 +00:00
+								      # TODO this does segfault in 1.2 but not 1.6, so remove workaround when 1.2
 								      # is dropped.
-												Harden handling of unviable forks (#3312)

* Harden handling of unviable forks

In our current handling of unviable forks, we allow peers to send us
blocks that come from a different fork - this is not necessarily an
error as it can happen naturally, but it does open up the client to a
case where the same unviable fork keeps getting requested - rather than
allowing this to happen, we'll now give these peers a small negative
score - if it keeps happening, we'll disconnect them.

* keep track of unviable forks in quarantine, to avoid filling it with
known junk
* collect peer scores in single module
* descore peers when they send unviable blocks during sync
* don't give score for duplicate blocks
* increase quarantine size to a level that allows finality to happen
under optimal conditions - this helps avoid downloading the same blocks
over and over in case of an unviable fork
* increase initial score for new peers to make room for one more failure
before disconnection
* log and score invalid/unviable blocks in requestmanager too
* avoid ChainDAG dependency in quarantine
* reject gossip blocks with unviable parent
* continue processing unviable sync blocks in order to build unviable
dag

* docs

* Update beacon_chain/consensus_object_pools/block_pools_types.nim

* add unviable queue test
											
										
										
											2022-01-26 12:20:08 +00:00
+								      res: Result[void, BlockError]
 								    for blk in sq.blocks(item):
-												harden and speed up block sync (#3358)

* harden and speed up block sync

The `GetBlockBy*` server implementation currently reads SSZ bytes from
database, deserializes them into a Nim object then serializes them right
back to SSZ - here, we eliminate the deser/ser steps and send the bytes
straight to the network. Unfortunately, the snappy recoding must still
be done because of differences in framing.

Also, the quota system makes one giant request for quota right before
sending all blocks - this means that a 1024 block request will be
"paused" for a long time, then all blocks will be sent at once causing a
spike in database reads which potentially will see the reading client
time out before any block is sent.

Finally, on the reading side we make several copies of blocks as they
travel through various queues - this was not noticeable before but
becomes a problem in two cases: bellatrix blocks are up to 10mb (instead
of .. 30-40kb) and when backfilling, we process a lot more of them a lot
faster.

* fix status comparisons for nodes syncing from genesis (#3327 was a bit
too hard)
* don't hit database at all for post-altair slots in GetBlock v1
requests
											
										
										
											2022-02-07 17:20:10 +00:00
+								      res = await sq.blockVerifier(blk[])
-												Harden handling of unviable forks (#3312)

* Harden handling of unviable forks

In our current handling of unviable forks, we allow peers to send us
blocks that come from a different fork - this is not necessarily an
error as it can happen naturally, but it does open up the client to a
case where the same unviable fork keeps getting requested - rather than
allowing this to happen, we'll now give these peers a small negative
score - if it keeps happening, we'll disconnect them.

* keep track of unviable forks in quarantine, to avoid filling it with
known junk
* collect peer scores in single module
* descore peers when they send unviable blocks during sync
* don't give score for duplicate blocks
* increase quarantine size to a level that allows finality to happen
under optimal conditions - this helps avoid downloading the same blocks
over and over in case of an unviable fork
* increase initial score for new peers to make room for one more failure
before disconnection
* log and score invalid/unviable blocks in requestmanager too
* avoid ChainDAG dependency in quarantine
* reject gossip blocks with unviable parent
* continue processing unviable sync blocks in order to build unviable
dag

* docs

* Update beacon_chain/consensus_object_pools/block_pools_types.nim

* add unviable queue test
											
										
										
											2022-01-26 12:20:08 +00:00
+								      if res.isOk():
 								        hasOkBlock = true
 								      else:
 								        case res.error()
 								        of BlockError.MissingParent:
-												harden and speed up block sync (#3358)

* harden and speed up block sync

The `GetBlockBy*` server implementation currently reads SSZ bytes from
database, deserializes them into a Nim object then serializes them right
back to SSZ - here, we eliminate the deser/ser steps and send the bytes
straight to the network. Unfortunately, the snappy recoding must still
be done because of differences in framing.

Also, the quota system makes one giant request for quota right before
sending all blocks - this means that a 1024 block request will be
"paused" for a long time, then all blocks will be sent at once causing a
spike in database reads which potentially will see the reading client
time out before any block is sent.

Finally, on the reading side we make several copies of blocks as they
travel through various queues - this was not noticeable before but
becomes a problem in two cases: bellatrix blocks are up to 10mb (instead
of .. 30-40kb) and when backfilling, we process a lot more of them a lot
faster.

* fix status comparisons for nodes syncing from genesis (#3327 was a bit
too hard)
* don't hit database at all for post-altair slots in GetBlock v1
requests
											
										
										
											2022-02-07 17:20:10 +00:00
+								          missingParentSlot = some(blk[].slot)
-												Harden handling of unviable forks (#3312)

* Harden handling of unviable forks

In our current handling of unviable forks, we allow peers to send us
blocks that come from a different fork - this is not necessarily an
error as it can happen naturally, but it does open up the client to a
case where the same unviable fork keeps getting requested - rather than
allowing this to happen, we'll now give these peers a small negative
score - if it keeps happening, we'll disconnect them.

* keep track of unviable forks in quarantine, to avoid filling it with
known junk
* collect peer scores in single module
* descore peers when they send unviable blocks during sync
* don't give score for duplicate blocks
* increase quarantine size to a level that allows finality to happen
under optimal conditions - this helps avoid downloading the same blocks
over and over in case of an unviable fork
* increase initial score for new peers to make room for one more failure
before disconnection
* log and score invalid/unviable blocks in requestmanager too
* avoid ChainDAG dependency in quarantine
* reject gossip blocks with unviable parent
* continue processing unviable sync blocks in order to build unviable
dag

* docs

* Update beacon_chain/consensus_object_pools/block_pools_types.nim

* add unviable queue test
											
										
										
											2022-01-26 12:20:08 +00:00
+								          break
 								        of BlockError.Duplicate:
 								          # Keep going, happens naturally
 								          discard
 								        of BlockError.UnviableFork:
 								          # Keep going so as to register other unviable blocks with the
 								          # quarantine
 								          if unviableBlock.isNone:
 								            # Remember the first unviable block, so we can log it
-												harden and speed up block sync (#3358)

* harden and speed up block sync

The `GetBlockBy*` server implementation currently reads SSZ bytes from
database, deserializes them into a Nim object then serializes them right
back to SSZ - here, we eliminate the deser/ser steps and send the bytes
straight to the network. Unfortunately, the snappy recoding must still
be done because of differences in framing.

Also, the quota system makes one giant request for quota right before
sending all blocks - this means that a 1024 block request will be
"paused" for a long time, then all blocks will be sent at once causing a
spike in database reads which potentially will see the reading client
time out before any block is sent.

Finally, on the reading side we make several copies of blocks as they
travel through various queues - this was not noticeable before but
becomes a problem in two cases: bellatrix blocks are up to 10mb (instead
of .. 30-40kb) and when backfilling, we process a lot more of them a lot
faster.

* fix status comparisons for nodes syncing from genesis (#3327 was a bit
too hard)
* don't hit database at all for post-altair slots in GetBlock v1
requests
											
										
										
											2022-02-07 17:20:10 +00:00
+								            unviableBlock = some((blk[].root, blk[].slot))
-												Harden handling of unviable forks (#3312)

* Harden handling of unviable forks

In our current handling of unviable forks, we allow peers to send us
blocks that come from a different fork - this is not necessarily an
error as it can happen naturally, but it does open up the client to a
case where the same unviable fork keeps getting requested - rather than
allowing this to happen, we'll now give these peers a small negative
score - if it keeps happening, we'll disconnect them.

* keep track of unviable forks in quarantine, to avoid filling it with
known junk
* collect peer scores in single module
* descore peers when they send unviable blocks during sync
* don't give score for duplicate blocks
* increase quarantine size to a level that allows finality to happen
under optimal conditions - this helps avoid downloading the same blocks
over and over in case of an unviable fork
* increase initial score for new peers to make room for one more failure
before disconnection
* log and score invalid/unviable blocks in requestmanager too
* avoid ChainDAG dependency in quarantine
* reject gossip blocks with unviable parent
* continue processing unviable sync blocks in order to build unviable
dag

* docs

* Update beacon_chain/consensus_object_pools/block_pools_types.nim

* add unviable queue test
											
										
										
											2022-01-26 12:20:08 +00:00
 								        of BlockError.Invalid:
 								          hasInvalidBlock = true
 								          let req = item.request
-												Refactor and optimize sync logs. (#3451)

* Refactor and optimize logs.

* Introduce shortLog(SyncRequest).

* Address review comment.

* make sync queue logs more consistent

Adds a few minor logging improvements:
- Fixes a typo (`was happened` -> `has happened`)
- Avoids passing `reset_slot` argument to log statement multiple times
- Uses same `rewind_to_slot` label when logging in both sync directions
- Consistent rewind point logging

Co-authored-by: cheatfate <eugene.kabanov@status.im>
											
										
										
											2022-03-03 08:05:33 +00:00
+								          warn "Received invalid sequence of blocks", request = req,
 								                blocks_count = len(item.data),
 								                blocks_map = getShortMap(req, item.data)
-												Harden handling of unviable forks (#3312)

* Harden handling of unviable forks

In our current handling of unviable forks, we allow peers to send us
blocks that come from a different fork - this is not necessarily an
error as it can happen naturally, but it does open up the client to a
case where the same unviable fork keeps getting requested - rather than
allowing this to happen, we'll now give these peers a small negative
score - if it keeps happening, we'll disconnect them.

* keep track of unviable forks in quarantine, to avoid filling it with
known junk
* collect peer scores in single module
* descore peers when they send unviable blocks during sync
* don't give score for duplicate blocks
* increase quarantine size to a level that allows finality to happen
under optimal conditions - this helps avoid downloading the same blocks
over and over in case of an unviable fork
* increase initial score for new peers to make room for one more failure
before disconnection
* log and score invalid/unviable blocks in requestmanager too
* avoid ChainDAG dependency in quarantine
* reject gossip blocks with unviable parent
* continue processing unviable sync blocks in order to build unviable
dag

* docs

* Update beacon_chain/consensus_object_pools/block_pools_types.nim

* add unviable queue test
											
										
										
											2022-01-26 12:20:08 +00:00
+								          req.item.updateScore(PeerScoreBadBlocks)
-												Backward sync support for SyncManager. (#3131)

* Unbundle SyncQueue from sync_manager.nim.
Unbundle Peer scores constants to peer_scores.nim.
Add Forward/Backward enum.

* Further improvements and tests.

* Adopt getRewindPoint() and fix MissingParent handler.

* Remove unused procedures.
Refactor `result` usage.
Fix resetWait().

* Add all the tests and fix the issue with rewind point.

* Fix get() issue.

* Fix flaky tests.

* test fixes

Co-authored-by: Jacek Sieka <jacek@status.im>
											
										
										
											2021-12-08 21:15:29 +00:00
+								          break
-												Harden handling of unviable forks (#3312)

* Harden handling of unviable forks

In our current handling of unviable forks, we allow peers to send us
blocks that come from a different fork - this is not necessarily an
error as it can happen naturally, but it does open up the client to a
case where the same unviable fork keeps getting requested - rather than
allowing this to happen, we'll now give these peers a small negative
score - if it keeps happening, we'll disconnect them.

* keep track of unviable forks in quarantine, to avoid filling it with
known junk
* collect peer scores in single module
* descore peers when they send unviable blocks during sync
* don't give score for duplicate blocks
* increase quarantine size to a level that allows finality to happen
under optimal conditions - this helps avoid downloading the same blocks
over and over in case of an unviable fork
* increase initial score for new peers to make room for one more failure
before disconnection
* log and score invalid/unviable blocks in requestmanager too
* avoid ChainDAG dependency in quarantine
* reject gossip blocks with unviable parent
* continue processing unviable sync blocks in order to build unviable
dag

* docs

* Update beacon_chain/consensus_object_pools/block_pools_types.nim

* add unviable queue test
											
										
										
											2022-01-26 12:20:08 +00:00
+								    # When errors happen while processing blocks, we retry the same request
 								    # with, hopefully, a different peer
 								    let retryRequest =
 								      hasInvalidBlock or unviableBlock.isSome() or missingParentSlot.isSome()
 								    if not retryRequest:
-												avoid re-requesting finalized blocks during sync (#3461)

When a `beaconBlocksByRange` response advances the `safeSlot`, but later
has errors, the sync queue keeps repeating that same request until it is
fulfilled without errors. Data up through `safeSlot` is considered to be
immutable, i.e., finalized, so re-requesting that data is not useful.
By advancing the sync progress in that scenario, those redundant query
portions can be avoided. Note, the finalized block _itself_ is always
requested, even in the initial request. This behaviour is kept same.
											
										
										
											2022-03-15 17:56:56 +00:00
+								      let numSlotsAdvanced = item.request.count - sq.numAlreadyKnownSlots(sr)
 								      sq.advanceOutput(numSlotsAdvanced)
-												Harden handling of unviable forks (#3312)

* Harden handling of unviable forks

In our current handling of unviable forks, we allow peers to send us
blocks that come from a different fork - this is not necessarily an
error as it can happen naturally, but it does open up the client to a
case where the same unviable fork keeps getting requested - rather than
allowing this to happen, we'll now give these peers a small negative
score - if it keeps happening, we'll disconnect them.

* keep track of unviable forks in quarantine, to avoid filling it with
known junk
* collect peer scores in single module
* descore peers when they send unviable blocks during sync
* don't give score for duplicate blocks
* increase quarantine size to a level that allows finality to happen
under optimal conditions - this helps avoid downloading the same blocks
over and over in case of an unviable fork
* increase initial score for new peers to make room for one more failure
before disconnection
* log and score invalid/unviable blocks in requestmanager too
* avoid ChainDAG dependency in quarantine
* reject gossip blocks with unviable parent
* continue processing unviable sync blocks in order to build unviable
dag

* docs

* Update beacon_chain/consensus_object_pools/block_pools_types.nim

* add unviable queue test
											
										
										
											2022-01-26 12:20:08 +00:00
 								      if hasOkBlock:
-												Backward sync support for SyncManager. (#3131)

* Unbundle SyncQueue from sync_manager.nim.
Unbundle Peer scores constants to peer_scores.nim.
Add Forward/Backward enum.

* Further improvements and tests.

* Adopt getRewindPoint() and fix MissingParent handler.

* Remove unused procedures.
Refactor `result` usage.
Fix resetWait().

* Add all the tests and fix the issue with rewind point.

* Fix get() issue.

* Fix flaky tests.

* test fixes

Co-authored-by: Jacek Sieka <jacek@status.im>
											
										
										
											2021-12-08 21:15:29 +00:00
+								        # If there is no error and the response was not empty we should reward the peer
-												Harden handling of unviable forks (#3312)

* Harden handling of unviable forks

In our current handling of unviable forks, we allow peers to send us
blocks that come from a different fork - this is not necessarily an
error as it can happen naturally, but it does open up the client to a
case where the same unviable fork keeps getting requested - rather than
allowing this to happen, we'll now give these peers a small negative
score - if it keeps happening, we'll disconnect them.

* keep track of unviable forks in quarantine, to avoid filling it with
known junk
* collect peer scores in single module
* descore peers when they send unviable blocks during sync
* don't give score for duplicate blocks
* increase quarantine size to a level that allows finality to happen
under optimal conditions - this helps avoid downloading the same blocks
over and over in case of an unviable fork
* increase initial score for new peers to make room for one more failure
before disconnection
* log and score invalid/unviable blocks in requestmanager too
* avoid ChainDAG dependency in quarantine
* reject gossip blocks with unviable parent
* continue processing unviable sync blocks in order to build unviable
dag

* docs

* Update beacon_chain/consensus_object_pools/block_pools_types.nim

* add unviable queue test
											
										
										
											2022-01-26 12:20:08 +00:00
+								        # with some bonus score - not for duplicate blocks though.
-												Backward sync support for SyncManager. (#3131)

* Unbundle SyncQueue from sync_manager.nim.
Unbundle Peer scores constants to peer_scores.nim.
Add Forward/Backward enum.

* Further improvements and tests.

* Adopt getRewindPoint() and fix MissingParent handler.

* Remove unused procedures.
Refactor `result` usage.
Fix resetWait().

* Add all the tests and fix the issue with rewind point.

* Fix get() issue.

* Fix flaky tests.

* test fixes

Co-authored-by: Jacek Sieka <jacek@status.im>
											
										
										
											2021-12-08 21:15:29 +00:00
+								        item.request.item.updateScore(PeerScoreGoodBlocks)
-												Harden handling of unviable forks (#3312)

* Harden handling of unviable forks

In our current handling of unviable forks, we allow peers to send us
blocks that come from a different fork - this is not necessarily an
error as it can happen naturally, but it does open up the client to a
case where the same unviable fork keeps getting requested - rather than
allowing this to happen, we'll now give these peers a small negative
score - if it keeps happening, we'll disconnect them.

* keep track of unviable forks in quarantine, to avoid filling it with
known junk
* collect peer scores in single module
* descore peers when they send unviable blocks during sync
* don't give score for duplicate blocks
* increase quarantine size to a level that allows finality to happen
under optimal conditions - this helps avoid downloading the same blocks
over and over in case of an unviable fork
* increase initial score for new peers to make room for one more failure
before disconnection
* log and score invalid/unviable blocks in requestmanager too
* avoid ChainDAG dependency in quarantine
* reject gossip blocks with unviable parent
* continue processing unviable sync blocks in order to build unviable
dag

* docs

* Update beacon_chain/consensus_object_pools/block_pools_types.nim

* add unviable queue test
											
										
										
											2022-01-26 12:20:08 +00:00
-												avoid re-requesting finalized blocks during sync (#3461)

When a `beaconBlocksByRange` response advances the `safeSlot`, but later
has errors, the sync queue keeps repeating that same request until it is
fulfilled without errors. Data up through `safeSlot` is considered to be
immutable, i.e., finalized, so re-requesting that data is not useful.
By advancing the sync progress in that scenario, those redundant query
portions can be avoided. Note, the finalized block _itself_ is always
requested, even in the initial request. This behaviour is kept same.
											
										
										
											2022-03-15 17:56:56 +00:00
+								      if numSlotsAdvanced > 0:
 								        sq.wakeupWaiters()
-												Backward sync support for SyncManager. (#3131)

* Unbundle SyncQueue from sync_manager.nim.
Unbundle Peer scores constants to peer_scores.nim.
Add Forward/Backward enum.

* Further improvements and tests.

* Adopt getRewindPoint() and fix MissingParent handler.

* Remove unused procedures.
Refactor `result` usage.
Fix resetWait().

* Add all the tests and fix the issue with rewind point.

* Fix get() issue.

* Fix flaky tests.

* test fixes

Co-authored-by: Jacek Sieka <jacek@status.im>
											
										
										
											2021-12-08 21:15:29 +00:00
+								    else:
-												Refactor and optimize sync logs. (#3451)

* Refactor and optimize logs.

* Introduce shortLog(SyncRequest).

* Address review comment.

* make sync queue logs more consistent

Adds a few minor logging improvements:
- Fixes a typo (`was happened` -> `has happened`)
- Avoids passing `reset_slot` argument to log statement multiple times
- Uses same `rewind_to_slot` label when logging in both sync directions
- Consistent rewind point logging

Co-authored-by: cheatfate <eugene.kabanov@status.im>
											
										
										
											2022-03-03 08:05:33 +00:00
+								      debug "Block pool rejected peer's response", request = item.request,
-												Backward sync support for SyncManager. (#3131)

* Unbundle SyncQueue from sync_manager.nim.
Unbundle Peer scores constants to peer_scores.nim.
Add Forward/Backward enum.

* Further improvements and tests.

* Adopt getRewindPoint() and fix MissingParent handler.

* Remove unused procedures.
Refactor `result` usage.
Fix resetWait().

* Add all the tests and fix the issue with rewind point.

* Fix get() issue.

* Fix flaky tests.

* test fixes

Co-authored-by: Jacek Sieka <jacek@status.im>
											
										
										
											2021-12-08 21:15:29 +00:00
+								            blocks_map = getShortMap(item.request, item.data),
-												Harden handling of unviable forks (#3312)

* Harden handling of unviable forks

In our current handling of unviable forks, we allow peers to send us
blocks that come from a different fork - this is not necessarily an
error as it can happen naturally, but it does open up the client to a
case where the same unviable fork keeps getting requested - rather than
allowing this to happen, we'll now give these peers a small negative
score - if it keeps happening, we'll disconnect them.

* keep track of unviable forks in quarantine, to avoid filling it with
known junk
* collect peer scores in single module
* descore peers when they send unviable blocks during sync
* don't give score for duplicate blocks
* increase quarantine size to a level that allows finality to happen
under optimal conditions - this helps avoid downloading the same blocks
over and over in case of an unviable fork
* increase initial score for new peers to make room for one more failure
before disconnection
* log and score invalid/unviable blocks in requestmanager too
* avoid ChainDAG dependency in quarantine
* reject gossip blocks with unviable parent
* continue processing unviable sync blocks in order to build unviable
dag

* docs

* Update beacon_chain/consensus_object_pools/block_pools_types.nim

* add unviable queue test
											
										
										
											2022-01-26 12:20:08 +00:00
+								            blocks_count = len(item.data),
 								            ok = hasOkBlock,
 								            unviable = unviableBlock.isSome(),
-												Refactor and optimize sync logs. (#3451)

* Refactor and optimize logs.

* Introduce shortLog(SyncRequest).

* Address review comment.

* make sync queue logs more consistent

Adds a few minor logging improvements:
- Fixes a typo (`was happened` -> `has happened`)
- Avoids passing `reset_slot` argument to log statement multiple times
- Uses same `rewind_to_slot` label when logging in both sync directions
- Consistent rewind point logging

Co-authored-by: cheatfate <eugene.kabanov@status.im>
											
										
										
											2022-03-03 08:05:33 +00:00
+								            missing_parent = missingParentSlot.isSome()
-												Harden handling of unviable forks (#3312)

* Harden handling of unviable forks

In our current handling of unviable forks, we allow peers to send us
blocks that come from a different fork - this is not necessarily an
error as it can happen naturally, but it does open up the client to a
case where the same unviable fork keeps getting requested - rather than
allowing this to happen, we'll now give these peers a small negative
score - if it keeps happening, we'll disconnect them.

* keep track of unviable forks in quarantine, to avoid filling it with
known junk
* collect peer scores in single module
* descore peers when they send unviable blocks during sync
* don't give score for duplicate blocks
* increase quarantine size to a level that allows finality to happen
under optimal conditions - this helps avoid downloading the same blocks
over and over in case of an unviable fork
* increase initial score for new peers to make room for one more failure
before disconnection
* log and score invalid/unviable blocks in requestmanager too
* avoid ChainDAG dependency in quarantine
* reject gossip blocks with unviable parent
* continue processing unviable sync blocks in order to build unviable
dag

* docs

* Update beacon_chain/consensus_object_pools/block_pools_types.nim

* add unviable queue test
											
										
										
											2022-01-26 12:20:08 +00:00
+								      # We need to move failed response to the debts queue.
 								      sq.toDebtsQueue(item.request)
 								      if unviableBlock.isSome:
 								        let req = item.request
-												Refactor and optimize sync logs. (#3451)

* Refactor and optimize logs.

* Introduce shortLog(SyncRequest).

* Address review comment.

* make sync queue logs more consistent

Adds a few minor logging improvements:
- Fixes a typo (`was happened` -> `has happened`)
- Avoids passing `reset_slot` argument to log statement multiple times
- Uses same `rewind_to_slot` label when logging in both sync directions
- Consistent rewind point logging

Co-authored-by: cheatfate <eugene.kabanov@status.im>
											
										
										
											2022-03-03 08:05:33 +00:00
+								        notice "Received blocks from an unviable fork", request = req,
-												Harden handling of unviable forks (#3312)

* Harden handling of unviable forks

In our current handling of unviable forks, we allow peers to send us
blocks that come from a different fork - this is not necessarily an
error as it can happen naturally, but it does open up the client to a
case where the same unviable fork keeps getting requested - rather than
allowing this to happen, we'll now give these peers a small negative
score - if it keeps happening, we'll disconnect them.

* keep track of unviable forks in quarantine, to avoid filling it with
known junk
* collect peer scores in single module
* descore peers when they send unviable blocks during sync
* don't give score for duplicate blocks
* increase quarantine size to a level that allows finality to happen
under optimal conditions - this helps avoid downloading the same blocks
over and over in case of an unviable fork
* increase initial score for new peers to make room for one more failure
before disconnection
* log and score invalid/unviable blocks in requestmanager too
* avoid ChainDAG dependency in quarantine
* reject gossip blocks with unviable parent
* continue processing unviable sync blocks in order to build unviable
dag

* docs

* Update beacon_chain/consensus_object_pools/block_pools_types.nim

* add unviable queue test
											
										
										
											2022-01-26 12:20:08 +00:00
+								              blockRoot = unviableBlock.get()[0],
-												Refactor and optimize sync logs. (#3451)

* Refactor and optimize logs.

* Introduce shortLog(SyncRequest).

* Address review comment.

* make sync queue logs more consistent

Adds a few minor logging improvements:
- Fixes a typo (`was happened` -> `has happened`)
- Avoids passing `reset_slot` argument to log statement multiple times
- Uses same `rewind_to_slot` label when logging in both sync directions
- Consistent rewind point logging

Co-authored-by: cheatfate <eugene.kabanov@status.im>
											
										
										
											2022-03-03 08:05:33 +00:00
+								              blockSlot = unviableBlock.get()[1],
 								              blocks_count = len(item.data),
 								              blocks_map = getShortMap(req, item.data)
-												Harden handling of unviable forks (#3312)

* Harden handling of unviable forks

In our current handling of unviable forks, we allow peers to send us
blocks that come from a different fork - this is not necessarily an
error as it can happen naturally, but it does open up the client to a
case where the same unviable fork keeps getting requested - rather than
allowing this to happen, we'll now give these peers a small negative
score - if it keeps happening, we'll disconnect them.

* keep track of unviable forks in quarantine, to avoid filling it with
known junk
* collect peer scores in single module
* descore peers when they send unviable blocks during sync
* don't give score for duplicate blocks
* increase quarantine size to a level that allows finality to happen
under optimal conditions - this helps avoid downloading the same blocks
over and over in case of an unviable fork
* increase initial score for new peers to make room for one more failure
before disconnection
* log and score invalid/unviable blocks in requestmanager too
* avoid ChainDAG dependency in quarantine
* reject gossip blocks with unviable parent
* continue processing unviable sync blocks in order to build unviable
dag

* docs

* Update beacon_chain/consensus_object_pools/block_pools_types.nim

* add unviable queue test
											
										
										
											2022-01-26 12:20:08 +00:00
+								        req.item.updateScore(PeerScoreUnviableFork)
 								      if missingParentSlot.isSome:
 								        var
 								          resetSlot: Option[Slot]
 								          failSlot = missingParentSlot.get()
-												Backward sync support for SyncManager. (#3131)

* Unbundle SyncQueue from sync_manager.nim.
Unbundle Peer scores constants to peer_scores.nim.
Add Forward/Backward enum.

* Further improvements and tests.

* Adopt getRewindPoint() and fix MissingParent handler.

* Remove unused procedures.
Refactor `result` usage.
Fix resetWait().

* Add all the tests and fix the issue with rewind point.

* Fix get() issue.

* Fix flaky tests.

* test fixes

Co-authored-by: Jacek Sieka <jacek@status.im>
											
										
										
											2021-12-08 21:15:29 +00:00
 								        # If we got `BlockError.MissingParent`, it means that the peer returned
 								        # a chain of blocks with holes, or that `block_pool` is in an incomplete
 								        # state. We are going to rewind to the first slot of the latest
 								        # finalized epoch.
 								        let
 								          req = item.request
 								          safeSlot = sq.getSafeSlot()
 								        case sq.kind
 								        of SyncQueueKind.Forward:
 								          if safeSlot < req.slot:
-												Harden handling of unviable forks (#3312)

* Harden handling of unviable forks

In our current handling of unviable forks, we allow peers to send us
blocks that come from a different fork - this is not necessarily an
error as it can happen naturally, but it does open up the client to a
case where the same unviable fork keeps getting requested - rather than
allowing this to happen, we'll now give these peers a small negative
score - if it keeps happening, we'll disconnect them.

* keep track of unviable forks in quarantine, to avoid filling it with
known junk
* collect peer scores in single module
* descore peers when they send unviable blocks during sync
* don't give score for duplicate blocks
* increase quarantine size to a level that allows finality to happen
under optimal conditions - this helps avoid downloading the same blocks
over and over in case of an unviable fork
* increase initial score for new peers to make room for one more failure
before disconnection
* log and score invalid/unviable blocks in requestmanager too
* avoid ChainDAG dependency in quarantine
* reject gossip blocks with unviable parent
* continue processing unviable sync blocks in order to build unviable
dag

* docs

* Update beacon_chain/consensus_object_pools/block_pools_types.nim

* add unviable queue test
											
										
										
											2022-01-26 12:20:08 +00:00
+								            let rewindSlot = sq.getRewindPoint(failSlot, safeSlot)
-												Backward sync support for SyncManager. (#3131)

* Unbundle SyncQueue from sync_manager.nim.
Unbundle Peer scores constants to peer_scores.nim.
Add Forward/Backward enum.

* Further improvements and tests.

* Adopt getRewindPoint() and fix MissingParent handler.

* Remove unused procedures.
Refactor `result` usage.
Fix resetWait().

* Add all the tests and fix the issue with rewind point.

* Fix get() issue.

* Fix flaky tests.

* test fixes

Co-authored-by: Jacek Sieka <jacek@status.im>
											
										
										
											2021-12-08 21:15:29 +00:00
+								            warn "Unexpected missing parent, rewind happens",
-												Refactor and optimize sync logs. (#3451)

* Refactor and optimize logs.

* Introduce shortLog(SyncRequest).

* Address review comment.

* make sync queue logs more consistent

Adds a few minor logging improvements:
- Fixes a typo (`was happened` -> `has happened`)
- Avoids passing `reset_slot` argument to log statement multiple times
- Uses same `rewind_to_slot` label when logging in both sync directions
- Consistent rewind point logging

Co-authored-by: cheatfate <eugene.kabanov@status.im>
											
										
										
											2022-03-03 08:05:33 +00:00
+								                 request = req, rewind_to_slot = rewindSlot,
 								                 rewind_point = sq.rewind, finalized_slot = safeSlot,
 								                 blocks_count = len(item.data),
 								                 blocks_map = getShortMap(req, item.data)
-												Backward sync support for SyncManager. (#3131)

* Unbundle SyncQueue from sync_manager.nim.
Unbundle Peer scores constants to peer_scores.nim.
Add Forward/Backward enum.

* Further improvements and tests.

* Adopt getRewindPoint() and fix MissingParent handler.

* Remove unused procedures.
Refactor `result` usage.
Fix resetWait().

* Add all the tests and fix the issue with rewind point.

* Fix get() issue.

* Fix flaky tests.

* test fixes

Co-authored-by: Jacek Sieka <jacek@status.im>
											
										
										
											2021-12-08 21:15:29 +00:00
+								            resetSlot = some(rewindSlot)
 								            req.item.updateScore(PeerScoreMissingBlocks)
 								          else:
 								            error "Unexpected missing parent at finalized epoch slot",
-												Refactor and optimize sync logs. (#3451)

* Refactor and optimize logs.

* Introduce shortLog(SyncRequest).

* Address review comment.

* make sync queue logs more consistent

Adds a few minor logging improvements:
- Fixes a typo (`was happened` -> `has happened`)
- Avoids passing `reset_slot` argument to log statement multiple times
- Uses same `rewind_to_slot` label when logging in both sync directions
- Consistent rewind point logging

Co-authored-by: cheatfate <eugene.kabanov@status.im>
											
										
										
											2022-03-03 08:05:33 +00:00
+								                  request = req, rewind_to_slot = safeSlot,
 								                  blocks_count = len(item.data),
 								                  blocks_map = getShortMap(req, item.data)
-												Backward sync support for SyncManager. (#3131)

* Unbundle SyncQueue from sync_manager.nim.
Unbundle Peer scores constants to peer_scores.nim.
Add Forward/Backward enum.

* Further improvements and tests.

* Adopt getRewindPoint() and fix MissingParent handler.

* Remove unused procedures.
Refactor `result` usage.
Fix resetWait().

* Add all the tests and fix the issue with rewind point.

* Fix get() issue.

* Fix flaky tests.

* test fixes

Co-authored-by: Jacek Sieka <jacek@status.im>
											
										
										
											2021-12-08 21:15:29 +00:00
+								            req.item.updateScore(PeerScoreBadBlocks)
 								        of SyncQueueKind.Backward:
 								          if safeSlot > req.slot:
-												Harden handling of unviable forks (#3312)

* Harden handling of unviable forks

In our current handling of unviable forks, we allow peers to send us
blocks that come from a different fork - this is not necessarily an
error as it can happen naturally, but it does open up the client to a
case where the same unviable fork keeps getting requested - rather than
allowing this to happen, we'll now give these peers a small negative
score - if it keeps happening, we'll disconnect them.

* keep track of unviable forks in quarantine, to avoid filling it with
known junk
* collect peer scores in single module
* descore peers when they send unviable blocks during sync
* don't give score for duplicate blocks
* increase quarantine size to a level that allows finality to happen
under optimal conditions - this helps avoid downloading the same blocks
over and over in case of an unviable fork
* increase initial score for new peers to make room for one more failure
before disconnection
* log and score invalid/unviable blocks in requestmanager too
* avoid ChainDAG dependency in quarantine
* reject gossip blocks with unviable parent
* continue processing unviable sync blocks in order to build unviable
dag

* docs

* Update beacon_chain/consensus_object_pools/block_pools_types.nim

* add unviable queue test
											
										
										
											2022-01-26 12:20:08 +00:00
+								            let rewindSlot = sq.getRewindPoint(failSlot, safeSlot)
-												Backfiller (#3263)

Backfilling is the process of downloading historical blocks via P2P that
are required to fulfill `GetBlocksByRange` duties - this happens during
both trusted node and finalized checkpoint syncs.

In particular, backfilling happens after syncing to head, such that
attestation work can start as soon as possible.

* Fix SyncQueue initialization procedure.
Remove usage of `awaitne`.
Add cancellation support.
Remove unneeded `sleepAsync()` if peer's head is older than needed.
Add `direction` field to all logs.
Fix syncmanager wedge issue.
Add proper resource cleaning procedure on backward sync finish.

Co-authored-by: cheatfate <eugene.kabanov@status.im>
											
										
										
											2022-01-20 07:25:45 +00:00
+								            # It's quite common peers give us fewer blocks than we ask for
-												Refactor and optimize sync logs. (#3451)

* Refactor and optimize logs.

* Introduce shortLog(SyncRequest).

* Address review comment.

* make sync queue logs more consistent

Adds a few minor logging improvements:
- Fixes a typo (`was happened` -> `has happened`)
- Avoids passing `reset_slot` argument to log statement multiple times
- Uses same `rewind_to_slot` label when logging in both sync directions
- Consistent rewind point logging

Co-authored-by: cheatfate <eugene.kabanov@status.im>
											
										
										
											2022-03-03 08:05:33 +00:00
+								            info "Gap in block range response, rewinding", request = req,
 								                 rewind_to_slot = rewindSlot, rewind_fail_slot = failSlot,
 								                 finalized_slot = safeSlot, blocks_count = len(item.data),
 								                 blocks_map = getShortMap(req, item.data)
-												Backward sync support for SyncManager. (#3131)

* Unbundle SyncQueue from sync_manager.nim.
Unbundle Peer scores constants to peer_scores.nim.
Add Forward/Backward enum.

* Further improvements and tests.

* Adopt getRewindPoint() and fix MissingParent handler.

* Remove unused procedures.
Refactor `result` usage.
Fix resetWait().

* Add all the tests and fix the issue with rewind point.

* Fix get() issue.

* Fix flaky tests.

* test fixes

Co-authored-by: Jacek Sieka <jacek@status.im>
											
										
										
											2021-12-08 21:15:29 +00:00
+								            resetSlot = some(rewindSlot)
 								            req.item.updateScore(PeerScoreMissingBlocks)
 								          else:
-												Refactor and optimize sync logs. (#3451)

* Refactor and optimize logs.

* Introduce shortLog(SyncRequest).

* Address review comment.

* make sync queue logs more consistent

Adds a few minor logging improvements:
- Fixes a typo (`was happened` -> `has happened`)
- Avoids passing `reset_slot` argument to log statement multiple times
- Uses same `rewind_to_slot` label when logging in both sync directions
- Consistent rewind point logging

Co-authored-by: cheatfate <eugene.kabanov@status.im>
											
										
										
											2022-03-03 08:05:33 +00:00
+								            error "Unexpected missing parent at safe slot", request = req,
 								                  to_slot = safeSlot, blocks_count = len(item.data),
 								                  blocks_map = getShortMap(req, item.data)
-												Backward sync support for SyncManager. (#3131)

* Unbundle SyncQueue from sync_manager.nim.
Unbundle Peer scores constants to peer_scores.nim.
Add Forward/Backward enum.

* Further improvements and tests.

* Adopt getRewindPoint() and fix MissingParent handler.

* Remove unused procedures.
Refactor `result` usage.
Fix resetWait().

* Add all the tests and fix the issue with rewind point.

* Fix get() issue.

* Fix flaky tests.

* test fixes

Co-authored-by: Jacek Sieka <jacek@status.im>
											
										
										
											2021-12-08 21:15:29 +00:00
+								            req.item.updateScore(PeerScoreBadBlocks)
-												Harden handling of unviable forks (#3312)

* Harden handling of unviable forks

In our current handling of unviable forks, we allow peers to send us
blocks that come from a different fork - this is not necessarily an
error as it can happen naturally, but it does open up the client to a
case where the same unviable fork keeps getting requested - rather than
allowing this to happen, we'll now give these peers a small negative
score - if it keeps happening, we'll disconnect them.

* keep track of unviable forks in quarantine, to avoid filling it with
known junk
* collect peer scores in single module
* descore peers when they send unviable blocks during sync
* don't give score for duplicate blocks
* increase quarantine size to a level that allows finality to happen
under optimal conditions - this helps avoid downloading the same blocks
over and over in case of an unviable fork
* increase initial score for new peers to make room for one more failure
before disconnection
* log and score invalid/unviable blocks in requestmanager too
* avoid ChainDAG dependency in quarantine
* reject gossip blocks with unviable parent
* continue processing unviable sync blocks in order to build unviable
dag

* docs

* Update beacon_chain/consensus_object_pools/block_pools_types.nim

* add unviable queue test
											
										
										
											2022-01-26 12:20:08 +00:00
+								        if resetSlot.isSome():
 								          await sq.resetWait(resetSlot)
 								          case sq.kind
 								          of SyncQueueKind.Forward:
-												Refactor and optimize sync logs. (#3451)

* Refactor and optimize logs.

* Introduce shortLog(SyncRequest).

* Address review comment.

* make sync queue logs more consistent

Adds a few minor logging improvements:
- Fixes a typo (`was happened` -> `has happened`)
- Avoids passing `reset_slot` argument to log statement multiple times
- Uses same `rewind_to_slot` label when logging in both sync directions
- Consistent rewind point logging

Co-authored-by: cheatfate <eugene.kabanov@status.im>
											
										
										
											2022-03-03 08:05:33 +00:00
+								            debug "Rewind to slot has happened", reset_slot = resetSlot.get(),
-												Harden handling of unviable forks (#3312)

* Harden handling of unviable forks

In our current handling of unviable forks, we allow peers to send us
blocks that come from a different fork - this is not necessarily an
error as it can happen naturally, but it does open up the client to a
case where the same unviable fork keeps getting requested - rather than
allowing this to happen, we'll now give these peers a small negative
score - if it keeps happening, we'll disconnect them.

* keep track of unviable forks in quarantine, to avoid filling it with
known junk
* collect peer scores in single module
* descore peers when they send unviable blocks during sync
* don't give score for duplicate blocks
* increase quarantine size to a level that allows finality to happen
under optimal conditions - this helps avoid downloading the same blocks
over and over in case of an unviable fork
* increase initial score for new peers to make room for one more failure
before disconnection
* log and score invalid/unviable blocks in requestmanager too
* avoid ChainDAG dependency in quarantine
* reject gossip blocks with unviable parent
* continue processing unviable sync blocks in order to build unviable
dag

* docs

* Update beacon_chain/consensus_object_pools/block_pools_types.nim

* add unviable queue test
											
										
										
											2022-01-26 12:20:08 +00:00
+								                  queue_input_slot = sq.inpSlot, queue_output_slot = sq.outSlot,
-												Refactor and optimize sync logs. (#3451)

* Refactor and optimize logs.

* Introduce shortLog(SyncRequest).

* Address review comment.

* make sync queue logs more consistent

Adds a few minor logging improvements:
- Fixes a typo (`was happened` -> `has happened`)
- Avoids passing `reset_slot` argument to log statement multiple times
- Uses same `rewind_to_slot` label when logging in both sync directions
- Consistent rewind point logging

Co-authored-by: cheatfate <eugene.kabanov@status.im>
											
										
										
											2022-03-03 08:05:33 +00:00
+								                  rewind_point = sq.rewind, direction = sq.kind
-												Harden handling of unviable forks (#3312)

* Harden handling of unviable forks

In our current handling of unviable forks, we allow peers to send us
blocks that come from a different fork - this is not necessarily an
error as it can happen naturally, but it does open up the client to a
case where the same unviable fork keeps getting requested - rather than
allowing this to happen, we'll now give these peers a small negative
score - if it keeps happening, we'll disconnect them.

* keep track of unviable forks in quarantine, to avoid filling it with
known junk
* collect peer scores in single module
* descore peers when they send unviable blocks during sync
* don't give score for duplicate blocks
* increase quarantine size to a level that allows finality to happen
under optimal conditions - this helps avoid downloading the same blocks
over and over in case of an unviable fork
* increase initial score for new peers to make room for one more failure
before disconnection
* log and score invalid/unviable blocks in requestmanager too
* avoid ChainDAG dependency in quarantine
* reject gossip blocks with unviable parent
* continue processing unviable sync blocks in order to build unviable
dag

* docs

* Update beacon_chain/consensus_object_pools/block_pools_types.nim

* add unviable queue test
											
										
										
											2022-01-26 12:20:08 +00:00
+								          of SyncQueueKind.Backward:
-												Refactor and optimize sync logs. (#3451)

* Refactor and optimize logs.

* Introduce shortLog(SyncRequest).

* Address review comment.

* make sync queue logs more consistent

Adds a few minor logging improvements:
- Fixes a typo (`was happened` -> `has happened`)
- Avoids passing `reset_slot` argument to log statement multiple times
- Uses same `rewind_to_slot` label when logging in both sync directions
- Consistent rewind point logging

Co-authored-by: cheatfate <eugene.kabanov@status.im>
											
										
										
											2022-03-03 08:05:33 +00:00
+								            debug "Rewind to slot has happened", reset_slot = resetSlot.get(),
-												Harden handling of unviable forks (#3312)

* Harden handling of unviable forks

In our current handling of unviable forks, we allow peers to send us
blocks that come from a different fork - this is not necessarily an
error as it can happen naturally, but it does open up the client to a
case where the same unviable fork keeps getting requested - rather than
allowing this to happen, we'll now give these peers a small negative
score - if it keeps happening, we'll disconnect them.

* keep track of unviable forks in quarantine, to avoid filling it with
known junk
* collect peer scores in single module
* descore peers when they send unviable blocks during sync
* don't give score for duplicate blocks
* increase quarantine size to a level that allows finality to happen
under optimal conditions - this helps avoid downloading the same blocks
over and over in case of an unviable fork
* increase initial score for new peers to make room for one more failure
before disconnection
* log and score invalid/unviable blocks in requestmanager too
* avoid ChainDAG dependency in quarantine
* reject gossip blocks with unviable parent
* continue processing unviable sync blocks in order to build unviable
dag

* docs

* Update beacon_chain/consensus_object_pools/block_pools_types.nim

* add unviable queue test
											
										
										
											2022-01-26 12:20:08 +00:00
+								                  queue_input_slot = sq.inpSlot, queue_output_slot = sq.outSlot,
-												Refactor and optimize sync logs. (#3451)

* Refactor and optimize logs.

* Introduce shortLog(SyncRequest).

* Address review comment.

* make sync queue logs more consistent

Adds a few minor logging improvements:
- Fixes a typo (`was happened` -> `has happened`)
- Avoids passing `reset_slot` argument to log statement multiple times
- Uses same `rewind_to_slot` label when logging in both sync directions
- Consistent rewind point logging

Co-authored-by: cheatfate <eugene.kabanov@status.im>
											
										
										
											2022-03-03 08:05:33 +00:00
+								                  direction = sq.kind
-												Harden handling of unviable forks (#3312)

* Harden handling of unviable forks

In our current handling of unviable forks, we allow peers to send us
blocks that come from a different fork - this is not necessarily an
error as it can happen naturally, but it does open up the client to a
case where the same unviable fork keeps getting requested - rather than
allowing this to happen, we'll now give these peers a small negative
score - if it keeps happening, we'll disconnect them.

* keep track of unviable forks in quarantine, to avoid filling it with
known junk
* collect peer scores in single module
* descore peers when they send unviable blocks during sync
* don't give score for duplicate blocks
* increase quarantine size to a level that allows finality to happen
under optimal conditions - this helps avoid downloading the same blocks
over and over in case of an unviable fork
* increase initial score for new peers to make room for one more failure
before disconnection
* log and score invalid/unviable blocks in requestmanager too
* avoid ChainDAG dependency in quarantine
* reject gossip blocks with unviable parent
* continue processing unviable sync blocks in order to build unviable
dag

* docs

* Update beacon_chain/consensus_object_pools/block_pools_types.nim

* add unviable queue test
											
										
										
											2022-01-26 12:20:08 +00:00
-												Backward sync support for SyncManager. (#3131)

* Unbundle SyncQueue from sync_manager.nim.
Unbundle Peer scores constants to peer_scores.nim.
Add Forward/Backward enum.

* Further improvements and tests.

* Adopt getRewindPoint() and fix MissingParent handler.

* Remove unused procedures.
Refactor `result` usage.
Fix resetWait().

* Add all the tests and fix the issue with rewind point.

* Fix get() issue.

* Fix flaky tests.

* test fixes

Co-authored-by: Jacek Sieka <jacek@status.im>
											
										
										
											2021-12-08 21:15:29 +00:00
+								      break
 								proc push*[T](sq: SyncQueue[T], sr: SyncRequest[T]) =
 								  ## Hand a failed request `sr` back to the queue so it can be retried.
 								  ##
 								  ## A request whose index is no longer present in `sq.pending` is
 								  ## dropped without any further action: that only happens after
 								  ## `SyncQueue.resetWait()` expired all pending requests, and swallowing
 								  ## such `old` requests lets sync-workers obtain proper new requests
 								  ## from the queue instead.
 								  if sr.index in sq.pending:
 								    # Still tracked: stop tracking it and move it to the debts queue.
 								    sq.pending.del(sr.index)
 								    sq.toDebtsQueue(sr)
-												avoid re-requesting finalized blocks during sync (#3461)

When a `beaconBlocksByRange` response advances the `safeSlot`, but later
has errors, the sync queue keeps repeating that same request until it is
fulfilled without errors. Data up through `safeSlot` is considered to be
immutable, i.e., finalized, so re-requesting that data is not useful.
By advancing the sync progress in that scenario, those redundant query
portions can be avoided. Note, the finalized block _itself_ is always
requested, even in the initial request. This behaviour is kept same.
											
										
										
											2022-03-15 17:56:56 +00:00
+								proc handlePotentialSafeSlotAdvancement[T](sq: SyncQueue[T]) =
 								  # It may happen that sync progress advanced to a newer `safeSlot`, either
 								  # by a response that started with good values and only had errors late, or
 								  # through an out-of-band mechanism, e.g., VC / REST.
 								  # If that happens, advance to the new `safeSlot` to avoid repeating requests
 								  # for data that is considered immutable and no longer relevant.
-												never request blocks before `safeSlot` in sync (#3512)

Follows up on https://github.com/status-im/nimbus-eth2/pull/3461 which
ensured that repeated `beaconBlocksByRange` requests get shrunk to
account for potential out-of-band advancements to `safeSlot`, with
similar logic for the initial request.
											
										
										
											2022-05-10 11:46:14 +00:00
+								  let safeSlot = sq.getSafeSlot()
 								  func numSlotsBehindSafeSlot(slot: Slot): uint64 =
 								    ## Number of slots by which `slot` lags behind `safeSlot` in the
 								    ## queue's sync direction; zero when `slot` is not behind it.
 								    case sq.kind
 								    of SyncQueueKind.Forward:
 								      # Forward sync: a slot is behind when it is below `safeSlot`.
 								      if safeSlot > slot:
 								        safeSlot - slot
 								      else:
 								        0'u64
 								    of SyncQueueKind.Backward:
 								      # Backward sync: a slot is behind when it is above `safeSlot`.
 								      if slot > safeSlot:
 								        slot - safeSlot
 								      else:
 								        0'u64
-												avoid re-requesting finalized blocks during sync (#3461)

When a `beaconBlocksByRange` response advances the `safeSlot`, but later
has errors, the sync queue keeps repeating that same request until it is
fulfilled without errors. Data up through `safeSlot` is considered to be
immutable, i.e., finalized, so re-requesting that data is not useful.
By advancing the sync progress in that scenario, those redundant query
portions can be avoided. Note, the finalized block _itself_ is always
requested, even in the initial request. This behaviour is kept same.
											
										
										
											2022-03-15 17:56:56 +00:00
+								  let
-												never request blocks before `safeSlot` in sync (#3512)

Follows up on https://github.com/status-im/nimbus-eth2/pull/3461 which
ensured that repeated `beaconBlocksByRange` requests get shrunk to
account for potential out-of-band advancements to `safeSlot`, with
similar logic for the initial request.
											
										
										
											2022-05-10 11:46:14 +00:00
+								    numOutSlotsAdvanced = sq.outSlot.numSlotsBehindSafeSlot
 								    numInpSlotsAdvanced =
-												avoid re-requesting finalized blocks during sync (#3461)

When a `beaconBlocksByRange` response advances the `safeSlot`, but later
has errors, the sync queue keeps repeating that same request until it is
fulfilled without errors. Data up through `safeSlot` is considered to be
immutable, i.e., finalized, so re-requesting that data is not useful.
By advancing the sync progress in that scenario, those redundant query
portions can be avoided. Note, the finalized block _itself_ is always
requested, even in the initial request. This behaviour is kept same.
											
										
										
											2022-03-15 17:56:56 +00:00
+								      case sq.kind
 								      of SyncQueueKind.Forward:
-												never request blocks before `safeSlot` in sync (#3512)

Follows up on https://github.com/status-im/nimbus-eth2/pull/3461 which
ensured that repeated `beaconBlocksByRange` requests get shrunk to
account for potential out-of-band advancements to `safeSlot`, with
similar logic for the initial request.
											
										
										
											2022-05-10 11:46:14 +00:00
+								        sq.inpSlot.numSlotsBehindSafeSlot
-												avoid re-requesting finalized blocks during sync (#3461)

When a `beaconBlocksByRange` response advances the `safeSlot`, but later
has errors, the sync queue keeps repeating that same request until it is
fulfilled without errors. Data up through `safeSlot` is considered to be
immutable, i.e., finalized, so re-requesting that data is not useful.
By advancing the sync progress in that scenario, those redundant query
portions can be avoided. Note, the finalized block _itself_ is always
requested, even in the initial request. This behaviour is kept same.
											
										
										
											2022-03-15 17:56:56 +00:00
+								      of SyncQueueKind.Backward:
-												never request blocks before `safeSlot` in sync (#3512)

Follows up on https://github.com/status-im/nimbus-eth2/pull/3461 which
ensured that repeated `beaconBlocksByRange` requests get shrunk to
account for potential out-of-band advancements to `safeSlot`, with
similar logic for the initial request.
											
										
										
											2022-05-10 11:46:14 +00:00
+								        if sq.inpSlot == 0xFFFF_FFFF_FFFF_FFFF'u64:
 								          0'u64
-												avoid re-requesting finalized blocks during sync (#3461)

When a `beaconBlocksByRange` response advances the `safeSlot`, but later
has errors, the sync queue keeps repeating that same request until it is
fulfilled without errors. Data up through `safeSlot` is considered to be
immutable, i.e., finalized, so re-requesting that data is not useful.
By advancing the sync progress in that scenario, those redundant query
portions can be avoided. Note, the finalized block _itself_ is always
requested, even in the initial request. This behaviour is kept same.
											
										
										
											2022-03-15 17:56:56 +00:00
+								        else:
-												never request blocks before `safeSlot` in sync (#3512)

Follows up on https://github.com/status-im/nimbus-eth2/pull/3461 which
ensured that repeated `beaconBlocksByRange` requests get shrunk to
account for potential out-of-band advancements to `safeSlot`, with
similar logic for the initial request.
											
										
										
											2022-05-10 11:46:14 +00:00
+								          sq.inpSlot.numSlotsBehindSafeSlot
 								  if numOutSlotsAdvanced != 0 or numInpSlotsAdvanced != 0:
-												avoid re-requesting finalized blocks during sync (#3461)

When a `beaconBlocksByRange` response advances the `safeSlot`, but later
has errors, the sync queue keeps repeating that same request until it is
fulfilled without errors. Data up through `safeSlot` is considered to be
immutable, i.e., finalized, so re-requesting that data is not useful.
By advancing the sync progress in that scenario, those redundant query
portions can be avoided. Note, the finalized block _itself_ is always
requested, even in the initial request. This behaviour is kept same.
											
										
										
											2022-03-15 17:56:56 +00:00
+								    debug "Sync progress advanced out-of-band",
-												never request blocks before `safeSlot` in sync (#3512)

Follows up on https://github.com/status-im/nimbus-eth2/pull/3461 which
ensured that repeated `beaconBlocksByRange` requests get shrunk to
account for potential out-of-band advancements to `safeSlot`, with
similar logic for the initial request.
											
										
										
											2022-05-10 11:46:14 +00:00
+								      safeSlot, outSlot = sq.outSlot, inpSlot = sq.inpSlot
 								    if numOutSlotsAdvanced != 0:
 								      sq.advanceOutput(numOutSlotsAdvanced)
 								    if numInpSlotsAdvanced != 0:
 								      sq.advanceInput(numInpSlotsAdvanced)
-												avoid re-requesting finalized blocks during sync (#3461)

When a `beaconBlocksByRange` response advances the `safeSlot`, but later
has errors, the sync queue keeps repeating that same request until it is
fulfilled without errors. Data up through `safeSlot` is considered to be
immutable, i.e., finalized, so re-requesting that data is not useful.
By advancing the sync progress in that scenario, those redundant query
portions can be avoided. Note, the finalized block _itself_ is always
requested, even in the initial request. This behaviour is kept same.
											
										
										
											2022-03-15 17:56:56 +00:00
+								    sq.wakeupWaiters()
 								func updateRequestForNewSafeSlot[T](sq: SyncQueue[T], sr: var SyncRequest[T]) =
 								  ## Shrink `sr` in place so that it does not cover slots the queue's
 								  ## output position (`sq.outSlot`) has already moved past. A request may
 								  ## have been created before the latest `safeSlot` advancement.
 								  ##
 								  ## * Fully relevant request: left untouched.
 								  ## * Partially relevant request: `count` is reduced by the number of
 								  ##   steps already covered (rounded up); for forward sync `slot` is
 								  ##   moved forward by the same number of steps.
 								  ## * Irrelevant request: `step` and `count` are both set to 0.
 								  let
 								    progressSlot = sq.outSlot
 								    firstCovered = sr.slot
 								    lastCovered = sr.lastSlot

 								  template stepsCovering(slotsDone: untyped): untyped =
 								    # Number of request steps needed to cover `slotsDone` slots, rounded up.
 								    (slotsDone + sr.step - 1) div sr.step

 								  template markIrrelevant() =
 								    # Nothing of the request is needed any more.
 								    sr.step = 0
 								    sr.count = 0

 								  case sq.kind
 								  of SyncQueueKind.Forward:
 								    # While the output position is at or below the first requested slot
 								    # the entire request is still relevant and stays as it is.
 								    if progressSlot > firstCovered:
 								      if progressSlot <= lastCovered:
 								        # Only the upper part of the request is still relevant.
 								        let stepsDone = stepsCovering(progressSlot - firstCovered)
 								        sr.slot += stepsDone * sr.step
 								        sr.count -= stepsDone
 								      else:
 								        markIrrelevant()
 								  of SyncQueueKind.Backward:
 								    # While the output position is at or above the last requested slot
 								    # the entire request is still relevant and stays as it is.
 								    if progressSlot < lastCovered:
 								      if progressSlot >= firstCovered:
 								        # Only the lower part of the request is still relevant.
 								        let stepsDone = stepsCovering(lastCovered - progressSlot)
 								        sr.count -= stepsDone
 								      else:
 								        markIrrelevant()
-												Backward sync support for SyncManager. (#3131)

* Unbundle SyncQueue from sync_manager.nim.
Unbundle Peer scores constants to peer_scores.nim.
Add Forward/Backward enum.

* Further improvements and tests.

* Adopt getRewindPoint() and fix MissingParent handler.

* Remove unused procedures.
Refactor `result` usage.
Fix resetWait().

* Add all the tests and fix the issue with rewind point.

* Fix get() issue.

* Fix flaky tests.

* test fixes

Co-authored-by: Jacek Sieka <jacek@status.im>
											
										
										
											2021-12-08 21:15:29 +00:00
+								proc pop*[T](sq: SyncQueue[T], maxslot: Slot, item: T): SyncRequest[T] =
 								  ## Create new request according to current SyncQueue parameters.
 								  ##
 								  ## Order of operations:
 								  ## 1. `handlePotentialSafeSlotAdvancement` is invoked on the queue.
 								  ## 2. Requests waiting in `debtsQueue` are served first; each one is
 								  ##    passed through `updateRequestForNewSafeSlot` and skipped when it
 								  ##    ends up empty.
 								  ## 3. Otherwise a fresh request of at most `chunkSize` slots is built
 								  ##    from `inpSlot`, and the input position is advanced by its size.
 								  ##
 								  ## `maxslot` is, per the inline comments below, the peer's latest slot.
 								  ## `item` is attached to the request that gets returned.
 								  ##
 								  ## Returns `SyncRequest.empty(sq.kind, T)` when no request can be
 								  ## produced. Every non-empty request returned from here has been
 								  ## passed to `makePending` beforehand.
-												avoid re-requesting finalized blocks during sync (#3461)

When a `beaconBlocksByRange` response advances the `safeSlot`, but later
has errors, the sync queue keeps repeating that same request until it is
fulfilled without errors. Data up through `safeSlot` is considered to be
immutable, i.e., finalized, so re-requesting that data is not useful.
By advancing the sync progress in that scenario, those redundant query
portions can be avoided. Note, the finalized block _itself_ is always
requested, even in the initial request. This behaviour is kept same.
											
										
										
											2022-03-15 17:56:56 +00:00
+								  sq.handlePotentialSafeSlotAdvancement()
 								  while len(sq.debtsQueue) > 0:
-												enable `styleCheck:usages` (#3573)

Some upstream repos still need fixes, but this gets us close enough that
style hints can be enabled by default.

In general, "canonical" spellings are preferred even if they violate
nep-1 - this applies in particular to spec-related stuff like
`genesis_validators_root` which appears throughout the codebase.

											
										
										
											2022-04-08 16:22:49 +00:00
+								    if maxslot < sq.debtsQueue[0].slot:
-												Backward sync support for SyncManager. (#3131)

* Unbundle SyncQueue from sync_manager.nim.
Unbundle Peer scores constants to peer_scores.nim.
Add Forward/Backward enum.

* Further improvements and tests.

* Adopt getRewindPoint() and fix MissingParent handler.

* Remove unused procedures.
Refactor `result` usage.
Fix resetWait().

* Add all the tests and fix the issue with rewind point.

* Fix get() issue.

* Fix flaky tests.

* test fixes

Co-authored-by: Jacek Sieka <jacek@status.im>
											
										
										
											2021-12-08 21:15:29 +00:00
+								      # Peer's latest slot is less than starting request's slot.
 								      return SyncRequest.empty(sq.kind, T)
-												enable `styleCheck:usages` (#3573)

Some upstream repos still need fixes, but this gets us close enough that
style hints can be enabled by default.

In general, "canonical" spellings are preferred even if they violate
nep-1 - this applies in particular to spec-related stuff like
`genesis_validators_root` which appears throughout the codebase.

											
										
										
											2022-04-08 16:22:49 +00:00
+								    if maxslot < sq.debtsQueue[0].lastSlot():
-												Backward sync support for SyncManager. (#3131)

* Unbundle SyncQueue from sync_manager.nim.
Unbundle Peer scores constants to peer_scores.nim.
Add Forward/Backward enum.

* Further improvements and tests.

* Adopt getRewindPoint() and fix MissingParent handler.

* Remove unused procedures.
Refactor `result` usage.
Fix resetWait().

* Add all the tests and fix the issue with rewind point.

* Fix get() issue.

* Fix flaky tests.

* test fixes

Co-authored-by: Jacek Sieka <jacek@status.im>
											
										
										
											2021-12-08 21:15:29 +00:00
+								      # Peer's latest slot is less than finishing request's slot.
 								      return SyncRequest.empty(sq.kind, T)
 								    var sr = sq.debtsQueue.pop()
 								    # The request has left the debts queue, so its slots no longer count
 								    # as debt. This uses the count from before the safe-slot update below.
 								    sq.debtsCount = sq.debtsCount - sr.count
-												avoid re-requesting finalized blocks during sync (#3461)

When a `beaconBlocksByRange` response advances the `safeSlot`, but later
has errors, the sync queue keeps repeating that same request until it is
fulfilled without errors. Data up through `safeSlot` is considered to be
immutable, i.e., finalized, so re-requesting that data is not useful.
By advancing the sync progress in that scenario, those redundant query
portions can be avoided. Note, the finalized block _itself_ is always
requested, even in the initial request. This behaviour is kept same.
											
										
										
											2022-03-15 17:56:56 +00:00
+								    sq.updateRequestForNewSafeSlot(sr)
 								    if sr.isEmpty:
 								      # Nothing is left of this request after the update; try the next debt.
 								      continue
-												Backward sync support for SyncManager. (#3131)

* Unbundle SyncQueue from sync_manager.nim.
Unbundle Peer scores constants to peer_scores.nim.
Add Forward/Backward enum.

* Further improvements and tests.

* Adopt getRewindPoint() and fix MissingParent handler.

* Remove unused procedures.
Refactor `result` usage.
Fix resetWait().

* Add all the tests and fix the issue with rewind point.

* Fix get() issue.

* Fix flaky tests.

* test fixes

Co-authored-by: Jacek Sieka <jacek@status.im>
											
										
										
											2021-12-08 21:15:29 +00:00
+								    sr.setItem(item)
 								    sq.makePending(sr)
-												avoid re-requesting finalized blocks during sync (#3461)

When a `beaconBlocksByRange` response advances the `safeSlot`, but later
has errors, the sync queue keeps repeating that same request until it is
fulfilled without errors. Data up through `safeSlot` is considered to be
immutable, i.e., finalized, so re-requesting that data is not useful.
By advancing the sync progress in that scenario, those redundant query
portions can be avoided. Note, the finalized block _itself_ is always
requested, even in the initial request. This behaviour is kept same.
											
										
										
											2022-03-15 17:56:56 +00:00
+								    return sr
 								  # No debt could be served: build a brand new request from `inpSlot`.
 								  case sq.kind
 								  of SyncQueueKind.Forward:
-												enable `styleCheck:usages` (#3573)

Some upstream repos still need fixes, but this gets us close enough that
style hints can be enabled by default.

In general, "canonical" spellings are preferred even if they violate
nep-1 - this applies in particular to spec-related stuff like
`genesis_validators_root` which appears throughout the codebase.

											
										
										
											2022-04-08 16:22:49 +00:00
+								    if maxslot < sq.inpSlot:
-												avoid re-requesting finalized blocks during sync (#3461)

When a `beaconBlocksByRange` response advances the `safeSlot`, but later
has errors, the sync queue keeps repeating that same request until it is
fulfilled without errors. Data up through `safeSlot` is considered to be
immutable, i.e., finalized, so re-requesting that data is not useful.
By advancing the sync progress in that scenario, those redundant query
portions can be avoided. Note, the finalized block _itself_ is always
requested, even in the initial request. This behaviour is kept same.
											
										
										
											2022-03-15 17:56:56 +00:00
+								      # Peer's latest slot is less than queue's input slot.
 								      return SyncRequest.empty(sq.kind, T)
 								    if sq.inpSlot > sq.finalSlot:
 								      # Queue's input slot is bigger than queue's final slot.
 								      return SyncRequest.empty(sq.kind, T)
 								    # Request up to `chunkSize` slots starting at `inpSlot`, but never
 								    # beyond `maxslot` or `finalSlot`, whichever is lower.
 								    let lastSlot = min(maxslot, sq.finalSlot)
 								    let count = min(sq.chunkSize, lastSlot + 1'u64 - sq.inpSlot)
 								    var sr = SyncRequest.init(sq.kind, sq.inpSlot, count, item)
 								    sq.advanceInput(count)
 								    sq.makePending(sr)
 								    sr
 								  of SyncQueueKind.Backward:
 								    # NOTE(review): an all-ones `inpSlot` presumably marks an input
 								    # position that wrapped around below slot 0 - confirm against
 								    # `advanceInput`.
 								    if sq.inpSlot == 0xFFFF_FFFF_FFFF_FFFF'u64:
 								      return SyncRequest.empty(sq.kind, T)
 								    if sq.inpSlot < sq.finalSlot:
 								      # Queue's input slot is already below queue's final slot.
 								      return SyncRequest.empty(sq.kind, T)
 								    # Pick the `chunkSize` slots that end at `inpSlot`, or all slots from
 								    # `finalSlot` through `inpSlot` when fewer than `chunkSize` remain.
 								    let (slot, count) =
 								      block:
 								        let baseSlot = sq.inpSlot + 1'u64
 								        if baseSlot - sq.finalSlot < sq.chunkSize:
 								          let count = uint64(baseSlot - sq.finalSlot)
 								          (baseSlot - count, count)
 								        else:
 								          (baseSlot - sq.chunkSize, sq.chunkSize)
-												enable `styleCheck:usages` (#3573)

Some upstream repos still need fixes, but this gets us close enough that
style hints can be enabled by default.

In general, "canonical" spellings are preferred even if they violate
nep-1 - this applies in particular to spec-related stuff like
`genesis_validators_root` which appears throughout the codebase.

											
										
										
											2022-04-08 16:22:49 +00:00
+								    if (maxslot + 1'u64) < slot + count:
-												avoid re-requesting finalized blocks during sync (#3461)

When a `beaconBlocksByRange` response advances the `safeSlot`, but later
has errors, the sync queue keeps repeating that same request until it is
fulfilled without errors. Data up through `safeSlot` is considered to be
immutable, i.e., finalized, so re-requesting that data is not useful.
By advancing the sync progress in that scenario, those redundant query
portions can be avoided. Note, the finalized block _itself_ is always
requested, even in the initial request. This behaviour is kept same.
											
										
										
											2022-03-15 17:56:56 +00:00
+								      # Peer's latest slot is less than queue's input slot.
 								      return SyncRequest.empty(sq.kind, T)
 								    var sr = SyncRequest.init(sq.kind, slot, count, item)
 								    sq.advanceInput(count)
 								    sq.makePending(sr)
-												Backward sync support for SyncManager. (#3131)

* Unbundle SyncQueue from sync_manager.nim.
Unbundle Peer scores constants to peer_scores.nim.
Add Forward/Backward enum.

* Further improvements and tests.

* Adopt getRewindPoint() and fix MissingParent handler.

* Remove unused procedures.
Refactor `result` usage.
Fix resetWait().

* Add all the tests and fix the issue with rewind point.

* Fix get() issue.

* Fix flaky tests.

* test fixes

Co-authored-by: Jacek Sieka <jacek@status.im>
											
										
										
											2021-12-08 21:15:29 +00:00
+								    sr
 								proc debtLen*[T](sq: SyncQueue[T]): uint64 =
 								  ## Returns the current value of the queue's `debtsCount` counter.
 								  result = sq.debtsCount
 								proc pendingLen*[T](sq: SyncQueue[T]): uint64 =
 								  ## Returns the distance, in slots, between the input and the output
 								  ## positions of queue ``sq``.
 								  result =
 								    case sq.kind
 								    of SyncQueueKind.Forward:
 								      # Forward sync: `outSlot` stays <= `inpSlot`.
 								      sq.inpSlot - sq.outSlot
 								    of SyncQueueKind.Backward:
 								      # Backward sync: `outSlot` stays >= `inpSlot`.
 								      sq.outSlot - sq.inpSlot
 								proc len*[T](sq: SyncQueue[T]): uint64 {.inline.} =
 								  ## Number of slots still left in queue ``sq``: the inclusive span
 								  ## between `outSlot` and `finalSlot`, measured in the sync direction.
 								  let (upperSlot, lowerSlot) =
 								    case sq.kind
 								    of SyncQueueKind.Forward: (sq.finalSlot, sq.outSlot)
 								    of SyncQueueKind.Backward: (sq.outSlot, sq.finalSlot)
 								  upperSlot + 1'u64 - lowerSlot
 								proc total*[T](sq: SyncQueue[T]): uint64 {.inline.} =
 								  ## Total number of slots covered by queue ``sq``: the inclusive span
 								  ## between `startSlot` and `finalSlot`, measured in the sync direction.
 								  let (upperSlot, lowerSlot) =
 								    case sq.kind
 								    of SyncQueueKind.Forward: (sq.finalSlot, sq.startSlot)
 								    of SyncQueueKind.Backward: (sq.startSlot, sq.finalSlot)
 								  upperSlot + 1'u64 - lowerSlot
 								proc progress*[T](sq: SyncQueue[T]): uint64 =
-												SyncManager cleanups for backfill support (#3189)

* SyncManager cleanups for backfill support

Cleanups, fixes and simplifications, in anticipation of backfill support
for the `SyncManager`:

* reformat sync progress indicator to show time left and % done more
prominently:
  * old: `sync="sPssPsssss:2:2.4229:00h57m (2706898)"`
  * new: `sync="14d12h31m (0.52%) 1.1378slots/s (wQQQQQDDQQ:1287520)"`
* reset average speed when going out of sync
* pass all block errors to sync manager, including duplicate/unviable
* penalize peers for reporting a head block that is outside of our
expected wall clock time (they're likely on a different network or
trying to disrupt sync)
* remove `SyncFailureKind` (unused)
* remove `inRange` (unused)
* add `Q` for sync queue requests that are in the `SyncQueue` but not
yet in the `BlockProcessor` queue
* update last slot in `SyncQueue` after getting peer status
* fix race condition between `wakeupWaiters` and `resetWait`, where
workers would not be correctly reset if block verification returned a
completed future without event loop
* log syncmanager direction

* Fix ordering issue.
Some of the requests size of which are not equal to `chunkSize` could be processed in wrong order which could lead to sync process freezes.

Co-authored-by: cheatfate <eugene.kabanov@status.im>
											
										
										
											2021-12-16 14:57:16 +00:00
+								  ## How many slots we've synced so far
 								  ##
 								  ## Computed as the distance between `startSlot` and the current
 								  ## `outSlot`, measured in the direction the queue is syncing.
 								  case sq.kind
 								  of SyncQueueKind.Forward:
 								    # Forward: slots from `startSlot` up to `outSlot`.
 								    sq.outSlot - sq.startSlot
 								  of SyncQueueKind.Backward:
 								    # Backward: slots from `startSlot` down to `outSlot`.
 								    sq.startSlot - sq.outSlot