Fix trie inspector for healing (#1284)

* Update logging * Fix node hash associated with partial path for missing nodes why: Healing uses the partial paths for fetching nodes from the network. The node hash (or key) is used to verify the node data retrieved. The trie inspector function returned the parent hash instead of the node hash with the partial path when a missing node was detected. So all nodes for healing were rejected. * Must not modify sequence while looping over it
2022-10-28 08:26:17 +01:00 · 2022-10-28 08:26:17 +01:00 · a8df4c1165
parent 1b4572ed3b
commit a8df4c1165
9 changed files with 66 additions and 49 deletions
--- a/nimbus/sync/snap/range_desc.nim
+++ b/nimbus/sync/snap/range_desc.nim
@ -118,6 +118,10 @@ proc to*(n: SomeUnsignedInt|UInt256; T: type NodeTag): T =
  n.u256.T


+proc digestTo*(data: Blob; T: type NodeKey): T =
+  keccakHash(data).data.T
+
+
 proc hash*(a: NodeKey): Hash =
  ## Table/KeyedQueue mixin
  a.ByteArray32.hash
@ -293,6 +297,26 @@ proc `$`*(a, b: NodeTag): string =
 proc `$`*(iv: NodeTagRange): string =
  leafRangePp(iv.minPt, iv.maxPt)

+proc `$`*(n: NodeSpecs): string =
+  ## Prints `(path,key,node-hash)`
+  let nHash = if n.data.len == 0: NodeKey.default
+              else: n.data.digestTo(NodeKey)
+  result = "("
+  if n.partialPath.len != 0:
+    result &= n.partialPath.toHex
+  result &= ","
+  if n.nodeKey != NodeKey.default:
+    result &= $n.nodeKey
+    if n.nodeKey != nHash:
+      result &= "(!)"
+  result &= ","
+  if nHash != NodeKey.default:
+    if n.nodeKey != nHash:
+      result &= $nHash
+    else:
+      result &= "ditto"
+  result &= ")"
+
 # ------------------------------------------------------------------------------
 # End
 # ------------------------------------------------------------------------------
--- a/nimbus/sync/snap/worker/db/hexary_desc.nim
+++ b/nimbus/sync/snap/worker/db/hexary_desc.nim
@ -378,9 +378,6 @@ proc isZero*[T: NodeTag|NodeKey|RepairKey](a: T): bool =
 proc isNodeKey*(a: RepairKey): bool =
  a.ByteArray33[0] == 0

-proc digestTo*(data: Blob; T: type NodeKey): T =
-  keccakHash(data).data.T
-
 proc convertTo*(data: Blob; T: type NodeKey): T =
  ## Probably lossy conversion, use `init()` for safe conversion
  discard result.init(data)
--- a/nimbus/sync/snap/worker/db/hexary_inspect.nim
+++ b/nimbus/sync/snap/worker/db/hexary_inspect.nim
@ -116,7 +116,6 @@ proc processLink(
    db: HexaryTreeDbRef;
    stats: var TrieNodeStat;
    inspect: TableRef[RepairKey,NibblesSeq];
-    parent: NodeKey;
    trail: NibblesSeq;
    child: RepairKey;
      ) {.gcsafe, raises: [Defect,KeyError]} =
@ -126,35 +125,29 @@ proc processLink(
      # Oops -- caught in the middle of a repair process? Just register
      # this node
      stats.dangling.add NodeSpecs(
-        partialPath: trail.hexPrefixEncode(isLeaf = false),
-        nodeKey:     parent)
+        partialPath: trail.hexPrefixEncode(isLeaf = false))
    elif db.tab.hasKey(child):
      inspect[child] = trail
    else:
      stats.dangling.add NodeSpecs(
        partialPath: trail.hexPrefixEncode(isLeaf = false),
-        nodeKey:     parent)
+        nodeKey:     child.convertTo(NodeKey))

 proc processLink(
    getFn: HexaryGetFn;
    stats: var TrieNodeStat;
    inspect: TableRef[NodeKey,NibblesSeq];
-    parent: NodeKey;
    trail: NibblesSeq;
    child: Rlp;
      ) {.gcsafe, raises: [Defect,RlpError,KeyError]} =
  ## Ditto
  if not child.isEmpty:
-    let
-      #parentKey = parent.convertTo(NodeKey)
-      childBlob = child.toBytes
-
+    let childBlob = child.toBytes
    if childBlob.len != 32:
      # Oops -- that is wrong, although the only sensible action is to
      # register the node and otherwise ignore it
      stats.dangling.add NodeSpecs(
-        partialPath: trail.hexPrefixEncode(isLeaf = false),
-        nodeKey:     parent)
+        partialPath: trail.hexPrefixEncode(isLeaf = false))
    else:
      let childKey = childBlob.convertTo(NodeKey)
      if 0 < child.toBytes.getFn().len:
@ -162,7 +155,7 @@ proc processLink(
      else:
        stats.dangling.add NodeSpecs(
          partialPath: trail.hexPrefixEncode(isLeaf = false),
-          nodeKey:     parent)
+          nodeKey:     childKey)

 # ------------------------------------------------------------------------------
 # Public functions
@ -261,13 +254,13 @@ proc hexaryInspectTrie*(
        let
          trail = parentTrail & node.ePfx
          child = node.eLink
-        db.processLink(stats=result, inspect=again, parent, trail, child)
+        db.processLink(stats=result, inspect=again, trail, child)
      of Branch:
        for n in 0 ..< 16:
          let
            trail = parentTrail & @[n.byte].initNibbleRange.slice(1)
            child = node.bLink[n]
-          db.processLink(stats=result, inspect=again, parent, trail, child)
+          db.processLink(stats=result, inspect=again, trail, child)
      of Leaf:
        # Ooops, forget node and key
        discard
@ -335,13 +328,13 @@ proc hexaryInspectTrie*(
          let
            trail = parentTrail & xPfx
            child = nodeRlp.listElem(1)
-          getFn.processLink(stats=result, inspect=again, parent, trail, child)
+          getFn.processLink(stats=result, inspect=again, trail, child)
      of 17:
        for n in 0 ..< 16:
          let
            trail = parentTrail & @[n.byte].initNibbleRange.slice(1)
            child = nodeRlp.listElem(n)
-          getFn.processLink(stats=result, inspect=again, parent, trail, child)
+          getFn.processLink(stats=result, inspect=again, trail, child)
      else:
        # Ooops, forget node and key
        discard
--- a/nimbus/sync/snap/worker/db/snapdb_accounts.nim
+++ b/nimbus/sync/snap/worker/db/snapdb_accounts.nim
@ -213,7 +213,7 @@ proc importAccounts*(
  except KeyError as e:
    raiseAssert "Not possible @ importAccounts: " & e.msg
  except OSError as e:
-    trace "Import Accounts exception", peer=ps.peer, name=($e.name), msg=e.msg
+    error "Import Accounts exception", peer=ps.peer, name=($e.name), msg=e.msg
    return err(OSErrorException)

  when extraTraceMessages:
@ -262,10 +262,10 @@ proc importRawAccountsNodes*(
    slot: Option[int]
  try:
    # Import nodes
-    for n,rec in nodes:
-      if 0 < rec.data.len: # otherwise ignore empty placeholder
+    for n,node in nodes:
+      if 0 < node.data.len: # otherwise ignore empty placeholder
        slot = some(n)
-        var rep = db.hexaryImport(rec)
+        var rep = db.hexaryImport(node)
        if rep.error != NothingSerious:
          rep.slot = slot
          result.add rep
@ -293,7 +293,7 @@ proc importRawAccountsNodes*(
  except OSError as e:
    result.add HexaryNodeReport(slot: slot, error: OSErrorException)
    nErrors.inc
-    trace "Import account nodes exception", peer, slot, nItems,
+    error "Import account nodes exception", peer, slot, nItems,
      name=($e.name), msg=e.msg, nErrors

  when extraTraceMessages:
--- a/nimbus/sync/snap/worker/db/snapdb_storage_slots.nim
+++ b/nimbus/sync/snap/worker/db/snapdb_storage_slots.nim
@ -248,7 +248,7 @@ proc importStorageSlots*(
      raiseAssert "Not possible @ importStorages: " & e.msg
    except OSError as e:
      result.add HexaryNodeReport(slot: itemInx, error: OSErrorException)
-      trace "Import storage slots exception", peer, itemInx, nItems,
+      error "Import storage slots exception", peer, itemInx, nItems,
        name=($e.name), msg=e.msg, nErrors=result.len

  when extraTraceMessages:
@ -295,10 +295,10 @@ proc importRawStorageSlotsNodes*(
    slot: Option[int]
  try:
    # Import nodes
-    for n,rec in nodes:
-      if 0 < rec.data.len: # otherwise ignore empty placeholder
+    for n,node in nodes:
+      if 0 < node.data.len: # otherwise ignore empty placeholder
        slot = some(n)
-        var rep = db.hexaryImport(rec)
+        var rep = db.hexaryImport(node)
        if rep.error != NothingSerious:
          rep.slot = slot
          result.add rep
@ -326,7 +326,7 @@ proc importRawStorageSlotsNodes*(
  except OSError as e:
    result.add HexaryNodeReport(slot: slot, error: OSErrorException)
    nErrors.inc
-    trace "Import storage slots nodes exception", peer, slot, nItems,
+    error "Import storage slots nodes exception", peer, slot, nItems,
      name=($e.name), msg=e.msg, nErrors

  when extraTraceMessages:
--- a/nimbus/sync/snap/worker/heal_accounts.nim
+++ b/nimbus/sync/snap/worker/heal_accounts.nim
@ -158,6 +158,7 @@ proc updateMissingNodesList(buddy: SnapBuddyRef) =
    env = buddy.data.pivotEnv
    stateRoot = env.stateHeader.stateRoot

+  var delayed: seq[NodeSpecs]
  for w in env.fetchAccounts.missingNodes:
    let rc = db.getAccountsNodeKey(peer, stateRoot, w.partialPath)
    if rc.isOk:
@ -165,7 +166,10 @@ proc updateMissingNodesList(buddy: SnapBuddyRef) =
      env.fetchAccounts.checkNodes.add w.partialPath
    else:
      # Node is still missing
-      env.fetchAccounts.missingNodes.add w
+      delayed.add w
+
+  # Must not modify sequence while looping over it
+  env.fetchAccounts.missingNodes = env.fetchAccounts.missingNodes & delayed


 proc appendMoreDanglingNodesToMissingNodesList(buddy: SnapBuddyRef): bool =
@ -247,12 +251,12 @@ proc getMissingNodesFromNetwork(
  if await buddy.ctrl.stopAfterSeriousComError(error, buddy.data.errors):
    discard
    when extraTraceMessages:
-      trace logTxt "error fetching nodes => stop", peer,
+      trace logTxt "fetch nodes error => stop", peer,
        ctx=buddy.healingCtx(), error
  else:
    discard
    when extraTraceMessages:
-      trace logTxt "error fetching nodes", peer,
+      trace logTxt "fetch nodes error", peer,
        ctx=buddy.healingCtx(), error

  return @[]
@ -311,10 +315,6 @@ proc registerAccountLeaf(
      acckey:      accKey,
      storageRoot: acc.storageRoot)

-  when extraTraceMessages:
-    trace logTxt "single node", peer,
-      ctx=buddy.healingCtx(), accKey=pt
-
 # ------------------------------------------------------------------------------
 # Public functions
 # ------------------------------------------------------------------------------
@ -379,6 +379,7 @@ proc healAccounts*(buddy: SnapBuddyRef) {.async.} =
      ctx=buddy.healingCtx(), nNodes=nodeSpecs.len

  # Filter out error and leaf nodes
+  var nLeafNodes = 0 # for logging
  for w in report:
    if w.slot.isSome: # non-indexed entries appear typically at the end, though
      let
@ -399,11 +400,12 @@ proc healAccounts*(buddy: SnapBuddyRef) {.async.} =
        if isLeaf:
          # Update `uprocessed` registry, collect storage roots (if any)
          buddy.registerAccountLeaf(key, acc)
+          nLeafNodes.inc
        else:
          env.fetchAccounts.checkNodes.add nodePath

  when extraTraceMessages:
-    trace logTxt "job done", peer, ctx=buddy.healingCtx()
+    trace logTxt "job done", peer, ctx=buddy.healingCtx(), nLeafNodes

 # ------------------------------------------------------------------------------
 # End
--- a/nimbus/sync/snap/worker/heal_storage_slots.nim
+++ b/nimbus/sync/snap/worker/heal_storage_slots.nim
@ -97,6 +97,7 @@ proc updateMissingNodesList(
    storageRoot = kvp.key
    slots = kvp.data.slots

+  var delayed: seq[NodeSpecs]
  for w in slots.missingNodes:
    let rc = db.getStorageSlotsNodeKey(peer, accKey, storageRoot, w.partialPath)
    if rc.isOk:
@ -104,7 +105,10 @@ proc updateMissingNodesList(
      slots.checkNodes.add w.partialPath
    else:
      # Node is still missing
-      slots.missingNodes.add w
+      delayed.add w
+
+  # Must not modify sequence while looping over it
+  slots.missingNodes = slots.missingNodes & delayed


 proc appendMoreDanglingNodesToMissingNodesList(
@ -197,13 +201,13 @@ proc getMissingNodesFromNetwork(
  if await buddy.ctrl.stopAfterSeriousComError(error, buddy.data.errors):
    discard
    when extraTraceMessages:
-      trace logTxt "error fetching nodes => stop", peer,
+      trace logTxt "fetch nodes error => stop", peer,
        itCtx=buddy.healingCtx(kvp), nSlotLists=env.nSlotLists,
        nStorageQueue=env.fetchStorage.len, error
  else:
    discard
    when extraTraceMessages:
-      trace logTxt "error fetching nodes", peer,
+      trace logTxt "fetch nodes error", peer,
        itCtx=buddy.healingCtx(kvp), nSlotLists=env.nSlotLists,
        nStorageQueue=env.fetchStorage.len, error

@ -228,11 +232,6 @@ proc kvStorageSlotsLeaf(
  if nibbles.len == 64:
    return (true, nibbles.getBytes.convertTo(NodeKey))

-  when extraTraceMessages:
-    trace logTxt "non-leaf node path", peer,
-      itCtx=buddy.healingCtx(kvp), nSlotLists=env.nSlotLists,
-      nStorageQueue=env.fetchStorage.len, nNibbles=nibbles.len
-

 proc registerStorageSlotsLeaf(
    buddy: SnapBuddyRef;
@ -321,6 +320,7 @@ proc storageSlotsHealing(
      nStorageQueue=env.fetchStorage.len, nNodes=nodeSpecs.len

  # Filter out error and leaf nodes
+  var nLeafNodes = 0 # for logging
  for w in report:
    if w.slot.isSome: # non-indexed entries appear typically at the end, though
      let
@ -342,13 +342,14 @@ proc storageSlotsHealing(
        if isLeaf:
          # Update `uprocessed` registry, collect storage roots (if any)
          buddy.registerStorageSlotsLeaf(kvp, slotKey)
+          nLeafNodes.inc
        else:
          slots.checkNodes.add nodePath

  when extraTraceMessages:
    trace logTxt "job done", peer,
      itCtx=buddy.healingCtx(kvp), nSlotLists=env.nSlotLists,
-      nStorageQueue=env.fetchStorage.len
+      nStorageQueue=env.fetchStorage.len, nLeafNodes


 proc healingIsComplete(
--- a/nimbus/sync/snap/worker/range_fetch_accounts.nim
+++ b/nimbus/sync/snap/worker/range_fetch_accounts.nim
@ -130,7 +130,7 @@ proc rangeFetchAccounts*(buddy: SnapBuddyRef) {.async.} =
      let error = rc.error
      if await buddy.ctrl.stopAfterSeriousComError(error, buddy.data.errors):
        when extraTraceMessages:
-          trace logTxt "fetching error => stop", peer,
+          trace logTxt "fetch error => stop", peer,
            stateRoot, req=iv.len, error
      return
    # Reset error counts for detecting repeated timeouts
--- a/nimbus/sync/snap/worker/range_fetch_storage_slots.nim
+++ b/nimbus/sync/snap/worker/range_fetch_storage_slots.nim
@ -115,7 +115,7 @@ proc getNextSlotItems(
            env.fetchStorage.del(reqKey)

          when extraTraceMessages:
-            trace logTxt "prepare fetching partial", peer,
+            trace logTxt "prepare fetch partial", peer,
              nSlotLists=env.nSlotLists, nStorageQueue=env.fetchStorage.len,
              nToProcess=1, subRange=rc.value, account=reqData.accKey

@ -189,7 +189,7 @@ proc storeStoragesSingleBatch(
      let error = rc.error
      if await buddy.ctrl.stopAfterSeriousComError(error, buddy.data.errors):
        discard
-        trace logTxt "fetching error => stop", peer,
+        trace logTxt "fetch error => stop", peer,
          nSlotLists=env.nSlotLists, nReq=req.len,
          nStorageQueue=env.fetchStorage.len, error
      return