Lower the content deletion fraction and other contentdb clean-up (#1893)

- The current setting of 0.25 was very big. Set to 0.05, which is
potentially still large. This change did expose some issues with
the current implementation and especially testing.
- General clean-up, renaming for consistency, and
re-ordering/restructuring/deletion of some code.
- Fixed several typos
- ...
This commit is contained in:
Kim De Mey 2023-11-16 15:27:30 +01:00 committed by GitHub
parent 86f93fdba5
commit 6338969dd5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 107 additions and 98 deletions

View File

@ -33,24 +33,23 @@ export kvstore_sqlite3
# thus depending on the network the right db needs to be selected.
declareCounter portal_pruning_counter,
"Number of pruning event which happened during node lifetime",
"Number of pruning events which occurred during the node's uptime",
labels = ["protocol_id"]
declareGauge portal_pruning_deleted_elements,
"Number of elements delted in last pruning",
"Number of elements deleted in the last pruning",
labels = ["protocol_id"]
const
contentDeletionFraction = 0.05 ## 5% of the content will be deleted when the
## storage capacity is hit and radius gets adjusted.
type
RowInfo = tuple
contentId: array[32, byte]
payloadLength: int64
distance: array[32, byte]
ObjInfo* = object
contentId*: array[32, byte]
payloadLength*: int64
distFrom*: UInt256
ContentDB* = ref object
kv: KvStoreRef
storageCapacity*: uint64
@ -69,9 +68,9 @@ type
of ContentStored:
discard
of DbPruned:
furthestStoredElementDistance*: UInt256
fractionOfDeletedContent*: float64
numOfDeletedElements*: int64
distanceOfFurthestElement*: UInt256
deletedFraction*: float64
deletedElements*: int64
func xorDistance(
a: openArray[byte],
@ -181,10 +180,38 @@ proc del(db: ContentDB, key: openArray[byte]) =
# TODO: Do we want to return the bool here too?
discard db.kv.del(key).expectDb()
proc getSszDecoded*(
proc getSszDecoded(
db: ContentDB, key: openArray[byte], T: type auto): Opt[T] =
db.kv.getSszDecoded(key, T)
## Public ContentId based ContentDB calls
# TODO: Could also decide to use the ContentKey SSZ bytestring, as this is what
# gets sent over the network in requests, but that would be a bigger key. Or the
# same hashing could be done on it here.
# However ContentId itself is already derived through different digests
# depending on the content type, and this ContentId typically needs to be
# checked with the Radius/distance of the node anyhow. So let's see how we end up
# using this mostly in the code.
proc get*(db: ContentDB, key: ContentId): Opt[seq[byte]] =
# TODO: Here it is unfortunate that ContentId is a uint256 instead of Digest256.
db.get(key.toBytesBE())
proc put*(db: ContentDB, key: ContentId, value: openArray[byte]) =
db.put(key.toBytesBE(), value)
proc contains*(db: ContentDB, key: ContentId): bool =
db.contains(key.toBytesBE())
proc del*(db: ContentDB, key: ContentId) =
db.del(key.toBytesBE())
proc getSszDecoded*(db: ContentDB, key: ContentId, T: type auto): Opt[T] =
db.getSszDecoded(key.toBytesBE(), T)
## Public database size, content and pruning related calls
proc reclaimSpace*(db: ContentDB): void =
## Runs sqlite VACUUM commands which rebuilds the db, repacking it into a
## minimal amount of disk space.
@ -233,39 +260,15 @@ proc contentCount*(db: ContentDB): int64 =
count = res).expectDb()
return count
## Public ContentId based ContentDB calls
# TODO: Could also decide to use the ContentKey SSZ bytestring, as this is what
# gets send over the network in requests, but that would be a bigger key. Or the
# same hashing could be done on it here.
# However ContentId itself is already derived through different digests
# depending on the content type, and this ContentId typically needs to be
# checked with the Radius/distance of the node anyhow. So lets see how we end up
# using this mostly in the code.
proc get*(db: ContentDB, key: ContentId): Opt[seq[byte]] =
# TODO: Here it is unfortunate that ContentId is a uint256 instead of Digest256.
db.get(key.toBytesBE())
proc put*(db: ContentDB, key: ContentId, value: openArray[byte]) =
db.put(key.toBytesBE(), value)
proc contains*(db: ContentDB, key: ContentId): bool =
db.contains(key.toBytesBE())
proc del*(db: ContentDB, key: ContentId) =
db.del(key.toBytesBE())
proc getSszDecoded*(db: ContentDB, key: ContentId, T: type auto): Opt[T] =
db.getSszDecoded(key.toBytesBE(), T)
proc deleteContentFraction(
proc deleteContentFraction*(
db: ContentDB,
target: UInt256,
fraction: float64): (UInt256, int64, int64, int64) =
## Deletes at most `fraction` percent of content form database.
## First, content furthest from provided `target` is deleted.
## Content furthest from provided `target` is deleted first.
# TODO: The usage of `db.contentSize()` for the deletion calculation versus
# `db.usedSize()` for the pruning threshold sometimes leads to some unexpected
# results regarding how much content ends up deleted.
doAssert(
fraction > 0 and fraction < 1,
"Deleted fraction should be > 0 and < 1"
@ -273,22 +276,22 @@ proc deleteContentFraction(
let totalContentSize = db.contentSize()
let bytesToDelete = int64(fraction * float64(totalContentSize))
var numOfDeletedElements: int64 = 0
var deletedElements: int64 = 0
var ri: RowInfo
var bytesDeleted: int64 = 0
var deletedBytes: int64 = 0
let targetBytes = target.toBytesBE()
for e in db.getAllOrderedByDistanceStmt.exec(targetBytes, ri):
if bytesDeleted + ri.payloadLength < bytesToDelete:
if deletedBytes + ri.payloadLength <= bytesToDelete:
db.del(ri.contentId)
bytesDeleted = bytesDeleted + ri.payloadLength
inc numOfDeletedElements
deletedBytes = deletedBytes + ri.payloadLength
inc deletedElements
else:
return (
UInt256.fromBytesBE(ri.distance),
bytesDeleted,
deletedBytes,
totalContentSize,
numOfDeletedElements
deletedElements
)
proc put*(
@ -296,7 +299,6 @@ proc put*(
key: ContentId,
value: openArray[byte],
target: UInt256): PutResult =
db.put(key, value)
# The used size is used as pruning threshold. This means that the database
@ -315,55 +317,46 @@ proc put*(
if dbSize < int64(db.storageCapacity):
return PutResult(kind: ContentStored)
else:
# TODO Add some configuration for this magic number
let (
furthestNonDeletedElement,
distanceOfFurthestElement,
deletedBytes,
totalContentSize,
deletedElements
) =
db.deleteContentFraction(target, 0.25)
db.deleteContentFraction(target, contentDeletionFraction)
let deletedFraction = float64(deletedBytes) / float64(totalContentSize)
info "Deleted content fraction", deletedBytes, deletedElements, deletedFraction
return PutResult(
kind: DbPruned,
furthestStoredElementDistance: furthestNonDeletedElement,
fractionOfDeletedContent: deletedFraction,
numOfDeletedElements: deletedElements)
distanceOfFurthestElement: distanceOfFurthestElement,
deletedFraction: deletedFraction,
deletedElements: deletedElements)
proc adjustRadius(
p: PortalProtocol,
fractionOfDeletedContent: float64,
furthestElementInDbDistance: UInt256) =
if fractionOfDeletedContent == 0.0:
# even though pruning was triggered no content was deleted, it could happen
# in pathological case of really small database with really big values.
# log it as error as it should not happenn
error "Database pruning attempt resulted in no content deleted"
return
# we need to invert fraction as our Uin256 implementation does not support
deletedFraction: float64,
distanceOfFurthestElement: UInt256) =
# Invert fraction as the UInt256 implementation does not support
# multiplication by float
let invertedFractionAsInt = int64(1.0 / fractionOfDeletedContent)
let invertedFractionAsInt = int64(1.0 / deletedFraction)
let scaledRadius = p.dataRadius div u256(invertedFractionAsInt)
# Chose larger value to avoid situation, where furthestElementInDbDistance
# is super close to local id, so local radius would end up too small
# to accept any more data to local database
# If scaledRadius radius will be larger it will still contain all elements
let newRadius = max(scaledRadius, furthestElementInDbDistance)
# Choose a larger value to avoid the situation where the
# `distanceOfFurthestElement` is very close to the local id so that the local
# radius would end up too small to accept any more data to the database.
# If `scaledRadius` is larger it will still contain all elements.
let newRadius = max(scaledRadius, distanceOfFurthestElement)
debug "Database pruned",
info "Database radius adjusted",
oldRadius = p.dataRadius,
newRadius = newRadius,
furthestDistanceInDb = furthestElementInDbDistance,
fractionOfDeletedContent = fractionOfDeletedContent
distanceOfFurthestElement
# both scaledRadius and furthestElementInDbDistance are smaller than current
# dataRadius, so the radius will constantly decrease through the node
# life time
# Both scaledRadius and distanceOfFurthestElement are smaller than the current
# dataRadius, so the radius will constantly decrease throughout the node's
# lifetime.
p.dataRadius = newRadius
proc createGetHandler*(db: ContentDB): DbGetHandler =
@ -394,14 +387,21 @@ proc createStoreHandler*(
if res.kind == DbPruned:
portal_pruning_counter.inc(labelValues = [$p.protocolId])
portal_pruning_deleted_elements.set(
res.numOfDeletedElements.int64,
res.deletedElements.int64,
labelValues = [$p.protocolId]
)
p.adjustRadius(
res.fractionOfDeletedContent,
res.furthestStoredElementDistance
)
if res.deletedFraction > 0.0:
p.adjustRadius(res.deletedFraction, res.distanceOfFurthestElement)
else:
# Note:
# This can occur when the furthest content is bigger than the fraction
# size. This is unlikely to happen as it would require either very
# small storage capacity or a very small `contentDeletionFraction`
# combined with some big content.
info "Database pruning attempt resulted in no content deleted"
return
of Static:
# If the config is set statically, radius is not adjusted, and is kept
# constant through the node's lifetime, also database max size is disabled

View File

@ -1115,7 +1115,7 @@ proc traceContentLookup*(p: PortalProtocol, target: ByteList, targetId: UInt256)
seen.incl(p.baseProtocol.localNode.id) # No need to discover our own node
for node in closestNodes:
seen.incl(node.id)
# Local node should be part of the responses
responses["0x" & $p.localNode.id] = TraceResponse(
durationMs: 0,
@ -1223,10 +1223,10 @@ proc traceContentLookup*(p: PortalProtocol, target: ByteList, targetId: UInt256)
of Content:
let duration = chronos.milliseconds(now(chronos.Moment) - ts)
# cancel any pending queries as the content has been found
for f in pendingQueries:
f.cancel()
f.cancelSoon()
portal_lookup_content_requests.observe(requestAmount)
let distance = p.routingTable.distance(content.src.id, targetId)

View File

@ -91,25 +91,29 @@ suite "Content Database":
usedSize2 == size6
test "ContentDB pruning":
# TODO: This test is extremely breakable when changing
# `contentDeletionFraction` and/or the used test values.
# Need to rework either this test, or the pruning mechanism, or probably
# both.
let
maxDbSize = uint32(100000)
maxDbSize = uint32(100_000)
db = ContentDB.new("", maxDbSize, inMemory = true)
furthestElement = u256(40)
secondFurthest = u256(30)
thirdFurthest = u256(20)
numBytes = 10000
numBytes = 10_000
pr1 = db.put(u256(1), genByteSeq(numBytes), u256(0))
pr2 = db.put(thirdFurthest, genByteSeq(numBytes), u256(0))
pr3 = db.put(u256(3), genByteSeq(numBytes), u256(0))
pr4 = db.put(u256(10), genByteSeq(numBytes), u256(0))
pr5 = db.put(u256(5), genByteSeq(numBytes), u256(0))
pr6 = db.put(u256(10), genByteSeq(numBytes), u256(0))
pr7 = db.put(furthestElement, genByteSeq(numBytes), u256(0))
pr8 = db.put(secondFurthest, genByteSeq(numBytes), u256(0))
pr6 = db.put(u256(11), genByteSeq(numBytes), u256(0))
pr7 = db.put(furthestElement, genByteSeq(2000), u256(0))
pr8 = db.put(secondFurthest, genByteSeq(2000), u256(0))
pr9 = db.put(u256(2), genByteSeq(numBytes), u256(0))
pr10 = db.put(u256(4), genByteSeq(numBytes), u256(0))
pr10 = db.put(u256(4), genByteSeq(12000), u256(0))
check:
pr1.kind == ContentStored
@ -124,11 +128,11 @@ suite "Content Database":
pr10.kind == DbPruned
check:
pr10.numOfDeletedElements == 2
pr10.deletedElements == 2
uint32(db.usedSize()) < maxDbSize
# With the current settings the 2 furthest elements will be deleted,
# i.e. keys 30 and 40. The furthest non-deleted one will have key 20.
pr10.furthestStoredElementDistance == thirdFurthest
pr10.distanceOfFurthestElement == thirdFurthest
db.get(furthestElement).isNone()
db.get(secondFurthest).isNone()
db.get(thirdFurthest).isSome()

View File

@ -1,5 +1,5 @@
# Nimbus - Portal Network
# Copyright (c) 2021-2022 Status Research & Development GmbH
# Copyright (c) 2021-2023 Status Research & Development GmbH
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at https://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at https://www.apache.org/licenses/LICENSE-2.0).
@ -325,11 +325,15 @@ procSuite "Portal Wire Protocol Tests":
await node2.stopPortalProtocol()
asyncTest "Adjusting radius after hitting full database":
# TODO: This test is extremely breakable when changing
# `contentDeletionFraction` and/or the used test values.
# Need to rework either this test, or the pruning mechanism, or probably
# both.
let
node1 = initDiscoveryNode(
rng, PrivateKey.random(rng[]), localAddress(20303))
dbLimit = 100_000'u32
dbLimit = 400_000'u32
db = ContentDB.new("", dbLimit, inMemory = true)
m = StreamManager.new(node1)
q = newAsyncQueue[(Opt[NodeId], ContentKeysList, seq[seq[byte]])](50)
@ -343,7 +347,7 @@ procSuite "Portal Wire Protocol Tests":
let item = genByteSeq(10_000)
var distances: seq[UInt256] = @[]
for i in 0..8:
for i in 0..<40:
proto1.storeContent(ByteList.init(@[uint8(i)]), u256(i), item)
distances.add(u256(i) xor proto1.localNode.id)
@ -356,11 +360,12 @@ procSuite "Portal Wire Protocol Tests":
check:
db.get((distances[0] xor proto1.localNode.id)).isNone()
db.get((distances[1] xor proto1.localNode.id)).isNone()
db.get((distances[2] xor proto1.localNode.id)).isSome()
db.get((distances[2] xor proto1.localNode.id)).isNone()
db.get((distances[3] xor proto1.localNode.id)).isSome()
# The radius has been updated and is lower than the maximum start value.
proto1.dataRadius < UInt256.high
# Yet higher than or equal to the furthest non deleted element.
proto1.dataRadius >= distances[2]
proto1.dataRadius >= distances[3]
proto1.stop()
await node1.closeWait()