From 6338969dd5be5ec14780ff33d3aecf09079ec0b2 Mon Sep 17 00:00:00 2001
From: Kim De Mey
Date: Thu, 16 Nov 2023 15:27:30 +0100
Subject: [PATCH] Lower the content deletion fraction and other contentdb
 clean-up (#1893)

- The current setting of 0.25 was very high. Set to 0.05, which is
  potentially still large. This change did expose some issues with the
  current implementation and especially testing.
- General clean-up, renaming for consistency, and
  re-ordering/restructuring/deletion of some code.
- Fixed several typos
- ...
---
 fluffy/content_db.nim                      | 164 ++++++++++-----------
 fluffy/network/wire/portal_protocol.nim    |   6 +-
 fluffy/tests/test_content_db.nim           |  20 ++-
 fluffy/tests/test_portal_wire_protocol.nim |  15 +-
 4 files changed, 107 insertions(+), 98 deletions(-)

diff --git a/fluffy/content_db.nim b/fluffy/content_db.nim
index 1c0441f21..26c947d85 100644
--- a/fluffy/content_db.nim
+++ b/fluffy/content_db.nim
@@ -33,24 +33,23 @@ export kvstore_sqlite3
 # thus depending on the network the right db needs to be selected.
 declareCounter portal_pruning_counter,
-  "Number of pruning event which happened during node lifetime",
+  "Number of pruning events which occurred during the node's uptime",
   labels = ["protocol_id"]
 
 declareGauge portal_pruning_deleted_elements,
-  "Number of elements delted in last pruning",
+  "Number of elements deleted in the last pruning",
   labels = ["protocol_id"]
 
+const
+  contentDeletionFraction = 0.05 ## 5% of the content will be deleted when the
+  ## storage capacity is hit and radius gets adjusted.
+
 type
   RowInfo = tuple
     contentId: array[32, byte]
     payloadLength: int64
     distance: array[32, byte]
 
-  ObjInfo* = object
-    contentId*: array[32, byte]
-    payloadLength*: int64
-    distFrom*: UInt256
-
   ContentDB* = ref object
     kv: KvStoreRef
     storageCapacity*: uint64
@@ -69,9 +68,9 @@ type
     of ContentStored:
       discard
     of DbPruned:
-      furthestStoredElementDistance*: UInt256
-      fractionOfDeletedContent*: float64
-      numOfDeletedElements*: int64
+      distanceOfFurthestElement*: UInt256
+      deletedFraction*: float64
+      deletedElements*: int64
 
 func xorDistance(
   a: openArray[byte],
@@ -181,10 +180,38 @@ proc del(db: ContentDB, key: openArray[byte]) =
   # TODO: Do we want to return the bool here too?
   discard db.kv.del(key).expectDb()
 
-proc getSszDecoded*(
+proc getSszDecoded(
     db: ContentDB, key: openArray[byte], T: type auto): Opt[T] =
   db.kv.getSszDecoded(key, T)
 
+## Public ContentId based ContentDB calls
+
+# TODO: Could also decide to use the ContentKey SSZ bytestring, as this is what
+# gets sent over the network in requests, but that would be a bigger key. Or the
+# same hashing could be done on it here.
+# However ContentId itself is already derived through different digests
+# depending on the content type, and this ContentId typically needs to be
+# checked with the Radius/distance of the node anyhow. So let's see how we end up
+# using this mostly in the code.
+
+proc get*(db: ContentDB, key: ContentId): Opt[seq[byte]] =
+  # TODO: Here it is unfortunate that ContentId is a uint256 instead of Digest256.
+ db.get(key.toBytesBE()) + +proc put*(db: ContentDB, key: ContentId, value: openArray[byte]) = + db.put(key.toBytesBE(), value) + +proc contains*(db: ContentDB, key: ContentId): bool = + db.contains(key.toBytesBE()) + +proc del*(db: ContentDB, key: ContentId) = + db.del(key.toBytesBE()) + +proc getSszDecoded*(db: ContentDB, key: ContentId, T: type auto): Opt[T] = + db.getSszDecoded(key.toBytesBE(), T) + +## Public database size, content and pruning related calls + proc reclaimSpace*(db: ContentDB): void = ## Runs sqlite VACUUM commands which rebuilds the db, repacking it into a ## minimal amount of disk space. @@ -233,39 +260,15 @@ proc contentCount*(db: ContentDB): int64 = count = res).expectDb() return count -## Public ContentId based ContentDB calls - -# TODO: Could also decide to use the ContentKey SSZ bytestring, as this is what -# gets send over the network in requests, but that would be a bigger key. Or the -# same hashing could be done on it here. -# However ContentId itself is already derived through different digests -# depending on the content type, and this ContentId typically needs to be -# checked with the Radius/distance of the node anyhow. So lets see how we end up -# using this mostly in the code. - -proc get*(db: ContentDB, key: ContentId): Opt[seq[byte]] = - # TODO: Here it is unfortunate that ContentId is a uint256 instead of Digest256. - db.get(key.toBytesBE()) - -proc put*(db: ContentDB, key: ContentId, value: openArray[byte]) = - db.put(key.toBytesBE(), value) - -proc contains*(db: ContentDB, key: ContentId): bool = - db.contains(key.toBytesBE()) - -proc del*(db: ContentDB, key: ContentId) = - db.del(key.toBytesBE()) - -proc getSszDecoded*(db: ContentDB, key: ContentId, T: type auto): Opt[T] = - db.getSszDecoded(key.toBytesBE(), T) - -proc deleteContentFraction( +proc deleteContentFraction*( db: ContentDB, target: UInt256, fraction: float64): (UInt256, int64, int64, int64) = ## Deletes at most `fraction` percent of content form database. - ## First, content furthest from provided `target` is deleted. - + ## Content furthest from provided `target` is deleted first. + # TODO: The usage of `db.contentSize()` for the deletion calculation versus + # `db.usedSize()` for the pruning threshold leads sometimes to some unexpected + # results of how much content gets up deleted. doAssert( fraction > 0 and fraction < 1, "Deleted fraction should be > 0 and < 1" @@ -273,22 +276,22 @@ proc deleteContentFraction( let totalContentSize = db.contentSize() let bytesToDelete = int64(fraction * float64(totalContentSize)) - var numOfDeletedElements: int64 = 0 + var deletedElements: int64 = 0 var ri: RowInfo - var bytesDeleted: int64 = 0 + var deletedBytes: int64 = 0 let targetBytes = target.toBytesBE() for e in db.getAllOrderedByDistanceStmt.exec(targetBytes, ri): - if bytesDeleted + ri.payloadLength < bytesToDelete: + if deletedBytes + ri.payloadLength <= bytesToDelete: db.del(ri.contentId) - bytesDeleted = bytesDeleted + ri.payloadLength - inc numOfDeletedElements + deletedBytes = deletedBytes + ri.payloadLength + inc deletedElements else: return ( UInt256.fromBytesBE(ri.distance), - bytesDeleted, + deletedBytes, totalContentSize, - numOfDeletedElements + deletedElements ) proc put*( @@ -296,7 +299,6 @@ proc put*( key: ContentId, value: openArray[byte], target: UInt256): PutResult = - db.put(key, value) # The used size is used as pruning threshold. 
This means that the database
@@ -315,55 +317,46 @@ proc put*(
   if dbSize < int64(db.storageCapacity):
     return PutResult(kind: ContentStored)
   else:
-    # TODO Add some configuration for this magic number
     let (
-      furthestNonDeletedElement,
+      distanceOfFurthestElement,
       deletedBytes,
       totalContentSize,
       deletedElements
     ) =
-      db.deleteContentFraction(target, 0.25)
+      db.deleteContentFraction(target, contentDeletionFraction)
 
     let deletedFraction = float64(deletedBytes) / float64(totalContentSize)
+    info "Deleted content fraction", deletedBytes, deletedElements, deletedFraction
 
     return PutResult(
       kind: DbPruned,
-      furthestStoredElementDistance: furthestNonDeletedElement,
-      fractionOfDeletedContent: deletedFraction,
-      numOfDeletedElements: deletedElements)
+      distanceOfFurthestElement: distanceOfFurthestElement,
+      deletedFraction: deletedFraction,
+      deletedElements: deletedElements)
 
 proc adjustRadius(
     p: PortalProtocol,
-    fractionOfDeletedContent: float64,
-    furthestElementInDbDistance: UInt256) =
-  if fractionOfDeletedContent == 0.0:
-    # even though pruning was triggered no content was deleted, it could happen
-    # in pathological case of really small database with really big values.
-    # log it as error as it should not happenn
-    error "Database pruning attempt resulted in no content deleted"
-    return
-
-  # we need to invert fraction as our Uin256 implementation does not support
+    deletedFraction: float64,
+    distanceOfFurthestElement: UInt256) =
+  # Invert fraction as the UInt256 implementation does not support
   # multiplication by float
-  let invertedFractionAsInt = int64(1.0 / fractionOfDeletedContent)
-
+  let invertedFractionAsInt = int64(1.0 / deletedFraction)
   let scaledRadius = p.dataRadius div u256(invertedFractionAsInt)
-  # Chose larger value to avoid situation, where furthestElementInDbDistance
-  # is super close to local id, so local radius would end up too small
-  # to accept any more data to local database
-  # If scaledRadius radius will be larger it will still contain all elements
-  let newRadius = max(scaledRadius, furthestElementInDbDistance)
+  # Choose a larger value to avoid the situation where the
+  # `distanceOfFurthestElement` is very close to the local id, so that the
+  # local radius would end up too small to accept any more data to the database.
+  # If scaledRadius is larger, it will still contain all elements.
+  let newRadius = max(scaledRadius, distanceOfFurthestElement)
 
-  debug "Database pruned",
+  info "Database radius adjusted",
     oldRadius = p.dataRadius, newRadius = newRadius,
-    furthestDistanceInDb = furthestElementInDbDistance,
-    fractionOfDeletedContent = fractionOfDeletedContent
+    distanceOfFurthestElement
 
-  # both scaledRadius and furthestElementInDbDistance are smaller than current
-  # dataRadius, so the radius will constantly decrease through the node
-  # life time
+  # Both scaledRadius and distanceOfFurthestElement are smaller than current
+  # dataRadius, so the radius will constantly decrease throughout the node's
+  # lifetime.
p.dataRadius = newRadius proc createGetHandler*(db: ContentDB): DbGetHandler = @@ -394,14 +387,21 @@ proc createStoreHandler*( if res.kind == DbPruned: portal_pruning_counter.inc(labelValues = [$p.protocolId]) portal_pruning_deleted_elements.set( - res.numOfDeletedElements.int64, + res.deletedElements.int64, labelValues = [$p.protocolId] ) - p.adjustRadius( - res.fractionOfDeletedContent, - res.furthestStoredElementDistance - ) + if res.deletedFraction > 0.0: + p.adjustRadius(res.deletedFraction, res.distanceOfFurthestElement) + else: + # Note: + # This can occur when the furthest content is bigger than the fraction + # size. This is unlikely to happen as it would require either very + # small storage capacity or a very small `contentDeletionFraction` + # combined with some big content. + info "Database pruning attempt resulted in no content deleted" + return + of Static: # If the config is set statically, radius is not adjusted, and is kept # constant thorugh node life time, also database max size is disabled diff --git a/fluffy/network/wire/portal_protocol.nim b/fluffy/network/wire/portal_protocol.nim index 310208530..954b11090 100644 --- a/fluffy/network/wire/portal_protocol.nim +++ b/fluffy/network/wire/portal_protocol.nim @@ -1115,7 +1115,7 @@ proc traceContentLookup*(p: PortalProtocol, target: ByteList, targetId: UInt256) seen.incl(p.baseProtocol.localNode.id) # No need to discover our own node for node in closestNodes: seen.incl(node.id) - + # Local node should be part of the responses responses["0x" & $p.localNode.id] = TraceResponse( durationMs: 0, @@ -1223,10 +1223,10 @@ proc traceContentLookup*(p: PortalProtocol, target: ByteList, targetId: UInt256) of Content: let duration = chronos.milliseconds(now(chronos.Moment) - ts) - + # cancel any pending queries as the content has been found for f in pendingQueries: - f.cancel() + f.cancelSoon() portal_lookup_content_requests.observe(requestAmount) let distance = p.routingTable.distance(content.src.id, targetId) diff --git a/fluffy/tests/test_content_db.nim b/fluffy/tests/test_content_db.nim index f26afe8e2..025b3b24a 100644 --- a/fluffy/tests/test_content_db.nim +++ b/fluffy/tests/test_content_db.nim @@ -91,25 +91,29 @@ suite "Content Database": usedSize2 == size6 test "ContentDB pruning": + # TODO: This test is extremely breakable when changing + # `contentDeletionFraction` and/or the used test values. + # Need to rework either this test, or the pruning mechanism, or probably + # both. 
let - maxDbSize = uint32(100000) + maxDbSize = uint32(100_000) db = ContentDB.new("", maxDbSize, inMemory = true) furthestElement = u256(40) secondFurthest = u256(30) thirdFurthest = u256(20) - numBytes = 10000 + numBytes = 10_000 pr1 = db.put(u256(1), genByteSeq(numBytes), u256(0)) pr2 = db.put(thirdFurthest, genByteSeq(numBytes), u256(0)) pr3 = db.put(u256(3), genByteSeq(numBytes), u256(0)) pr4 = db.put(u256(10), genByteSeq(numBytes), u256(0)) pr5 = db.put(u256(5), genByteSeq(numBytes), u256(0)) - pr6 = db.put(u256(10), genByteSeq(numBytes), u256(0)) - pr7 = db.put(furthestElement, genByteSeq(numBytes), u256(0)) - pr8 = db.put(secondFurthest, genByteSeq(numBytes), u256(0)) + pr6 = db.put(u256(11), genByteSeq(numBytes), u256(0)) + pr7 = db.put(furthestElement, genByteSeq(2000), u256(0)) + pr8 = db.put(secondFurthest, genByteSeq(2000), u256(0)) pr9 = db.put(u256(2), genByteSeq(numBytes), u256(0)) - pr10 = db.put(u256(4), genByteSeq(numBytes), u256(0)) + pr10 = db.put(u256(4), genByteSeq(12000), u256(0)) check: pr1.kind == ContentStored @@ -124,11 +128,11 @@ suite "Content Database": pr10.kind == DbPruned check: - pr10.numOfDeletedElements == 2 + pr10.deletedElements == 2 uint32(db.usedSize()) < maxDbSize # With the current settings the 2 furthest elements will be deleted, # i.e key 30 and 40. The furthest non deleted one will have key 20. - pr10.furthestStoredElementDistance == thirdFurthest + pr10.distanceOfFurthestElement == thirdFurthest db.get(furthestElement).isNone() db.get(secondFurthest).isNone() db.get(thirdFurthest).isSome() diff --git a/fluffy/tests/test_portal_wire_protocol.nim b/fluffy/tests/test_portal_wire_protocol.nim index 8a7dd7f82..b09f9c1ea 100644 --- a/fluffy/tests/test_portal_wire_protocol.nim +++ b/fluffy/tests/test_portal_wire_protocol.nim @@ -1,5 +1,5 @@ # Nimbus - Portal Network -# Copyright (c) 2021-2022 Status Research & Development GmbH +# Copyright (c) 2021-2023 Status Research & Development GmbH # Licensed and distributed under either of # * MIT license (license terms in the root directory or at https://opensource.org/licenses/MIT). # * Apache v2 license (license terms in the root directory or at https://www.apache.org/licenses/LICENSE-2.0). @@ -325,11 +325,15 @@ procSuite "Portal Wire Protocol Tests": await node2.stopPortalProtocol() asyncTest "Adjusting radius after hitting full database": + # TODO: This test is extremely breakable when changing + # `contentDeletionFraction` and/or the used test values. + # Need to rework either this test, or the pruning mechanism, or probably + # both. let node1 = initDiscoveryNode( rng, PrivateKey.random(rng[]), localAddress(20303)) - dbLimit = 100_000'u32 + dbLimit = 400_000'u32 db = ContentDB.new("", dbLimit, inMemory = true) m = StreamManager.new(node1) q = newAsyncQueue[(Opt[NodeId], ContentKeysList, seq[seq[byte]])](50) @@ -343,7 +347,7 @@ procSuite "Portal Wire Protocol Tests": let item = genByteSeq(10_000) var distances: seq[UInt256] = @[] - for i in 0..8: + for i in 0..<40: proto1.storeContent(ByteList.init(@[uint8(i)]), u256(i), item) distances.add(u256(i) xor proto1.localNode.id) @@ -356,11 +360,12 @@ procSuite "Portal Wire Protocol Tests": check: db.get((distances[0] xor proto1.localNode.id)).isNone() db.get((distances[1] xor proto1.localNode.id)).isNone() - db.get((distances[2] xor proto1.localNode.id)).isSome() + db.get((distances[2] xor proto1.localNode.id)).isNone() + db.get((distances[3] xor proto1.localNode.id)).isSome() # The radius has been updated and is lower than the maximum start value. 
proto1.dataRadius < UInt256.high # Yet higher than or equal to the furthest non deleted element. - proto1.dataRadius >= distances[2] + proto1.dataRadius >= distances[3] proto1.stop() await node1.closeWait()
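Note on the pruning walk: the behaviour of `deleteContentFraction` after this patch can be approximated with a small standalone Nim sketch. The names `Row` and `pruneByFraction` are illustrative only; the actual code streams `RowInfo` rows from SQLite ordered by XOR distance and deletes them through `db.del()`.

```nim
import std/algorithm

type Row = tuple[contentId: uint64, payloadLength: int64]

proc pruneByFraction(
    rows: seq[Row], target: uint64, fraction: float64
): tuple[deletedBytes, deletedElements: int64] =
  ## Delete rows furthest from `target` (by XOR distance) until roughly
  ## `fraction` of the total payload bytes is removed.
  doAssert fraction > 0 and fraction < 1

  var totalContentSize: int64 = 0
  for r in rows:
    totalContentSize += r.payloadLength
  let bytesToDelete = int64(fraction * float64(totalContentSize))

  # Furthest from the target first, mirroring the ORDER BY distance DESC query.
  let ordered = rows.sorted(
    proc(a, b: Row): int = cmp(b.contentId xor target, a.contentId xor target))

  for r in ordered:
    # Same stop condition as the patched loop: delete only while the running
    # total stays within the byte budget.
    if result.deletedBytes + r.payloadLength <= bytesToDelete:
      result.deletedBytes += r.payloadLength
      inc result.deletedElements
    else:
      break

# A row bigger than the whole budget is never deleted: with three 100-byte
# rows, a 5% budget is only 15 bytes, so nothing gets removed.
let rows = @[(contentId: 1'u64, payloadLength: 100'i64),
             (contentId: 2'u64, payloadLength: 100'i64),
             (contentId: 3'u64, payloadLength: 100'i64)]
doAssert pruneByFraction(rows, 0'u64, 0.05).deletedElements == 0
```

The last case is also why `createStoreHandler` now only calls `adjustRadius` when `res.deletedFraction > 0.0`: if every row, taken furthest-first, is larger than the byte budget, the walk deletes nothing at all.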
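The radius adjustment itself reduces to an integer division plus a lower bound. Below is a minimal sketch of that arithmetic with `uint64` standing in for stint's `UInt256`; `scaleRadius` is a local illustration, not the `adjustRadius` proc from the patch.

```nim
proc scaleRadius(
    dataRadius: uint64, deletedFraction: float64,
    distanceOfFurthestElement: uint64
): uint64 =
  doAssert deletedFraction > 0.0
  # UInt256 has no float multiplication, so the fraction is inverted and used
  # as an integer divisor: radius * fraction becomes radius div (1/fraction).
  let invertedFractionAsInt = uint64(1.0 / deletedFraction)
  let scaledRadius = dataRadius div invertedFractionAsInt
  # Never drop below the furthest element still stored, otherwise content that
  # survived the pruning would fall outside the advertised radius.
  result = max(scaledRadius, distanceOfFurthestElement)

# Example values only: half of the radius would be 500_000, but the furthest
# remaining element sits at 600_000, so that becomes the new radius.
doAssert scaleRadius(1_000_000'u64, 0.5, 600_000'u64) == 600_000'u64
```

Because both the scaled radius and the furthest remaining distance are smaller than the current radius, repeated pruning only ever shrinks the radius, as the comment in `adjustRadius` notes.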
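Rough arithmetic behind the expectations in the reworked "ContentDB pruning" test, assuming `contentSize()` is close to the sum of the stored payload lengths (key bytes and SQLite page overhead are ignored here, which is exactly the `contentSize()` versus `usedSize()` mismatch the new TODO comments point at):

```nim
import std/math

# Payload sizes of pr1 .. pr10 in the test (bytes).
let payloadSizes = [10_000, 10_000, 10_000, 10_000, 10_000,
                    10_000, 2_000, 2_000, 10_000, 12_000]
let totalContentSize = sum(payloadSizes)                 # 86_000
let bytesToDelete = int(0.05 * float(totalContentSize))  # 4_300

# Ordered by distance from the target u256(0), the two furthest rows are the
# 2_000-byte ones at keys 40 and 30. Together they fit the budget, while the
# next row (key 20, 10_000 bytes) would exceed it, so the walk stops there.
doAssert 2_000 + 2_000 <= bytesToDelete
doAssert 2_000 + 2_000 + 10_000 > bytesToDelete
```

Under these assumptions only the two 2_000-byte rows at the largest distances are removed, matching `deletedElements == 2` and `distanceOfFurthestElement == thirdFurthest`; changing `contentDeletionFraction` or any of the payload sizes shifts these inequalities, which is what makes the test brittle.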