mirror of
https://github.com/logos-messaging/nim-sds.git
synced 2026-01-02 14:13:07 +00:00
feat: add bloom filter (#3)
This commit is contained in:
parent
a83dcc0331
commit
5df71ad3ea
6
.gitignore
vendored
Normal file
6
.gitignore
vendored
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
nimcache
|
||||||
|
nimcache/*
|
||||||
|
tests/bloom
|
||||||
|
nim-bloom/bloom
|
||||||
|
.DS_Store
|
||||||
|
src/.DS_Store
|
||||||
123
src/bloom.nim
Normal file
123
src/bloom.nim
Normal file
@ -0,0 +1,123 @@
|
|||||||
|
from math import ceil, ln, pow, round
|
||||||
|
import hashes
|
||||||
|
import strutils
|
||||||
|
import results
|
||||||
|
import private/probabilities
|
||||||
|
|
||||||
|
type
  BloomFilter* = object
    ## Bloom filter over strings. The bits live in `intArray`, packed
    ## `sizeof(int) * 8` to an entry.
    capacity*: int     ## Expected number of elements to be inserted.
    errorRate*: float  ## Desired false positive rate given at construction.
    kHashes*: int      ## Number of hash positions computed per item.
    mBits*: int        ## Number of addressable bits; hashes are taken mod this.
    intArray: seq[int] ## Packed bit storage (not exported).
|
||||||
|
|
||||||
|
{.push overflowChecks: off.} # hashing arithmetic below may wrap around

proc hashN(item: string, n: int, maxValue: int): int =
  ## Returns the `n`-th hash of `item`, reduced modulo `maxValue`.
  ##
  ## Two base hashes are taken with Nim's built-in `hash` and combined as
  ## `hashA + n * hashB`, the double hashing technique from Kirsch and
  ## Mitzenmacher, 2008:
  ## http://www.eecs.harvard.edu/~kirsch/pubs/bbbf/rsa.pdf
  let
    # abs is applied so that negative hash values are handled
    primary = abs(hash(item)) mod maxValue
    # the second base hash comes from the item with a fixed suffix appended
    secondary = abs(hash(item & " b")) mod maxValue
  result = abs(primary + n * secondary) mod maxValue

  # Alternative kept for reference: use bit rotation for the second hash
  # instead of string concatenation if speed is preferred over FP-rate.
  # Rotate left by 21 bits (the lower the rotation, the higher the speed,
  # but the higher the FP-rate too):
  #
  #   hashB = abs(
  #     ((h shl 21) or (h shr (sizeof(int) * 8 - 21)))
  #   ) mod maxValue
  #   abs((hashA + n.int64 * hashB)) mod maxValue

{.pop.}
|
||||||
|
|
||||||
|
proc getMOverNBitsForK*(k: int, targetError: float,
    probabilityTable = kErrors): Result[int, string] =
  ## Finds the smallest m/n (bits per element) whose tabulated error rate
  ## for `k` hash functions is strictly below `targetError`.
  ##
  ## Returns an error when `k` is outside `0..12`, or when no entry of
  ## `probabilityTable[k]` from index 2 upwards is below `targetError`.
  if k < 0 or k > 12:
    return err("K must be <= 12 if forceNBitsPerElem is not also specified.")

  for mOverN, tabulatedError in probabilityTable[k]:
    # Indices 0 and 1 are never candidates; the scan starts at m/n = 2.
    if mOverN >= 2 and tabulatedError < targetError:
      return ok(mOverN)

  err("Specified value of k and error rate not achievable using less than 4 bytes / element.")
|
||||||
|
|
||||||
|
proc initializeBloomFilter*(capacity: int, errorRate: float, k = 0,
    forceNBitsPerElem = 0): Result[BloomFilter, string] =
  ## Initializes a Bloom filter with specified parameters.
  ##
  ## Parameters:
  ## - capacity: Expected number of elements to be inserted. Must be >= 1.
  ## - errorRate: Desired false positive rate (e.g., 0.01 for 1%). Must lie
  ##   strictly between 0 and 1 when `k` is left to be calculated.
  ## - k: Optional number of hash functions. If below 1, calculated optimally.
  ##   See http://pages.cs.wisc.edu/~cao/papers/summary-cache/node8.html for
  ##   useful tables on k and m/n (n bits per element) combinations.
  ## - forceNBitsPerElem: Optional override for bits per element. Only
  ##   consulted when `k` is specified (>= 1); otherwise the bits per element
  ##   are derived from `errorRate`.
  ##
  ## Returns the configured filter, or an error string when the arguments
  ## are invalid or the lookup of m/n for the requested `k` fails.
  if capacity < 1:
    # A non-positive capacity would give mBits <= 0, and hashing reduces
    # modulo mBits.
    return err("capacity must be at least 1.")

  var
    kHashes: int
    nBitsPerElem: int

  if k < 1: # Calculate optimal k and use that
    # Written as a negated conjunction so that NaN is rejected as well.
    if not (errorRate > 0.0 and errorRate < 1.0):
      return err("errorRate must be strictly between 0 and 1 when k is not specified.")
    let bitsPerElem = ceil(-1.0 * (ln(errorRate) / (pow(ln(2.float), 2))))
    kHashes = round(ln(2.float) * bitsPerElem).int
    nBitsPerElem = round(bitsPerElem).int
  else: # Use specified k if possible
    if forceNBitsPerElem < 1: # Use lookup table
      let mOverNRes = getMOverNBitsForK(k = k, targetError = errorRate)
      if mOverNRes.isErr:
        return err(mOverNRes.error)
      nBitsPerElem = mOverNRes.value
    else:
      nBitsPerElem = forceNBitsPerElem
    kHashes = k

  let
    mBits = capacity * nBitsPerElem
    # One extra entry so that every bit index below mBits has a slot.
    mInts = 1 + mBits div (sizeof(int) * 8)

  ok(BloomFilter(
    capacity: capacity,
    errorRate: errorRate,
    kHashes: kHashes,
    mBits: mBits,
    intArray: newSeq[int](mInts)
  ))
|
||||||
|
|
||||||
|
proc `$`*(bf: BloomFilter): string =
  ## Renders the configuration of the Bloom filter as one sentence:
  ## capacity, error rate (scientific notation, one decimal), number of
  ## hash functions, and `mBits div capacity` (i.e. bits per element).
  ##
  ## A filter whose `capacity` is 0 (for example a default-initialised
  ## `BloomFilter()`) is reported with 0 bits instead of dividing by zero.
  let bitsPerElem =
    if bf.capacity == 0: 0
    else: bf.mBits div bf.capacity
  "Bloom filter with $1 capacity, $2 error rate, $3 hash functions, and requiring $4 bits of memory." %
    [$bf.capacity,
     formatFloat(bf.errorRate, format = ffScientific, precision = 1),
     $bf.kHashes,
     $bitsPerElem]
|
||||||
|
|
||||||
|
proc computeHashes(bf: BloomFilter, item: string): seq[int] =
  ## Returns the `bf.kHashes` bit positions for `item`, each one produced by
  ## `hashN` with the hash number as `n` and `bf.mBits` as the modulus.
  result = newSeq[int](bf.kHashes)
  for hashNo, position in result.mpairs:
    position = hashN(item, hashNo, bf.mBits)
|
||||||
|
|
||||||
|
proc insert*(bf: var BloomFilter, item: string) =
  ## Insert an item (string) into the Bloom filter by setting the bit at
  ## every position returned by `computeHashes`.
  const bitsPerWord = sizeof(int) * 8
  for bitIndex in bf.computeHashes(item):
    let
      word = bitIndex div bitsPerWord          # which entry of intArray
      mask = 1 shl (bitIndex mod bitsPerWord)  # which bit inside that entry
    bf.intArray[word] = bf.intArray[word] or mask
|
||||||
|
|
||||||
|
proc lookup*(bf: BloomFilter, item: string): bool =
  ## Lookup an item (string) in the Bloom filter.
  ## If the item is present, ``lookup`` is guaranteed to return ``true``.
  ## If the item is not present, ``lookup`` will return ``false``
  ## with a probability 1 - ``bf.errorRate``.
  const bitsPerWord = sizeof(int) * 8
  for bitIndex in bf.computeHashes(item):
    let mask = 1 shl (bitIndex mod bitsPerWord)
    # A single unset bit proves the item was never inserted.
    if (bf.intArray[bitIndex div bitsPerWord] and mask) == 0:
      return false
  result = true
|
||||||
98
src/private/probabilities.nim
Normal file
98
src/private/probabilities.nim
Normal file
@ -0,0 +1,98 @@
|
|||||||
|
#
|
||||||
|
# ### Probability table declaration, in private/ for readability ###
|
||||||
|
# Table for k hashes from 1..12 from http://pages.cs.wisc.edu/~cao/papers/summary-cache/node8.html
|
||||||
|
# Iterate along the sequence at position [k] until the error rate is < specified; if no
# entry qualifies, the lookup fails with an error.
|
||||||
|
#
|
||||||
|
|
||||||
|
type
  TErrorForK = seq[float] ## Error rates for a single k, indexed by m/n.
  TAllErrorRates* = array[0..12, TErrorForK] ## One row per k in 0..12.
|
||||||
|
|
||||||
|
# Error-rate lookup table: the outer index is k (number of hash functions),
# the inner index is m/n (bits per element).
let kErrors*: TAllErrorRates = [
  # k = 0
  @[1.0],

  # k = 1
  @[1.0, 1.0, 0.3930000000, 0.2830000000, 0.2210000000, 0.1810000000,
    0.1540000000, 0.1330000000, 0.1180000000, 0.1050000000, 0.0952000000,
    0.0869000000, 0.0800000000, 0.0740000000, 0.0689000000, 0.0645000000,
    0.0606000000, 0.0571000000, 0.0540000000, 0.0513000000, 0.0488000000,
    0.0465000000, 0.0444000000, 0.0425000000, 0.0408000000, 0.0392000000,
    0.0377000000, 0.0364000000, 0.0351000000, 0.0339000000, 0.0328000000,
    0.0317000000, 0.0308000000],

  # k = 2
  @[1.0, 1.0, 0.4000000000, 0.2370000000, 0.1550000000, 0.1090000000,
    0.0804000000, 0.0618000000, 0.0489000000, 0.0397000000, 0.0329000000,
    0.0276000000, 0.0236000000, 0.0203000000, 0.0177000000, 0.0156000000,
    0.0138000000, 0.0123000000, 0.0111000000, 0.0099800000, 0.0090600000,
    0.0082500000, 0.0075500000, 0.0069400000, 0.0063900000, 0.0059100000,
    0.0054800000, 0.0051000000, 0.0047500000, 0.0044400000, 0.0041600000,
    0.0039000000, 0.0036700000],

  # k = 3
  @[1.0, 1.0, 1.0, 0.2530000000, 0.1470000000, 0.0920000000, 0.0609000000,
    0.0423000000, 0.0306000000, 0.0228000000, 0.0174000000, 0.0136000000,
    0.0108000000, 0.0087500000, 0.0071800000, 0.0059600000, 0.0050000000,
    0.0042300000, 0.0036200000, 0.0031200000, 0.0027000000, 0.0023600000,
    0.0020700000, 0.0018300000, 0.0016200000, 0.0014500000, 0.0012900000,
    0.0011600000, 0.0010500000, 0.0009490000, 0.0008620000, 0.0007850000,
    0.0007170000],

  # k = 4
  @[1.0, 1.0, 1.0, 1.0, 0.1600000000, 0.0920000000, 0.0561000000, 0.0359000000,
    0.0240000000, 0.0166000000, 0.0118000000, 0.0086400000, 0.0064600000,
    0.0049200000, 0.0038100000, 0.0030000000, 0.0023900000, 0.0019300000,
    0.0015800000, 0.0013000000, 0.0010800000, 0.0009050000, 0.0007640000,
    0.0006490000, 0.0005550000, 0.0004780000, 0.0004130000, 0.0003590000,
    0.0003140000, 0.0002760000, 0.0002430000, 0.0002150000, 0.0001910000],

  # k = 5
  @[1.0, 1.0, 1.0, 1.0, 1.0, 0.1010000000, 0.0578000000, 0.0347000000,
    0.0217000000, 0.0141000000, 0.0094300000, 0.0065000000, 0.0045900000,
    0.0033200000, 0.0024400000, 0.0018300000, 0.0013900000, 0.0010700000,
    0.0008390000, 0.0006630000, 0.0005300000, 0.0004270000, 0.0003470000,
    0.0002850000, 0.0002350000, 0.0001960000, 0.0001640000, 0.0001380000,
    0.0001170000, 0.0000996000, 0.0000853000, 0.0000733000, 0.0000633000],

  # k = 6
  @[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0638000000, 0.0364000000, 0.0216000000,
    0.0133000000, 0.0084400000, 0.0055200000, 0.0037100000, 0.0025500000,
    0.0017900000, 0.0012800000, 0.0009350000, 0.0006920000, 0.0005190000,
    0.0003940000, 0.0003030000, 0.0002360000, 0.0001850000, 0.0001470000,
    0.0001170000, 0.0000944000, 0.0000766000, 0.0000626000, 0.0000515000,
    0.0000426000, 0.0000355000, 0.0000297000, 0.0000250000],

  # k = 7
  @[1.0, 1.0, 1.0,
    1.0, 1.0, 1.0, 1.0, 1.0, 0.0229000000, 0.0135000000, 0.0081900000,
    0.0051300000, 0.0032900000, 0.0021700000, 0.0014600000, 0.0010000000,
    0.0007020000, 0.0004990000, 0.0003600000, 0.0002640000, 0.0001960000,
    0.0001470000, 0.0001120000, 0.0000856000, 0.0000663000, 0.0000518000,
    0.0000408000, 0.0000324000, 0.0000259000, 0.0000209000, 0.0000169000,
    0.0000138000, 0.0000113000],

  # k = 8
  @[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
    1.0, 0.0145000000, 0.0084600000, 0.0050900000, 0.0031400000, 0.0019900000,
    0.0012900000, 0.0008520000, 0.0005740000, 0.0003940000, 0.0002750000,
    0.0001940000, 0.0001400000, 0.0001010000, 0.0000746000, 0.0000555000,
    0.0000417000, 0.0000316000, 0.0000242000, 0.0000187000, 0.0000146000,
    0.0000114000, 0.0000090100, 0.0000071600, 0.0000057300],

  # k = 9
  @[1.0, 1.0, 1.0,
    1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0053100000, 0.0031700000,
    0.0019400000, 0.0012100000, 0.0007750000, 0.0005050000, 0.0003350000,
    0.0002260000, 0.0001550000, 0.0001080000, 0.0000759000, 0.0000542000,
    0.0000392000, 0.0000286000, 0.0000211000, 0.0000157000, 0.0000118000,
    0.0000089600, 0.0000068500, 0.0000052800, 0.0000041000, 0.0000032000],

  # k = 10
  @[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0033400000,
    0.0019800000, 0.0012000000, 0.0007440000, 0.0004700000, 0.0003020000,
    0.0001980000, 0.0001320000, 0.0000889000, 0.0000609000, 0.0000423000,
    0.0000297000, 0.0000211000, 0.0000152000, 0.0000110000, 0.0000080700,
    0.0000059700, 0.0000044500, 0.0000033500, 0.0000025400, 0.0000019400],

  # k = 11
  @[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
    0.0021000000, 0.0012400000, 0.0007470000, 0.0004590000, 0.0002870000,
    0.0001830000, 0.0001180000, 0.0000777000, 0.0000518000, 0.0000350000,
    0.0000240000, 0.0000166000, 0.0000116000, 0.0000082300, 0.0000058900,
    0.0000042500, 0.0000031000, 0.0000022800, 0.0000016900, 0.0000012600],

  # k = 12
  @[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
    0.0007780000, 0.0004660000, 0.0002840000, 0.0001760000, 0.0001110000,
    0.0000712000, 0.0000463000, 0.0000305000, 0.0000204000, 0.0000138000,
    0.0000094200, 0.0000065200, 0.0000045600, 0.0000032200, 0.0000022900,
    0.0000016500, 0.0000012000, 0.0000008740]
]
|
||||||
142
tests/test_bloom.nim
Normal file
142
tests/test_bloom.nim
Normal file
@ -0,0 +1,142 @@
|
|||||||
|
import unittest, results, strutils
|
||||||
|
import ../src/bloom
|
||||||
|
from random import rand, randomize
|
||||||
|
|
||||||
|
suite "bloom filter":
  setup:
    # Runs before each test: builds a filter sized for `nElementsToTest`
    # items at a 0.001 target error rate and fills it with seeded
    # pseudo-random 8-character strings.
    let nElementsToTest = 10000
    let bfResult = initializeBloomFilter(capacity = nElementsToTest,
        errorRate = 0.001)
    check bfResult.isOk
    var bf = bfResult.get
    randomize(2882) # Seed the RNG
    let sampleChars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
    var testElements = newSeq[string](nElementsToTest)

    for element in testElements.mitems:
      for charPos in 0 ..< 8:
        element.add(sampleChars[rand(51)])
      bf.insert(element)

  test "initialization parameters":
    check:
      bf.capacity == nElementsToTest
      bf.errorRate == 0.001
      bf.kHashes == 10
      bf.mBits div bf.capacity == 15 # bits per element

  test "basic operations":
    # A string that was never inserted into the populated filter
    check not bf.lookup("nonexistent")

    let smallResult = initializeBloomFilter(100, 0.01)
    check smallResult.isOk
    var small = smallResult.get
    small.insert("test string")
    check:
      small.lookup("test string")
      not small.lookup("different string")

  test "error rate":
    let testSize = nElementsToTest div 2
    var falsePositives = 0
    for probeNo in 0 ..< testSize:
      var probe = ""
      for charPos in 0 ..< 9: # Different length than setup
        probe.add(sampleChars[rand(51)])
      if bf.lookup(probe):
        inc falsePositives

    let actualErrorRate = falsePositives.float / testSize.float
    check actualErrorRate < bf.errorRate * 1.5 # Allow some margin

  test "perfect recall":
    # Every inserted element must be reported as present.
    var lookupErrors = 0
    for element in testElements:
      if not bf.lookup(element):
        inc lookupErrors
    check lookupErrors == 0

  test "k/m bits specification":
    # Test error case for k > 12
    let tooManyHashes = getMOverNBitsForK(k = 13, targetError = 0.01)
    check:
      tooManyHashes.isErr
      tooManyHashes.error == "K must be <= 12 if forceNBitsPerElem is not also specified."

    # Test error case for unachievable error rate
    let unachievable = getMOverNBitsForK(k = 2, targetError = 0.00001)
    check:
      unachievable.isErr
      unachievable.error == "Specified value of k and error rate not achievable using less than 4 bytes / element."

    # Test success cases
    let twoHashes = getMOverNBitsForK(k = 2, targetError = 0.1)
    check:
      twoHashes.isOk
      twoHashes.value == 6

    let sevenHashesLoose = getMOverNBitsForK(k = 7, targetError = 0.01)
    check:
      sevenHashesLoose.isOk
      sevenHashesLoose.value == 10

    let sevenHashesTight = getMOverNBitsForK(k = 7, targetError = 0.001)
    check:
      sevenHashesTight.isOk
      sevenHashesTight.value == 16

    # Explicit k together with forced bits per element
    let forcedResult = initializeBloomFilter(10000, 0.001, k = 4,
        forceNBitsPerElem = 20)
    check forcedResult.isOk
    let forced = forcedResult.get
    check:
      forced.kHashes == 4
      forced.mBits == 200000

  test "string representation":
    let describedResult = initializeBloomFilter(1000, 0.01, k = 4)
    check describedResult.isOk
    let description = $describedResult.get
    check:
      description.contains("1000")    # Capacity
      description.contains("4 hash")  # Hash functions
      description.contains("1.0e-02") # Error rate in scientific notation
|
||||||
|
|
||||||
|
suite "bloom filter special cases":
  test "different patterns of strings":
    # Exercises the filter with unusual inputs (very long, punctuation,
    # non-ASCII, repetitive) and then with a bulk load of `testSize` items,
    # checking recall and the observed false positive rate.
    const testSize = 10_000
    let patterns = @[
      "shortstr",
      repeat("a", 1000),    # Very long string
      "special@#$%^&*()",   # Special characters
      "unicode→★∑≈",        # Unicode characters
      repeat("pattern", 10) # Repeating pattern
    ]

    let bfResult = initializeBloomFilter(testSize, 0.01)
    check bfResult.isOk
    var bf = bfResult.get
    var inserted = newSeq[string](testSize)

    # Test pattern handling
    for pattern in patterns:
      bf.insert(pattern)
      # Reported through unittest rather than `assert`, so a miss is
      # recorded as a test failure and is still checked when assertions
      # are compiled out.
      if not bf.lookup(pattern):
        checkpoint("failed lookup pattern: " & pattern)
        fail()

    # Test general insertion and lookup
    for i in 0..<testSize:
      inserted[i] = $i & "test" & $rand(1000)
      bf.insert(inserted[i])

    # Verify all insertions
    var lookupErrors = 0
    for item in inserted:
      if not bf.lookup(item):
        lookupErrors.inc()
    check lookupErrors == 0

    # Check false positive rate
    var falsePositives = 0
    let fpTestSize = testSize div 2
    for i in 0..<fpTestSize:
      let testItem = "notpresent" & $i & $rand(1000)
      if bf.lookup(testItem):
        falsePositives.inc()

    let fpRate = falsePositives.float / fpTestSize.float
    check fpRate < bf.errorRate * 1.5 # Allow some margin but should be close to target
|
||||||
Loading…
x
Reference in New Issue
Block a user