feat: add bloom filter (#3)

This commit is contained in:
Akhil 2025-01-13 13:49:28 +04:00 committed by GitHub
parent a83dcc0331
commit 5df71ad3ea
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 369 additions and 0 deletions

6
.gitignore vendored Normal file
View File

@ -0,0 +1,6 @@
nimcache
nimcache/*
tests/bloom
nim-bloom/bloom
.DS_Store
src/.DS_Store

123
src/bloom.nim Normal file
View File

@ -0,0 +1,123 @@
from math import ceil, ln, pow, round
import hashes
import strutils
import results
import private/probabilities
type
# Bloom filter state: the sizing parameters plus the backing bit storage.
BloomFilter* = object
capacity*: int ## Expected number of elements to be inserted.
errorRate*: float ## Desired false positive rate (e.g. 0.01 for 1%).
kHashes*: int ## Number of hash functions evaluated per item.
mBits*: int ## Total number of addressable bits (capacity * bits per element).
intArray: seq[int] ## Private bit storage; each `int` word holds `sizeof(int) * 8` bits.
{.push overflowChecks: off.} # Turn off overflow checks for hashing operations
proc toNonNegative(h: int): int {.inline.} =
  ## Maps any `int` to a non-negative value.
  ##
  ## With overflow checks disabled `abs(low(int))` wraps around and stays
  ## negative, which would later turn into a negative bit index. That single
  ## value is mapped to `high(int)`; every other input yields `abs(h)`.
  if h == low(int): high(int) else: abs(h)

proc hashN(item: string, n: int, maxValue: int): int =
  ## Returns the `n`-th hash of `item`, reduced into the range `0 ..< maxValue`.
  ##
  ## Built on Nim's `hash` function using the double hashing technique from
  ## Kirsch and Mitzenmacher, 2008:
  ## http://www.eecs.harvard.edu/~kirsch/pubs/bbbf/rsa.pdf
  ##
  ## `maxValue` must be positive; a value of 0 makes `mod` divide by zero.
  let
    hashA = toNonNegative(hash(item)) mod maxValue
    hashB = toNonNegative(hash(item & " b")) mod maxValue # string concatenation
  # The sum may wrap around because overflow checks are off, so it is folded
  # back to a non-negative value before the final reduction.
  toNonNegative(hashA + n * hashB) mod maxValue
  # # Use bit rotation for the second hash instead of string concatenation if
  # # speed is preferred over FP-rate (`h` presumably denotes `hash(item)`).
  # # Rotate left by 21 bits (lower the rotation, higher the speed but higher the FP-rate too)
  # hashB = abs(
  #   ((h shl 21) or (h shr (sizeof(int) * 8 - 21)))
  # ) mod maxValue
  # abs((hashA + n.int64 * hashB)) mod maxValue
{.pop.}
proc getMOverNBitsForK*(k: int, targetError: float,
    probabilityTable = kErrors): Result[int, string] =
  ## Finds the smallest m/n (bits per element) for `k` hash functions whose
  ## tabulated error rate in `probabilityTable` is below `targetError`.
  ##
  ## Returns an error when `k` lies outside `0..12`, or when no entry of
  ## row `k` (from m/n = 2 upwards) is below `targetError`.
  if k < 0 or k > 12:
    return err("K must be <= 12 if forceNBitsPerElem is not also specified.")
  for mOverN, tabulatedError in probabilityTable[k]:
    # Ratios 0 and 1 are never candidates; the search starts at m/n = 2.
    if mOverN >= 2 and tabulatedError < targetError:
      return ok(mOverN)
  err("Specified value of k and error rate not achievable using less than 4 bytes / element.")
proc initializeBloomFilter*(capacity: int, errorRate: float, k = 0,
    forceNBitsPerElem = 0): Result[BloomFilter, string] =
  ## Initializes a Bloom filter with specified parameters.
  ##
  ## Parameters:
  ## - capacity: Expected number of elements to be inserted. Must be >= 1.
  ## - errorRate: Desired false positive rate (e.g., 0.01 for 1%). Must lie
  ##   in (0, 1] when `k` is not specified, because the optimal sizing takes
  ##   its logarithm.
  ## - k: Optional number of hash functions. If 0, calculated optimally
  ##   See http://pages.cs.wisc.edu/~cao/papers/summary-cache/node8.html for
  ##   useful tables on k and m/n (n bits per element) combinations.
  ## - forceNBitsPerElem: Optional override for bits per element. Only read
  ##   when `k` >= 1; otherwise the bits per element are derived from
  ##   `errorRate`.
  ##
  ## Returns the configured filter, or an error string when the arguments are
  ## invalid or the requested `k` / `errorRate` pair is not in the lookup
  ## table.
  if capacity < 1:
    # A non-positive capacity yields no usable bits, and inserting would
    # divide by zero when reducing hashes modulo `mBits`.
    return err("Capacity must be at least 1.")
  var
    kHashes: int
    nBitsPerElem: int
  if k < 1: # Calculate optimal k and use that
    # Written as a positive test so that NaN is rejected as well.
    if not (errorRate > 0.0 and errorRate <= 1.0):
      return err("Error rate must be in the range (0, 1] when k is not specified.")
    let bitsPerElem = ceil(-1.0 * (ln(errorRate) / (pow(ln(2.float), 2))))
    kHashes = round(ln(2.float) * bitsPerElem).int
    nBitsPerElem = round(bitsPerElem).int
  else: # Use specified k if possible
    if forceNBitsPerElem < 1: # Use lookup table
      let mOverNRes = getMOverNBitsForK(k = k, targetError = errorRate)
      if mOverNRes.isErr:
        return err(mOverNRes.error)
      nBitsPerElem = mOverNRes.value
    else:
      nBitsPerElem = forceNBitsPerElem
    kHashes = k
  let
    mBits = capacity * nBitsPerElem
    # One word more than `mBits div wordSize`, so the highest bit index
    # (`mBits - 1`) always has a backing word.
    mInts = 1 + mBits div (sizeof(int) * 8)
  ok(BloomFilter(
    capacity: capacity,
    errorRate: errorRate,
    kHashes: kHashes,
    mBits: mBits,
    intArray: newSeq[int](mInts)
  ))
proc `$`*(bf: BloomFilter): string =
  ## Renders the configuration of the Bloom filter as a one-line summary:
  ## capacity, error rate (scientific notation, one decimal), number of hash
  ## functions, and the bits of memory used per element.
  let bitsPerElem =
    # Guard against a zero capacity (e.g. a default-constructed filter),
    # which would otherwise divide by zero.
    if bf.capacity != 0: bf.mBits div bf.capacity
    else: 0
  "Bloom filter with $1 capacity, $2 error rate, $3 hash functions, and requiring $4 bits of memory per element." %
    [$bf.capacity,
     formatFloat(bf.errorRate, format = ffScientific, precision = 1),
     $bf.kHashes,
     $bitsPerElem]
proc computeHashes(bf: BloomFilter, item: string): seq[int] =
  ## Produces the `bf.kHashes` bit positions for `item`: entry `i` is
  ## `hashN(item, i, bf.mBits)`.
  result = newSeq[int](bf.kHashes)
  for round, position in result.mpairs:
    position = hashN(item, round, bf.mBits)
proc insert*(bf: var BloomFilter, item: string) =
  ## Adds `item` (a string) to the filter by setting the bit at each of its
  ## hash positions.
  const bitsPerWord = sizeof(int) * 8
  for bitIndex in bf.computeHashes(item):
    let
      word = bitIndex div bitsPerWord
      mask = 1 shl (bitIndex mod bitsPerWord)
    bf.intArray[word] = bf.intArray[word] or mask
proc lookup*(bf: BloomFilter, item: string): bool =
  ## Tests whether `item` (a string) may be in the Bloom filter.
  ## An item that was inserted always yields ``true``.
  ## An item that was not inserted yields ``false``
  ## with a probability 1 - ``bf.errorRate``.
  const bitsPerWord = sizeof(int) * 8
  for bitIndex in bf.computeHashes(item):
    let mask = 1 shl (bitIndex mod bitsPerWord)
    # A single cleared bit proves the item was never inserted.
    if (bf.intArray[bitIndex div bitsPerWord] and mask) == 0:
      return false
  true

View File

@ -0,0 +1,98 @@
#
# ### Probability table declaration, in private/ for readability ###
# False positive rates for k = 1..12 hash functions, taken from
# http://pages.cs.wisc.edu/~cao/papers/summary-cache/node8.html
# Row [k] is indexed by m/n (bits per element); row [0] is a placeholder.
# Iterate along the sequence at position [k] until the error rate is below the
# specified one; if no entry qualifies, report an error.
#
type
TErrorForK = seq[float] # Error rates for a single k, indexed by m/n (bits per element).
TAllErrorRates* = array[0..12, TErrorForK] # One row of error rates per k in 0..12.
# Lookup table of false positive rates: `kErrors[k][mOverN]` is the tabulated
# rate for `k` hash functions at `mOverN` bits per element. Entries of 1.0
# mark combinations for which no usable rate is tabulated.
let kErrors*: TAllErrorRates = [
# k = 0 (placeholder row)
@[1.0],
# k = 1
@[1.0, 1.0, 0.3930000000, 0.2830000000, 0.2210000000, 0.1810000000,
0.1540000000, 0.1330000000, 0.1180000000, 0.1050000000, 0.0952000000,
0.0869000000, 0.0800000000, 0.0740000000, 0.0689000000, 0.0645000000,
0.0606000000, 0.0571000000, 0.0540000000, 0.0513000000, 0.0488000000,
0.0465000000, 0.0444000000, 0.0425000000, 0.0408000000, 0.0392000000,
0.0377000000, 0.0364000000, 0.0351000000, 0.0339000000, 0.0328000000,
0.0317000000, 0.0308000000],
# k = 2
@[1.0, 1.0, 0.4000000000, 0.2370000000, 0.1550000000, 0.1090000000,
0.0804000000, 0.0618000000, 0.0489000000, 0.0397000000, 0.0329000000,
0.0276000000, 0.0236000000, 0.0203000000, 0.0177000000, 0.0156000000,
0.0138000000, 0.0123000000, 0.0111000000, 0.0099800000, 0.0090600000,
0.0082500000, 0.0075500000, 0.0069400000, 0.0063900000, 0.0059100000,
0.0054800000, 0.0051000000, 0.0047500000, 0.0044400000, 0.0041600000,
0.0039000000, 0.0036700000],
# k = 3
@[1.0, 1.0, 1.0, 0.2530000000, 0.1470000000, 0.0920000000, 0.0609000000,
0.0423000000, 0.0306000000, 0.0228000000, 0.0174000000, 0.0136000000,
0.0108000000, 0.0087500000, 0.0071800000, 0.0059600000, 0.0050000000,
0.0042300000, 0.0036200000, 0.0031200000, 0.0027000000, 0.0023600000,
0.0020700000, 0.0018300000, 0.0016200000, 0.0014500000, 0.0012900000,
0.0011600000, 0.0010500000, 0.0009490000, 0.0008620000, 0.0007850000,
0.0007170000],
# k = 4
@[1.0, 1.0, 1.0, 1.0, 0.1600000000, 0.0920000000, 0.0561000000, 0.0359000000,
0.0240000000, 0.0166000000, 0.0118000000, 0.0086400000, 0.0064600000,
0.0049200000, 0.0038100000, 0.0030000000, 0.0023900000, 0.0019300000,
0.0015800000, 0.0013000000, 0.0010800000, 0.0009050000, 0.0007640000,
0.0006490000, 0.0005550000, 0.0004780000, 0.0004130000, 0.0003590000,
0.0003140000, 0.0002760000, 0.0002430000, 0.0002150000, 0.0001910000],
# k = 5
@[1.0, 1.0, 1.0, 1.0, 1.0, 0.1010000000, 0.0578000000, 0.0347000000,
0.0217000000, 0.0141000000, 0.0094300000, 0.0065000000, 0.0045900000,
0.0033200000, 0.0024400000, 0.0018300000, 0.0013900000, 0.0010700000,
0.0008390000, 0.0006630000, 0.0005300000, 0.0004270000, 0.0003470000,
0.0002850000, 0.0002350000, 0.0001960000, 0.0001640000, 0.0001380000,
0.0001170000, 0.0000996000, 0.0000853000, 0.0000733000, 0.0000633000],
# k = 6
@[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0638000000, 0.0364000000, 0.0216000000,
0.0133000000, 0.0084400000, 0.0055200000, 0.0037100000, 0.0025500000,
0.0017900000, 0.0012800000, 0.0009350000, 0.0006920000, 0.0005190000,
0.0003940000, 0.0003030000, 0.0002360000, 0.0001850000, 0.0001470000,
0.0001170000, 0.0000944000, 0.0000766000, 0.0000626000, 0.0000515000,
0.0000426000, 0.0000355000, 0.0000297000, 0.0000250000],
# k = 7
@[1.0, 1.0, 1.0,
1.0, 1.0, 1.0, 1.0, 1.0, 0.0229000000, 0.0135000000, 0.0081900000,
0.0051300000, 0.0032900000, 0.0021700000, 0.0014600000, 0.0010000000,
0.0007020000, 0.0004990000, 0.0003600000, 0.0002640000, 0.0001960000,
0.0001470000, 0.0001120000, 0.0000856000, 0.0000663000, 0.0000518000,
0.0000408000, 0.0000324000, 0.0000259000, 0.0000209000, 0.0000169000,
0.0000138000, 0.0000113000],
# k = 8
@[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
1.0, 0.0145000000, 0.0084600000, 0.0050900000, 0.0031400000, 0.0019900000,
0.0012900000, 0.0008520000, 0.0005740000, 0.0003940000, 0.0002750000,
0.0001940000, 0.0001400000, 0.0001010000, 0.0000746000, 0.0000555000,
0.0000417000, 0.0000316000, 0.0000242000, 0.0000187000, 0.0000146000,
0.0000114000, 0.0000090100, 0.0000071600, 0.0000057300],
# k = 9
@[1.0, 1.0, 1.0,
1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0053100000, 0.0031700000,
0.0019400000, 0.0012100000, 0.0007750000, 0.0005050000, 0.0003350000,
0.0002260000, 0.0001550000, 0.0001080000, 0.0000759000, 0.0000542000,
0.0000392000, 0.0000286000, 0.0000211000, 0.0000157000, 0.0000118000,
0.0000089600, 0.0000068500, 0.0000052800, 0.0000041000, 0.0000032000],
# k = 10
@[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0033400000,
0.0019800000, 0.0012000000, 0.0007440000, 0.0004700000, 0.0003020000,
0.0001980000, 0.0001320000, 0.0000889000, 0.0000609000, 0.0000423000,
0.0000297000, 0.0000211000, 0.0000152000, 0.0000110000, 0.0000080700,
0.0000059700, 0.0000044500, 0.0000033500, 0.0000025400, 0.0000019400],
# k = 11
@[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
0.0021000000, 0.0012400000, 0.0007470000, 0.0004590000, 0.0002870000,
0.0001830000, 0.0001180000, 0.0000777000, 0.0000518000, 0.0000350000,
0.0000240000, 0.0000166000, 0.0000116000, 0.0000082300, 0.0000058900,
0.0000042500, 0.0000031000, 0.0000022800, 0.0000016900, 0.0000012600],
# k = 12
@[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
0.0007780000, 0.0004660000, 0.0002840000, 0.0001760000, 0.0001110000,
0.0000712000, 0.0000463000, 0.0000305000, 0.0000204000, 0.0000138000,
0.0000094200, 0.0000065200, 0.0000045600, 0.0000032200, 0.0000022900,
0.0000016500, 0.0000012000, 0.0000008740]
]

142
tests/test_bloom.nim Normal file
View File

@ -0,0 +1,142 @@
import unittest, results, strutils
import ../src/bloom
from random import rand, randomize
# Main suite: sizing, insert/lookup behaviour, false positive rate and the
# k / bits-per-element lookup table.
suite "bloom filter":
# Runs before every test: builds a filter for 10000 elements at a 0.1% target
# error rate and fills it with 10000 random 8-character strings.
setup:
let nElementsToTest = 10000
let bfResult = initializeBloomFilter(capacity = nElementsToTest, errorRate = 0.001)
check bfResult.isOk
var bf = bfResult.get
randomize(2882) # Seed the RNG
var
sampleChars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
testElements = newSeq[string](nElementsToTest)
for i in 0..<nElementsToTest:
var newString = ""
for j in 0..7:
# NOTE(review): rand(51) only yields indices 0..51, i.e. the 52 letters;
# the digits at the end of sampleChars are never drawn - confirm intent.
newString.add(sampleChars[rand(51)])
testElements[i] = newString
for item in testElements:
bf.insert(item)
test "initialization parameters":
check bf.capacity == nElementsToTest
check bf.errorRate == 0.001
check bf.kHashes == 10
check bf.mBits div bf.capacity == 15 # bits per element
test "basic operations":
# `bf` was already filled by setup, so this relies on "nonexistent" not
# being a false positive.
check bf.lookup("nonexistent") == false # Item that was never inserted
let bf2Result = initializeBloomFilter(100, 0.01)
check bf2Result.isOk
var bf2 = bf2Result.get
bf2.insert("test string")
check bf2.lookup("test string") == true
check bf2.lookup("different string") == false
test "error rate":
# Looks up strings that cannot have been inserted and compares the observed
# false positive rate with the configured one.
var falsePositives = 0
let testSize = nElementsToTest div 2
for i in 0..<testSize:
var testString = ""
for j in 0..8: # Different length than setup
testString.add(sampleChars[rand(51)])
if bf.lookup(testString):
falsePositives.inc()
let actualErrorRate = falsePositives.float / testSize.float
check actualErrorRate < bf.errorRate * 1.5 # Allow some margin
test "perfect recall":
# Every inserted element must be found: no false negatives.
var lookupErrors = 0
for item in testElements:
if not bf.lookup(item):
lookupErrors.inc()
check lookupErrors == 0
test "k/m bits specification":
# Test error case for k > 12
let errorCase = getMOverNBitsForK(k = 13, targetError = 0.01)
check errorCase.isErr
check errorCase.error == "K must be <= 12 if forceNBitsPerElem is not also specified."
# Test error case for unachievable error rate
let errorCase2 = getMOverNBitsForK(k = 2, targetError = 0.00001)
check errorCase2.isErr
check errorCase2.error == "Specified value of k and error rate not achievable using less than 4 bytes / element."
# Test success cases
let case1 = getMOverNBitsForK(k = 2, targetError = 0.1)
check case1.isOk
check case1.value == 6
let case2 = getMOverNBitsForK(k = 7, targetError = 0.01)
check case2.isOk
check case2.value == 10
let case3 = getMOverNBitsForK(k = 7, targetError = 0.001)
check case3.isOk
check case3.value == 16
# Explicit k together with a forced bits-per-element value bypasses the table
let bf2Result = initializeBloomFilter(10000, 0.001, k = 4, forceNBitsPerElem = 20)
check bf2Result.isOk
let bf2 = bf2Result.get
check bf2.kHashes == 4
check bf2.mBits == 200000
test "string representation":
let bf3Result = initializeBloomFilter(1000, 0.01, k = 4)
check bf3Result.isOk
let bf3 = bf3Result.get
let str = $bf3
check str.contains("1000") # Capacity
check str.contains("4 hash") # Hash functions
check str.contains("1.0e-02") # Error rate in scientific notation
suite "bloom filter special cases":
  test "different patterns of strings":
    ## Exercises unusual inputs (very long, special-character, Unicode and
    ## repetitive strings), then checks recall and the false positive rate
    ## on generated strings.
    const testSize = 10_000
    let patterns = @[
      "shortstr",
      repeat("a", 1000), # Very long string
      "special@#$%^&*()", # Special characters
      "unicode→★∑≈", # Unicode characters
      repeat("pattern", 10) # Repeating pattern
    ]
    let bfResult = initializeBloomFilter(testSize, 0.01)
    check bfResult.isOk
    var bf = bfResult.get
    var inserted = newSeq[string](testSize)
    # Test pattern handling
    for pattern in patterns:
      bf.insert(pattern)
      # `check` instead of `assert`: it is reported by unittest and is not
      # compiled out when assertions are disabled.
      let found = bf.lookup(pattern)
      if not found:
        checkpoint("failed lookup pattern: " & pattern)
      check found
    # Test general insertion and lookup
    for i in 0..<testSize:
      inserted[i] = $i & "test" & $rand(1000)
      bf.insert(inserted[i])
    # Verify all insertions
    var lookupErrors = 0
    for item in inserted:
      if not bf.lookup(item):
        lookupErrors.inc()
    check lookupErrors == 0
    # Check false positive rate
    var falsePositives = 0
    let fpTestSize = testSize div 2
    for i in 0..<fpTestSize:
      let testItem = "notpresent" & $i & $rand(1000)
      if bf.lookup(testItem):
        falsePositives.inc()
    let fpRate = falsePositives.float / fpTestSize.float
    check fpRate < bf.errorRate * 1.5 # Allow some margin but should be close to target