Mirror of https://github.com/logos-messaging/nim-sds.git, synced 2026-01-03 22:53:12 +00:00
feat: std vs custom hash selection, add more tests
chore: update readme and some fixes from review
This commit is contained in:
parent 5d065c168b
commit 8006af303c
2 .gitignore vendored Normal file
@@ -0,0 +1,2 @@
.DS_Store
3 nim-bloom/.gitignore vendored
@@ -4,4 +4,5 @@ tests/test
bloom
*.html
*.css
/.DS_Store
.DS_Store
src/.DS_Store
@@ -1,41 +1,95 @@
nim-bloom
============
# nim-bloom

Bloom filter implementation in Nim. Uses a C implementation of MurmurHash3 for optimal speed and numeric distribution.
A high-performance Bloom filter implementation in Nim. Supports both Nim's built-in MurmurHash2 (default) and an optional 128-bit MurmurHash3 implementation for large-scale use cases.

On a 10-year-old MacBook Pro Retina the test case for 10M insertions executes in ~4.0 seconds and 10M lookups in ~3.5 seconds for a Bloom filter with a 1 in 1000 error rate (0.001). This is ~2.5M insertions/sec and ~2.9M lookups/sec on a single thread (when passing the `-d:release` flag to the Nim compiler and thus activating the C compiler's optimizations). If k is lowered to 5 or 6 vs. a larger "optimal" number, performance further increases to ~4M ops/sec. Note that this test is for a Bloom filter ~20-25MB in size and thus accurately reflects the cost of main memory accesses (vs. a smaller filter that might fit solely in L3 cache, for example, and can achieve several million additional ops/sec).
## Features

- Fast string element insertion and lookup
- Configurable error rates
- Choice between standard Nim hash (MurmurHash2) and extended 128-bit MurmurHash3
- Optimized for both small and large-scale use cases
- Comprehensive test suite

Currently supports inserting and looking up string elements. Forthcoming features include:
* Support for other types beyond strings
* Support for iterables in the insert method
* Persistence
## Performance

Historical benchmark using the MurmurHash3 implementation on a 10-year-old MacBook Pro Retina:
- ~2.5M insertions/sec (~4.0 seconds for 10M insertions)
- ~2.9M lookups/sec (~3.5 seconds for 10M lookups)
- Test configuration: 0.001 error rate, Bloom filter size ~20-25MB
- Compiled with `-d:release` flag

quickstart
====
Quick functionality demo:
```
These numbers reflect performance outside of CPU cache, as the filter size was intentionally larger than L3 cache. Performance can be several million operations/sec higher with smaller filters that fit in cache.

Current performance will vary based on:
- Choice of hash function (standard Nim hash vs extended MurmurHash3)
- Hardware specifications
- Data size and memory access patterns
- Compiler optimizations

The default configuration (using Nim's built-in hash) is optimized for typical use cases, while the extended hash option (MurmurHash3) provides better collision resistance for large-scale applications at a slight performance cost.

## Quickstart

Basic usage:
```nim
import bloom

# Initialize with default hash (suitable for most uses)
var bf = initializeBloomFilter(capacity = 10000, errorRate = 0.001)
echo bf # Get characteristics of the Bloom filter
echo bf.lookup("An element not in the Bloom filter") # Prints 'false'
bf.insert("Here we go...")
assert(bf.lookup("Here we go..."))
echo bf # Print Bloom filter characteristics
echo bf.lookup("test") # false
bf.insert("test")
assert bf.lookup("test") # true

# For large-scale usage (>1M elements), consider using extended hash
var largeBf = initializeBloomFilter(
capacity = 2_000_000,
errorRate = 0.001,
useExtendedHash = true
)
```
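
For a rough sense of memory use: at a 0.001 error rate the filter allocates about 15 bits per element, so the 2,000,000-element filter above occupies roughly 30 million bits, i.e. on the order of 3.75 MB of bit storage, plus a small constant overhead.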

## Advanced Configuration

By default, the Bloom filter will use a mathematically optimal number of k hash functions, which minimizes the amount of error per bit of storage required. In many cases, however, it may be advantageous to specify a smaller value of k in order to save time hashing. This is supported by passing an explicit `k` parameter, which will then create a Bloom filter sized optimally for the specified error rate where possible.[1]
The Bloom filter can be configured in several ways:

[1] If `k` <= 12 and the number of required bytes per element is <= 4. If either of these conditions doesn't hold, a fully manual Bloom filter can be constructed by passing both `k` and `force_n_bits_per_elem`.

Example:
1. Default initialization (automatically calculates optimal parameters):
```nim
var bf = initializeBloomFilter(capacity = 10000, errorRate = 0.001)
```
var bf2 = initializeBloomFilter(capacity = 10000, errorRate = 0.001, k = 5)
assert bf2.kHashes == 5
assert bf2.nBitsPerElem == 18

var bf3 = initializeBloomFilter(capacity = 10000, errorRate = 0.001, k = 5, forceNBitsPerElem = 12)
assert bf3.kHashes == 5
assert bf3.nBitsPerElem == 12 # Note, however, that bf3.errorRate will *not* be correct
2. Specify custom number of hash functions:
```nim
var bf = initializeBloomFilter(
capacity = 10000,
errorRate = 0.001,
k = 5 # Use 5 hash functions instead of calculated optimal
)
```

3. Fully manual configuration:
```nim
var bf = initializeBloomFilter(
capacity = 10000,
errorRate = 0.001,
k = 5,
forceNBitsPerElem = 12,
useExtendedHash = false # Use standard hash (default)
)
```

Note: When specifying `k`, it must be ≤ 12 unless `forceNBitsPerElem` is also specified. The implementation will raise a `BloomFilterError` if the requested combination of parameters cannot be satisfied.
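
For intuition, the automatically calculated parameters follow the standard Bloom filter formulas: bits per element m/n = -ln(p) / ln(2)^2 and k = ln(2) * (m/n). The sketch below redoes that arithmetic for the 0.001 error rate used above; the variable names are illustrative and not part of the library API:

```nim
import math

let
  errorRate = 0.001
  bitsPerElem = ceil(-ln(errorRate) / pow(ln(2.0), 2))  # ceil(14.38) -> 15.0
  kHashes = round(ln(2.0) * bitsPerElem).int            # round(10.4) -> 10

echo bitsPerElem  # 15.0
echo kHashes      # 10
```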

## Hash Function Selection

- Default: Uses Nim's built-in hash (MurmurHash2), suitable for most use cases
- Extended: Uses 128-bit MurmurHash3, better for large sets (>1M elements) where collision resistance is critical

Choose extended hash by setting `useExtendedHash = true` during initialization.

## Testing

Run the test suite:
```bash
nimble test
```
@@ -3,11 +3,11 @@ import hashes
import strutils
import private/probabilities

# Import MurmurHash3 code and compile at the same time as Nim code
# Import MurmurHash3 code for large-scale use cases
{.compile: "murmur3.c".}

type
BloomFilterError = object of CatchableError
BloomFilterError* = object of CatchableError
MurmurHashes = array[0..1, int]
BloomFilter* = object
capacity*: int
@@ -15,28 +15,27 @@ type
kHashes*: int
mBits*: int
intArray: seq[int]
nBitsPerElem*: int
useMurmurHash*: bool
useExtendedHash*: bool # Use 128-bit MurmurHash3 for very large filters

{.push overflowChecks: off.} # Turn off overflow checks for hashing operations

proc rawMurmurHash(key: cstring, len: int, seed: uint32,
outHashes: var MurmurHashes): void {.
importc: "MurmurHash3_x64_128".}

proc murmurHash(key: string, seed = 0'u32): MurmurHashes =
rawMurmurHash(key, key.len, seed, outHashes = result)

proc hashA(item: string, maxValue: int): int =
hash(item) mod maxValue

proc hashB(item: string, maxValue: int): int =
hash(item & " b") mod maxValue
rawMurmurHash(key, key.len, seed, result)

proc hashN(item: string, n: int, maxValue: int): int =
## Get the nth hash of a string using the formula hashA + n * hashB
## which uses 2 hash functions vs. k and has comparable properties
## See Kirsch and Mitzenmacher, 2008:
## Get the nth hash using Nim's built-in hash function using
## the double hashing technique from Kirsch and Mitzenmacher, 2008:
## http://www.eecs.harvard.edu/~kirsch/pubs/bbbf/rsa.pdf
abs((hashA(item, maxValue) + n * hashB(item, maxValue))) mod maxValue
let
hashA = abs(hash(item)) mod maxValue # Use abs to handle negative hashes
hashB = abs(hash(item & " b")) mod maxValue
abs((hashA + n * hashB)) mod maxValue

{.pop.} # Restore overflow checks

proc getMOverNBitsForK(k: int, targetError: float,
probabilityTable = kErrors): int =
@@ -54,28 +53,19 @@ proc getMOverNBitsForK(k: int, targetError: float,

proc initializeBloomFilter*(capacity: int, errorRate: float, k = 0,
forceNBitsPerElem = 0,
useMurmurHash = true): BloomFilter =
useExtendedHash = false): BloomFilter =
## Initializes a Bloom filter, using a specified ``capacity``,
## ``errorRate``, and – optionally – specific number of k hash functions.
## If ``kHashes`` is < 1 (default argument is 0), ``kHashes`` will be
## optimally calculated on the fly. Otherwise, ``kHashes`` will be set to
## the passed integer, which requires that ``forceNBitsPerElem`` is
## also set to be greater than 0. Otherwise a ``BloomFilterError``
## exception is raised.
## See http://pages.cs.wisc.edu/~cao/papers/summary-cache/node8.html for
## useful tables on k and m/n (n bits per element) combinations.
##
## The Bloom filter uses the MurmurHash3 implementation by default,
## though it can fall back to using the built-in nim ``hash`` function
## if ``useMurmurHash = false``. This is compiled alongside the Nim
## code using the ``{.compile.}`` pragma.
## optimally calculated. If capacity > 1M elements, consider setting
## useExtendedHash = true to use 128-bit MurmurHash3 for better
## collision resistance.
var
kHashes: int
bitsPerElem: float
nBitsPerElem: int

if k < 1: # Calculate optimal k and use that
bitsPerElem = ceil(-1.0 * (ln(errorRate) / (pow(ln(2.float), 2))))
let bitsPerElem = ceil(-1.0 * (ln(errorRate) / (pow(ln(2.float), 2))))
kHashes = round(ln(2.float) * bitsPerElem).int
nBitsPerElem = round(bitsPerElem).int
else: # Use specified k if possible
@@ -89,42 +79,42 @@ proc initializeBloomFilter*(capacity: int, errorRate: float, k = 0,
mBits = capacity * nBitsPerElem
mInts = 1 + mBits div (sizeof(int) * 8)
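# For illustration: with capacity = 10_000 and nBitsPerElem = 20 (the values
# exercised by the "force params" test), mBits = 200_000 and, on a 64-bit build
# where sizeof(int) == 8, mInts = 1 + 200_000 div 64 = 3126, i.e. roughly 24 KiB
# of backing storage.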

BloomFilter(capacity: capacity, errorRate: errorRate, kHashes: kHashes,
mBits: mBits, intArray: newSeq[int](mInts), nBitsPerElem: nBitsPerElem,
useMurmurHash: useMurmurHash)
BloomFilter(
capacity: capacity,
errorRate: errorRate,
kHashes: kHashes,
mBits: mBits,
intArray: newSeq[int](mInts),
nBitsPerElem: nBitsPerElem,
useExtendedHash: useExtendedHash
)

proc `$`*(bf: BloomFilter): string =
## Prints the capacity, set error rate, number of k hash functions,
## and total bits of memory allocated by the Bloom filter.
"Bloom filter with $1 capacity, $2 error rate, $3 hash functions, and requiring $4 bits per stored element." %
"Bloom filter with $1 capacity, $2 error rate, $3 hash functions, and requiring $4 bits of memory." %
[$bf.capacity,
formatFloat(bf.errorRate, format = ffScientific, precision = 1),
$bf.kHashes, $bf.nBitsPerElem]
$bf.kHashes,
$(bf.mBits div bf.capacity)]
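# For example, a filter created with capacity 1000, errorRate 0.01 and k = 4
# (11 bits per element) renders as something like:
#   Bloom filter with 1000 capacity, 1.0e-02 error rate, 4 hash functions, and requiring 11 bits of memory.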

{.push overflowChecks: off.}
{.push overflowChecks: off.} # Turn off overflow checks for hash computations

proc hashMurmur(bf: BloomFilter, key: string): seq[int] =
result.newSeq(bf.kHashes)
let murmurHashes = murmurHash(key, seed = 0'u32)
for i in 0..<bf.kHashes:
result[i] = abs(murmurHashes[0] + i * murmurHashes[1]) mod bf.mBits

{.pop.}

proc hashNim(bf: BloomFilter, key: string): seq[int] =
result.newSeq(bf.kHashes)
for i in 0..<bf.kHashes:
result[i] = hashN(key, i, bf.mBits)

proc hash(bf: BloomFilter, key: string): seq[int] =
if bf.useMurmurHash:
bf.hashMurmur(key)
proc computeHashes(bf: BloomFilter, item: string): seq[int] =
var hashes = newSeq[int](bf.kHashes)
if bf.useExtendedHash:
let murmurHashes = murmurHash(item, 0'u32)
for i in 0..<bf.kHashes:
hashes[i] = abs((murmurHashes[0] + i.int64 * murmurHashes[1].int64).int) mod bf.mBits
else:
bf.hashNim(key)
for i in 0..<bf.kHashes:
hashes[i] = hashN(item, i, bf.mBits)
hashes

{.pop.} # Restore overflow checks

proc insert*(bf: var BloomFilter, item: string) =
## Insert an item (string) into the Bloom filter.
var hashSet = bf.hash(item)
let hashSet = bf.computeHashes(item)
for h in hashSet:
let
intAddress = h div (sizeof(int) * 8)
@@ -136,7 +126,7 @@ proc lookup*(bf: BloomFilter, item: string): bool =
## If the item is present, ``lookup`` is guaranteed to return ``true``.
## If the item is not present, ``lookup`` will return ``false``
## with a probability 1 - ``bf.errorRate``.
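# For example, with errorRate = 0.001 an absent item is falsely reported as
# present on roughly 1 in 1000 lookups on average.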
var hashSet = bf.hash(item)
let hashSet = bf.computeHashes(item)
for h in hashSet:
let
intAddress = h div (sizeof(int) * 8)
@@ -144,101 +134,4 @@ proc lookup*(bf: BloomFilter, item: string): bool =
currentInt = bf.intArray[intAddress]
if currentInt != (currentInt or (1 shl bitOffset)):
return false
return true

when isMainModule:
from random import rand, randomize
import times

# Test murmurhash 3
echo("Testing MurmurHash3 code...")
var hashOutputs: MurmurHashes
hashOutputs = [0, 0]
rawMurmurHash("hello", 5, 0, hashOutputs)
assert int(hashOutputs[0]) == -3758069500696749310 # Correct murmur outputs (cast to int64)
assert int(hashOutputs[1]) == 6565844092913065241

let hashOutputs2 = murmurHash("hello", 0)
assert hashOutputs2[0] == hashOutputs[0]
assert hashOutputs2[1] == hashOutputs[1]
let hashOutputs3 = murmurHash("hello", 10)
assert hashOutputs3[0] != hashOutputs[0]
assert hashOutputs3[1] != hashOutputs[1]

# Some quick and dirty tests (not complete)
var nElementsToTest = 100000
var bf = initializeBloomFilter(nElementsToTest, 0.001)
assert(bf of BloomFilter)
echo(bf)

var bf2 = initializeBloomFilter(10000, 0.001, k = 4,
forceNBitsPerElem = 20)
assert(bf2 of BloomFilter)
echo(bf2)

echo("Testing insertions and lookups...")
echo("Test element in BF2?: ", bf2.lookup("testing"))
echo("Inserting element.")
bf2.insert("testing")
echo("Test element in BF2?: ", bf2.lookup("testing"))
assert(bf2.lookup("testing"))

# Now test for speed with bf
randomize(2882) # Seed the RNG
var
sampleChars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
kTestElements, sampleLetters: seq[string]
kTestElements = newSeq[string](nElementsToTest)
sampleLetters = newSeq[string](62)

for i in 0..(nElementsToTest - 1):
var newString = ""
for j in 0..7:
newString.add(sampleChars[rand(51)])
kTestElements[i] = newString

var startTime, endTime: float
startTime = cpuTime()
for i in 0..(nElementsToTest - 1):
bf.insert(kTestElements[i])
endTime = cpuTime()
echo("Took ", formatFloat(endTime - startTime, format = ffDecimal,
precision = 4), " seconds to insert ", nElementsToTest, " items.")

var falsePositives = 0
for i in 0..(nElementsToTest - 1):
var falsePositiveString = ""
for j in 0..8: # By definition not in bf as 9 chars not 8
falsePositiveString.add(sampleChars[rand(51)])
if bf.lookup(falsePositiveString):
falsePositives += 1

echo("N false positives (of ", nElementsToTest, " lookups): ", falsePositives)
echo("False positive rate ", formatFloat(falsePositives / nElementsToTest,
format = ffDecimal, precision = 4))

var lookupErrors = 0
startTime = cpuTime()
for i in 0..(nElementsToTest - 1):
if not bf.lookup(kTestElements[i]):
lookupErrors += 1
endTime = cpuTime()
echo("Took ", formatFloat(endTime - startTime, format = ffDecimal,
precision = 4), " seconds to lookup ", nElementsToTest, " items.")

echo("N lookup errors (should be 0): ", lookupErrors)

# Finally test correct k / mOverN specification,
# first case raises an error, second works
try:
discard getMOverNBitsForK(k = 2, targetError = 0.00001)
assert false
except BloomFilterError:
assert true

assert getMOverNBitsForK(k = 2, targetError = 0.1) == 6
assert getMOverNBitsForK(k = 7, targetError = 0.01) == 10
assert getMOverNBitsForK(k = 7, targetError = 0.001) == 16

var bf3 = initializeBloomFilter(1000, 0.01, k = 4)
assert bf3.nBitsPerElem == 11
true
@@ -4,7 +4,7 @@ from random import rand, randomize
import times

suite "murmur":
# Test murmurhash 3
# Test murmurhash 3 when enabled
setup:
var hashOutputs: MurmurHashes
hashOutputs = [0, 0]
@@ -24,18 +24,85 @@ suite "murmur":
check hashOutputs3[0] != hashOutputs[0]
check hashOutputs3[1] != hashOutputs[1]

suite "hashing comparison":
test "hash distribution":
const testSize = 10000
var standardCollisions = 0
var extendedCollisions = 0

var bfStandard = initializeBloomFilter(testSize, 0.01, useExtendedHash = false)
var bfExtended = initializeBloomFilter(testSize, 0.01, useExtendedHash = true)

# Generate test data
var testData = newSeq[string](testSize)
for i in 0..<testSize:
testData[i] = $i & "salt" & $rand(1000000)

# Test standard hash
var startTime = cpuTime()
for item in testData:
bfStandard.insert(item)
let standardTime = cpuTime() - startTime

# Test extended hash
startTime = cpuTime()
for item in testData:
bfExtended.insert(item)
let extendedTime = cpuTime() - startTime

echo "Standard hash time: ", standardTime
echo "Extended hash time: ", extendedTime

test "hash implementation switch":
# Create two filters with different hash implementations
let standardBf = initializeBloomFilter(1000, 0.01, useExtendedHash = false)
let murmurBf = initializeBloomFilter(1000, 0.01, useExtendedHash = true)

# Insert same elements
let testData = ["test1", "test2", "test3", "test4", "test5"]
for item in testData:
var stdBf = standardBf # Create mutable copies
var murBf = murmurBf
stdBf.insert(item)
murBf.insert(item)

# Verify both can find their items
check stdBf.lookup(item)
check murBf.lookup(item)

# Verify false positives work as expected for both
let nonExistentItem = "definitely-not-in-filter"
var falsePositiveStd = standardBf.lookup(nonExistentItem)
var falsePositiveMur = murmurBf.lookup(nonExistentItem)

# Both should maintain their error rates
# Run multiple times to get a sample
var fpCountStd = 0
var fpCountMur = 0
for i in 0..1000:
let testItem = "test-" & $i
if standardBf.lookup(testItem): fpCountStd += 1
if murmurBf.lookup(testItem): fpCountMur += 1

# Both should have similar false positive rates within reasonable bounds
let fpRateStd = fpCountStd.float / 1000.0
let fpRateMur = fpCountMur.float / 1000.0

check abs(fpRateStd - fpRateMur) < 0.01 # Should be reasonably close
check fpRateStd < standardBf.errorRate * 1.5 # Should not exceed target error rate by too much
check fpRateMur < murmurBf.errorRate * 1.5

echo "Standard hash false positive rate: ", fpRateStd
echo "Murmur hash false positive rate: ", fpRateMur

suite "bloom":

setup:
let nElementsToTest = 100000
var bf = initializeBloomFilter(capacity = nElementsToTest, errorRate = 0.001)
randomize(2882) # Seed the RNG
var
sampleChars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
kTestElements, sampleLetters: seq[string]
kTestElements = newSeq[string](nElementsToTest)
sampleLetters = newSeq[string](62)
kTestElements = newSeq[string](nElementsToTest)

for i in 0..<nElementsToTest:
var newString = ""
@@ -46,51 +113,83 @@ suite "bloom":
for i in 0..<nElementsToTest:
bf.insert(kTestElements[i])

test "params":
test "init parameters":
check(bf.capacity == nElementsToTest)
check(bf.errorRate == 0.001)
check(bf.kHashes == 10)
check(bf.nBitsPerElem == 15)
check(bf.mBits == 15 * nElementsToTest)
check(bf.useMurmurHash == true)

test "not hit":
test "hash mode selection":
let bf1 = initializeBloomFilter(100, 0.01)
check(bf1.useExtendedHash == false)

let bf2 = initializeBloomFilter(100, 0.01, useExtendedHash = true)
check(bf2.useExtendedHash == true)

test "basic operations":
# Test empty lookup
check(bf.lookup("nothing") == false)

# Test insert and lookup
bf.insert("teststring")
check(bf.lookup("teststring") == true)

# Test multiple inserts
bf.insert("test1")
bf.insert("test2")
check(bf.lookup("test1") == true)
check(bf.lookup("test2") == true)
check(bf.lookup("test3") == false)

test "hit":
bf.insert("hit")
check(bf.lookup("hit") == true)

test "force params":
var bf2 = initializeBloomFilter(10000, 0.001, k = 4, forceNBitsPerElem = 20)
check(bf2.capacity == 10000)
check(bf2.errorRate == 0.001)
check(bf2.kHashes == 4)
check(bf2.nBitsPerElem == 20)
check(bf2.mBits == 200000)
check(bf2.useMurmurHash == true)
test "large scale performance":
let largeSize = 1_000_000
var standardBf = initializeBloomFilter(largeSize, 0.001, useExtendedHash = false)
var extendedBf = initializeBloomFilter(largeSize, 0.001, useExtendedHash = true)

var largeData = newSeq[string](1000)
for i in 0..<1000:
largeData[i] = $i & "test" & $rand(1000000)

# Insert and measure false positives for both
var startTime = cpuTime()
for item in largeData:
standardBf.insert(item)
let standardTime = cpuTime() - startTime

startTime = cpuTime()
for item in largeData:
extendedBf.insert(item)
let extendedTime = cpuTime() - startTime

echo "Standard hash large insert time: ", standardTime
echo "Extended hash large insert time: ", extendedTime

test "error rate":
var falsePositives = 0
for i in 0..<nElementsToTest:
var falsePositiveString = ""
for j in 0..8: # By definition not in bf as 9 chars not 8
for j in 0..8:
falsePositiveString.add(sampleChars[rand(51)])
if bf.lookup(falsePositiveString):
falsePositives += 1

check falsePositives / nElementsToTest < bf.errorRate
let actualErrorRate = falsePositives.float / nElementsToTest.float
check actualErrorRate < bf.errorRate
echo "Actual error rate: ", actualErrorRate
echo "Target error rate: ", bf.errorRate

test "lookup errors":
test "lookup reliability":
var lookupErrors = 0
let startTime = cpuTime()
for i in 0..<nElementsToTest:
if not bf.lookup(kTestElements[i]):
lookupErrors += 1
let endTime = cpuTime()

check lookupErrors == 0
echo "Lookup time for ", nElementsToTest, " items: ", formatFloat(endTime - startTime, format = ffDecimal, precision = 4), " seconds"

# Finally test correct k / mOverN specification,
test "k/(m/n) spec":
test "k/(m/n) specification":
expect(BloomFilterError):
discard getMOverNBitsForK(k = 2, targetError = 0.00001)
@@ -98,5 +197,23 @@ suite "bloom":
check getMOverNBitsForK(k = 7, targetError = 0.01) == 10
check getMOverNBitsForK(k = 7, targetError = 0.001) == 16

var bf3 = initializeBloomFilter(1000, 0.01, k = 4)
check bf3.nBitsPerElem == 11
test "force params":
var bf2 = initializeBloomFilter(10000, 0.001, k = 4, forceNBitsPerElem = 20)
check(bf2.capacity == 10000)
check(bf2.errorRate == 0.001)
check(bf2.kHashes == 4)
check(bf2.mBits == 200000)

test "init error cases":
expect(BloomFilterError):
discard initializeBloomFilter(1000, 0.00001, k = 2)

expect(BloomFilterError):
discard initializeBloomFilter(1000, 0.00001, k = 13)

test "string representation":
let bf3 = initializeBloomFilter(1000, 0.01, k = 4)
let str = $bf3
check str.contains("1000")
check str.contains("4 hash functions")
check str.contains("1.0e-02") # 0.01 in scientific notation