diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..cfc9510
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,6 @@
+nimcache
+nimcache/*
+tests/bloom
+nim-bloom/bloom
+.DS_Store
+src/.DS_Store
\ No newline at end of file
diff --git a/src/bloom.nim b/src/bloom.nim
new file mode 100644
index 0000000..92b0712
--- /dev/null
+++ b/src/bloom.nim
@@ -0,0 +1,123 @@
+from math import ceil, ln, pow, round
+import hashes
+import strutils
+import results
+import private/probabilities
+
+type
+  BloomFilter* = object
+    ## Bloom filter configuration plus its backing bit storage.
+    capacity*: int ## Expected number of elements, as given at construction
+    errorRate*: float ## Requested false positive rate, as given at construction
+    kHashes*: int ## Number of hash functions used per item
+    mBits*: int ## Total number of bits (capacity * bits per element)
+    intArray: seq[int] ## Backing storage, allocated as whole ints covering mBits
+
+{.push overflowChecks: off.} # Turn off overflow checks for hashing operations
+
+proc hashN(item: string, n: int, maxValue: int): int =
+  ## Get the nth hash using Nim's built-in hash function using
+  ## the double hashing technique from Kirsch and Mitzenmacher, 2008:
+  ## http://www.eecs.harvard.edu/~kirsch/pubs/bbbf/rsa.pdf
+  ##
+  ## - item: the string to hash
+  ## - n: index of the requested hash; multiplies the second base hash
+  ## - maxValue: modulus applied to both base hashes and to the result;
+  ##   it is used as a `mod` divisor, so it must not be 0
+  ##
+  ## Returns `abs(hashA + n * hashB) mod maxValue`.
+  # NOTE(review): with overflow checks off, abs(low(int)) presumably stays
+  # negative, so hashA/hashB could be negative in that one case; the final
+  # abs below looks like it still normalises the result -- confirm.
+  let
+    hashA = abs(hash(item)) mod maxValue # Use abs to handle negative hashes
+    hashB = abs(hash(item & " b")) mod maxValue # string concatenation
+  abs((hashA + n * hashB)) mod maxValue
+  # Disabled alternative, kept for reference (`h` is not defined in this
+  # proc; presumably it stands for hash(item) -- confirm before enabling):
+  # # Use bit rotation for second hash instead of string concatenation if speed is preferred over FP-rate
+  # # Rotate left by 21 bits (lower the rotation, higher the speed but higher the FP-rate too)
+  # hashB = abs(
+  #   ((h shl 21) or (h shr (sizeof(int) * 8 - 21)))
+  # ) mod maxValue
+  # abs((hashA + n.int64 * hashB)) mod maxValue
+
+{.pop.}
+
+proc getMOverNBitsForK*(k: int, targetError: float,
+    probabilityTable = kErrors): Result[int, string] =
+  ## Returns the optimal number of m/n bits for a given k.
+  ##
+  ## Scans `probabilityTable[k]` from m/n = 2 upwards and returns the first
+  ## ratio whose tabulated error is below `targetError`. Returns an error
+  ## when `k` is outside 0..12 or when no tabulated ratio qualifies.
+  if k notin 0..12:
+    return err("K must be <= 12 if forceNBitsPerElem is not also specified.")
+
+  for mOverN in 2..probabilityTable[k].high:
+    if probabilityTable[k][mOverN] < targetError:
+      return ok(mOverN)
+
+  err("Specified value of k and error rate not achievable using less than 4 bytes / element.")
+
+proc initializeBloomFilter*(capacity: int, errorRate: float, k = 0,
+                            forceNBitsPerElem = 0): Result[BloomFilter, string] =
+  ## Initializes a Bloom filter with specified parameters.
+  ##
+  ## Parameters:
+  ## - capacity: Expected number of elements to be inserted. Must be >= 1.
+  ## - errorRate: Desired false positive rate (e.g., 0.01 for 1%). Must lie
+  ##   strictly between 0 and 1 when k is not specified, because the sizing
+  ##   formula takes its logarithm.
+  ## - k: Optional number of hash functions. If < 1, k and the bits per
+  ##   element are both calculated from errorRate and forceNBitsPerElem is
+  ##   not used.
+  ##   See http://pages.cs.wisc.edu/~cao/papers/summary-cache/node8.html for
+  ##   useful tables on k and m/n (n bits per element) combinations.
+  ## - forceNBitsPerElem: Optional override for bits per element, used only
+  ##   together with an explicit k. If < 1, the value is looked up with
+  ##   getMOverNBitsForK.
+  ##
+  ## Returns the new filter, or an error string when a parameter is invalid
+  ## or the lookup for the requested k / errorRate fails.
+
+  # A non-positive capacity yields mBits <= 0, which later becomes a zero or
+  # negative divisor/modulus; reject it here instead of failing on first use.
+  if capacity < 1:
+    return err("Capacity must be at least 1.")
+
+  var
+    kHashes: int
+    nBitsPerElem: int
+
+  if k < 1: # Calculate optimal k and use that
+    # ln(errorRate) is only finite and negative inside (0, 1). Written as a
+    # negated conjunction so that NaN is rejected as well.
+    if not (errorRate > 0.0 and errorRate < 1.0):
+      return err("Error rate must be strictly between 0 and 1 when k is not specified.")
+    let bitsPerElem = ceil(-1.0 * (ln(errorRate) / (pow(ln(2.float), 2))))
+    kHashes = round(ln(2.float) * bitsPerElem).int
+    nBitsPerElem = round(bitsPerElem).int
+  else: # Use specified k if possible
+    if forceNBitsPerElem < 1: # Use lookup table
+      let mOverNRes = getMOverNBitsForK(k = k, targetError = errorRate)
+      if mOverNRes.isErr:
+        return err(mOverNRes.error)
+      nBitsPerElem = mOverNRes.value
+    else:
+      nBitsPerElem = forceNBitsPerElem
+    kHashes = k
+
+  let
+    mBits = capacity * nBitsPerElem
+    # One extra int so that the bits left over after the integer division
+    # still have storage.
+    mInts = 1 + mBits div (sizeof(int) * 8)
+
+  ok(BloomFilter(
+    capacity: capacity,
+    errorRate: errorRate,
+    kHashes: kHashes,
+    mBits: mBits,
+    intArray: newSeq[int](mInts)
+  ))
+
+proc `$`*(bf: BloomFilter): string =
+  ## Prints the configuration of the Bloom filter.
+  "Bloom filter with $1 capacity, $2 error rate, $3 hash functions, and requiring $4 bits of memory."
% + [$bf.capacity, + formatFloat(bf.errorRate, format = ffScientific, precision = 1), + $bf.kHashes, + $(bf.mBits div bf.capacity)] + +proc computeHashes(bf: BloomFilter, item: string): seq[int] = + var hashes = newSeq[int](bf.kHashes) + for i in 0.. 12 + let errorCase = getMOverNBitsForK(k = 13, targetError = 0.01) + check errorCase.isErr + check errorCase.error == "K must be <= 12 if forceNBitsPerElem is not also specified." + + # Test error case for unachievable error rate + let errorCase2 = getMOverNBitsForK(k = 2, targetError = 0.00001) + check errorCase2.isErr + check errorCase2.error == "Specified value of k and error rate not achievable using less than 4 bytes / element." + + # Test success cases + let case1 = getMOverNBitsForK(k = 2, targetError = 0.1) + check case1.isOk + check case1.value == 6 + + let case2 = getMOverNBitsForK(k = 7, targetError = 0.01) + check case2.isOk + check case2.value == 10 + + let case3 = getMOverNBitsForK(k = 7, targetError = 0.001) + check case3.isOk + check case3.value == 16 + + let bf2Result = initializeBloomFilter(10000, 0.001, k = 4, forceNBitsPerElem = 20) + check bf2Result.isOk + let bf2 = bf2Result.get + check bf2.kHashes == 4 + check bf2.mBits == 200000 + + test "string representation": + let bf3Result = initializeBloomFilter(1000, 0.01, k = 4) + check bf3Result.isOk + let bf3 = bf3Result.get + let str = $bf3 + check str.contains("1000") # Capacity + check str.contains("4 hash") # Hash functions + check str.contains("1.0e-02") # Error rate in scientific notation + +suite "bloom filter special cases": + test "different patterns of strings": + const testSize = 10_000 + let patterns = @[ + "shortstr", + repeat("a", 1000), # Very long string + "special@#$%^&*()", # Special characters + "unicode→★∑≈", # Unicode characters + repeat("pattern", 10) # Repeating pattern + ] + + let bfResult = initializeBloomFilter(testSize, 0.01) + check bfResult.isOk + var bf = bfResult.get + var inserted = newSeq[string](testSize) + + # Test 
pattern handling + for pattern in patterns: + bf.insert(pattern) + assert bf.lookup(pattern), "failed lookup pattern: " & pattern + + # Test general insertion and lookup + for i in 0..