feat: add bloom filter (#3)

2026-01-02 14:13:07 +00:00 · 2025-01-13 13:49:28 +04:00 · 2025-01-13 13:49:28 +04:00 · 5df71ad3ea
commit 5df71ad3ea
parent a83dcc0331
4 changed files with 369 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,6 @@
 nimcache
 nimcache/*
 tests/bloom
 nim-bloom/bloom
 .DS_Store
 src/.DS_Store
--- a/src/bloom.nim
+++ b/src/bloom.nim
@ -0,0 +1,123 @@
 from math import ceil, ln, pow, round
 import hashes
 import strutils
 import results
 import private/probabilities
 type
  BloomFilter* = object
    ## Bloom filter over strings whose bits are packed into a sequence of ints.
    capacity*: int  ## Expected number of elements, as given to the initializer.
    errorRate*: float  ## Target false positive rate, as given to the initializer.
    kHashes*: int  ## Number of bit positions computed for every item.
    mBits*: int  ## Size of the bit space; hash positions are reduced modulo this.
    intArray: seq[int]  ## Bit storage; each int holds `sizeof(int) * 8` bits.
 {.push overflowChecks: off.}  # Let the hash arithmetic below wrap around freely
 proc hashN(item: string, n: int, maxValue: int): int =
  ## Returns the `n`-th hash of `item`, reduced modulo `maxValue`.
  ##
  ## Two base hashes are taken from Nim's built-in `hash` and combined as
  ## `first + n * second`, i.e. the double hashing technique from Kirsch and
  ## Mitzenmacher, 2008: http://www.eecs.harvard.edu/~kirsch/pubs/bbbf/rsa.pdf
  # `abs` folds negative hash values back into the non-negative range.
  let first = abs(hash(item)) mod maxValue
  # The second base hash is the hash of the item with a fixed suffix appended.
  let second = abs(hash(item & " b")) mod maxValue
  # Alternative kept for reference: if speed is preferred over false-positive
  # rate, the second hash can be derived by rotating the first hash `h` left by
  # 21 bits instead of hashing a concatenated string (a lower rotation is
  # faster but raises the false-positive rate further):
  #   second = abs((h shl 21) or (h shr (sizeof(int) * 8 - 21))) mod maxValue
  let combined = first + n * second
  result = abs(combined) mod maxValue
 {.pop.}
 proc getMOverNBitsForK*(k: int, targetError: float,
    probabilityTable = kErrors): Result[int, string] =
  ## Finds the smallest m/n ratio (bits per element) at which `k` hash
  ## functions give an error rate strictly below `targetError`, according to
  ## the row for `k` in `probabilityTable`.
  ##
  ## Returns an error when `k` lies outside 0..12, or when no entry of the row
  ## (searched from ratio 2 upwards) is below `targetError`.
  if k < 0 or k > 12:
    return err("K must be <= 12 if forceNBitsPerElem is not also specified.")
  let lastRatio = probabilityTable[k].high
  var ratio = 2  # ratios 0 and 1 are never candidates
  while ratio <= lastRatio:
    if probabilityTable[k][ratio] < targetError:
      return ok(ratio)
    inc ratio
  err("Specified value of k and error rate not achievable using less than 4 bytes / element.")
 proc initializeBloomFilter*(capacity: int, errorRate: float, k = 0,
                              forceNBitsPerElem = 0): Result[BloomFilter, string] =
  ## Initializes a Bloom filter with the specified parameters.
  ##
  ## Parameters:
  ## - capacity: Expected number of elements to be inserted; must be >= 1.
  ## - errorRate: Desired false positive rate (e.g., 0.01 for 1%). When `k` is
  ##   not given it must lie strictly between 0 and 1.
  ## - k: Optional number of hash functions. If < 1, both k and the bits per
  ##   element are calculated from `errorRate` and `forceNBitsPerElem` is
  ##   ignored.
  ## - forceNBitsPerElem: Optional override for bits per element, honoured only
  ##   together with an explicit `k`. If < 1, the bits per element are taken
  ##   from `getMOverNBitsForK`.
  ##
  ## See http://pages.cs.wisc.edu/~cao/papers/summary-cache/node8.html for
  ## useful tables on k and m/n (n bits per element) combinations.
  ##
  ## Returns the new, empty filter, or an error message when `capacity` or
  ## `errorRate` is out of range or the `k` / `errorRate` lookup fails.
  if capacity < 1:
    # A non-positive capacity would give mBits <= 0, which later breaks the
    # modulo in hashing and the division in `$`.
    return err("Capacity must be at least 1.")
  var
    kHashes: int
    nBitsPerElem: int
  if k < 1: # Calculate optimal k and use that
    # ln(errorRate) is finite and negative only for rates strictly inside
    # (0, 1); written so that NaN is rejected as well.
    if not (errorRate > 0.0 and errorRate < 1.0):
      return err("Error rate must be strictly between 0 and 1.")
    let bitsPerElem = ceil(-1.0 * (ln(errorRate) / (pow(ln(2.float), 2))))
    kHashes = round(ln(2.float) * bitsPerElem).int
    nBitsPerElem = round(bitsPerElem).int
  else: # Use specified k if possible
    if forceNBitsPerElem < 1: # Use lookup table
      let mOverNRes = getMOverNBitsForK(k = k, targetError = errorRate)
      if mOverNRes.isErr:
        return err(mOverNRes.error)
      nBitsPerElem = mOverNRes.value
    else:
      nBitsPerElem = forceNBitsPerElem
    kHashes = k
  let
    mBits = capacity * nBitsPerElem
    # Number of ints needed to hold mBits bits (always at least one).
    mInts = 1 + mBits div (sizeof(int) * 8)
  ok(BloomFilter(
    capacity: capacity,
    errorRate: errorRate,
    kHashes: kHashes,
    mBits: mBits,
    intArray: newSeq[int](mInts)
  ))
 proc `$`*(bf: BloomFilter): string =
  ## Returns a one-line summary of the filter's configuration: capacity,
  ## error rate (scientific notation), number of hash functions and the
  ## number of bits per element.
  # mBits div capacity is a per-element figure, so the text says so. The
  # division is guarded so a zero-capacity filter can still be rendered.
  let bitsPerElem = if bf.capacity != 0: bf.mBits div bf.capacity else: 0
  "Bloom filter with $1 capacity, $2 error rate, $3 hash functions, and requiring $4 bits per element." %
    [$bf.capacity,
     formatFloat(bf.errorRate, format = ffScientific, precision = 1),
     $bf.kHashes,
     $bitsPerElem]
 proc computeHashes(bf: BloomFilter, item: string): seq[int] =
  ## Returns the `bf.kHashes` bit positions of `item`, in hash-index order.
  ## Each position comes from `hashN` with `bf.mBits` as the modulus.
  result = newSeq[int](bf.kHashes)
  for nth, slot in result.mpairs:
    slot = hashN(item, nth, bf.mBits)
 proc insert*(bf: var BloomFilter, item: string) =
  ## Adds `item` (a string) to the Bloom filter by setting every bit position
  ## computed for it.
  const bitsPerWord = sizeof(int) * 8
  for pos in bf.computeHashes(item):
    # Split the bit position into the int that holds it and the bit inside it.
    let word = pos div bitsPerWord
    let mask = 1 shl (pos mod bitsPerWord)
    bf.intArray[word] = bf.intArray[word] or mask
 proc lookup*(bf: BloomFilter, item: string): bool =
  ## Checks whether `item` (a string) may be in the Bloom filter.
  ## An item that was inserted always yields ``true``.
  ## An item that was not inserted yields ``false``
  ## with a probability 1 - ``bf.errorRate``.
  const bitsPerWord = sizeof(int) * 8
  result = true
  for pos in bf.computeHashes(item):
    let word = bf.intArray[pos div bitsPerWord]
    let mask = 1 shl (pos mod bitsPerWord)
    # A single unset bit proves the item was never inserted.
    if (word and mask) == 0:
      return false
--- a/src/private/probabilities.nim
+++ b/src/private/probabilities.nim
@ -0,0 +1,98 @@
 #
 # ### Probability table declaration, kept in private/ for readability ###
 # Error rates for k = 1..12 hash functions, taken from
 # http://pages.cs.wisc.edu/~cao/papers/summary-cache/node8.html
 # To use the table, iterate along the sequence at position [k] until the error rate is
 # below the specified one; if no entry qualifies, report an error.
 #
 type
  # Error rates for a single k; the position in the sequence is the m/n ratio
  # (bits per element).
  TErrorForK = seq[float]
  # One error-rate sequence per number of hash functions k, for k in 0..12.
  TAllErrorRates* = array[0..12, TErrorForK]
 # Error-rate lookup table: kErrors[k][mOverN] is the tabulated error rate for k
 # hash functions at mOverN bits per element. Entries of 1.0 can never be below a
 # target error rate that is itself at most 1.0, so lookups skip past them.
 # NOTE(review): the 1.0 entries look like filler for combinations the source
 # table does not list - confirm against the referenced page.
 let kErrors*: TAllErrorRates = [
  @[1.0],  # k = 0: single entry only
  @[1.0, 1.0, 0.3930000000, 0.2830000000, 0.2210000000, 0.1810000000,  # k = 1
      0.1540000000, 0.1330000000, 0.1180000000, 0.1050000000, 0.0952000000,
      0.0869000000, 0.0800000000, 0.0740000000, 0.0689000000, 0.0645000000,
      0.0606000000, 0.0571000000, 0.0540000000, 0.0513000000, 0.0488000000,
      0.0465000000, 0.0444000000, 0.0425000000, 0.0408000000, 0.0392000000,
      0.0377000000, 0.0364000000, 0.0351000000, 0.0339000000, 0.0328000000,
      0.0317000000, 0.0308000000],
  @[1.0, 1.0, 0.4000000000, 0.2370000000, 0.1550000000, 0.1090000000,  # k = 2
      0.0804000000, 0.0618000000, 0.0489000000, 0.0397000000, 0.0329000000,
      0.0276000000, 0.0236000000, 0.0203000000, 0.0177000000, 0.0156000000,
      0.0138000000, 0.0123000000, 0.0111000000, 0.0099800000, 0.0090600000,
      0.0082500000, 0.0075500000, 0.0069400000, 0.0063900000, 0.0059100000,
      0.0054800000, 0.0051000000, 0.0047500000, 0.0044400000, 0.0041600000,
      0.0039000000, 0.0036700000],
  @[1.0, 1.0, 1.0, 0.2530000000, 0.1470000000, 0.0920000000, 0.0609000000,  # k = 3
      0.0423000000, 0.0306000000, 0.0228000000, 0.0174000000, 0.0136000000,
      0.0108000000, 0.0087500000, 0.0071800000, 0.0059600000, 0.0050000000,
      0.0042300000, 0.0036200000, 0.0031200000, 0.0027000000, 0.0023600000,
      0.0020700000, 0.0018300000, 0.0016200000, 0.0014500000, 0.0012900000,
      0.0011600000, 0.0010500000, 0.0009490000, 0.0008620000, 0.0007850000,
      0.0007170000],
  @[1.0, 1.0, 1.0, 1.0, 0.1600000000, 0.0920000000, 0.0561000000, 0.0359000000,  # k = 4
      0.0240000000, 0.0166000000, 0.0118000000, 0.0086400000, 0.0064600000,
      0.0049200000, 0.0038100000, 0.0030000000, 0.0023900000, 0.0019300000,
      0.0015800000, 0.0013000000, 0.0010800000, 0.0009050000, 0.0007640000,
      0.0006490000, 0.0005550000, 0.0004780000, 0.0004130000, 0.0003590000,
      0.0003140000, 0.0002760000, 0.0002430000, 0.0002150000, 0.0001910000],
  @[1.0, 1.0, 1.0, 1.0, 1.0, 0.1010000000, 0.0578000000, 0.0347000000,  # k = 5
      0.0217000000, 0.0141000000, 0.0094300000, 0.0065000000, 0.0045900000,
      0.0033200000, 0.0024400000, 0.0018300000, 0.0013900000, 0.0010700000,
      0.0008390000, 0.0006630000, 0.0005300000, 0.0004270000, 0.0003470000,
      0.0002850000, 0.0002350000, 0.0001960000, 0.0001640000, 0.0001380000,
      0.0001170000, 0.0000996000, 0.0000853000, 0.0000733000, 0.0000633000],
  @[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0638000000, 0.0364000000, 0.0216000000,  # k = 6
      0.0133000000, 0.0084400000, 0.0055200000, 0.0037100000, 0.0025500000,
      0.0017900000, 0.0012800000, 0.0009350000, 0.0006920000, 0.0005190000,
      0.0003940000, 0.0003030000, 0.0002360000, 0.0001850000, 0.0001470000,
      0.0001170000, 0.0000944000, 0.0000766000, 0.0000626000, 0.0000515000,
      0.0000426000, 0.0000355000, 0.0000297000, 0.0000250000],
  @[1.0, 1.0, 1.0,  # k = 7
      1.0, 1.0, 1.0, 1.0, 1.0, 0.0229000000, 0.0135000000, 0.0081900000,
      0.0051300000, 0.0032900000, 0.0021700000, 0.0014600000, 0.0010000000,
      0.0007020000, 0.0004990000, 0.0003600000, 0.0002640000, 0.0001960000,
      0.0001470000, 0.0001120000, 0.0000856000, 0.0000663000, 0.0000518000,
      0.0000408000, 0.0000324000, 0.0000259000, 0.0000209000, 0.0000169000,
      0.0000138000, 0.0000113000],
  @[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,  # k = 8
      1.0, 0.0145000000, 0.0084600000, 0.0050900000, 0.0031400000, 0.0019900000,
      0.0012900000, 0.0008520000, 0.0005740000, 0.0003940000, 0.0002750000,
      0.0001940000, 0.0001400000, 0.0001010000, 0.0000746000, 0.0000555000,
      0.0000417000, 0.0000316000, 0.0000242000, 0.0000187000, 0.0000146000,
      0.0000114000, 0.0000090100, 0.0000071600, 0.0000057300],
  @[1.0, 1.0, 1.0,  # k = 9
      1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0053100000, 0.0031700000,
      0.0019400000, 0.0012100000, 0.0007750000, 0.0005050000, 0.0003350000,
      0.0002260000, 0.0001550000, 0.0001080000, 0.0000759000, 0.0000542000,
      0.0000392000, 0.0000286000, 0.0000211000, 0.0000157000, 0.0000118000,
      0.0000089600, 0.0000068500, 0.0000052800, 0.0000041000, 0.0000032000],
  @[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0033400000,  # k = 10
      0.0019800000, 0.0012000000, 0.0007440000, 0.0004700000, 0.0003020000,
      0.0001980000, 0.0001320000, 0.0000889000, 0.0000609000, 0.0000423000,
      0.0000297000, 0.0000211000, 0.0000152000, 0.0000110000, 0.0000080700,
      0.0000059700, 0.0000044500, 0.0000033500, 0.0000025400, 0.0000019400],
  @[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,  # k = 11
      0.0021000000, 0.0012400000, 0.0007470000, 0.0004590000, 0.0002870000,
      0.0001830000, 0.0001180000, 0.0000777000, 0.0000518000, 0.0000350000,
      0.0000240000, 0.0000166000, 0.0000116000, 0.0000082300, 0.0000058900,
      0.0000042500, 0.0000031000, 0.0000022800, 0.0000016900, 0.0000012600],
  @[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,  # k = 12
      0.0007780000, 0.0004660000, 0.0002840000, 0.0001760000, 0.0001110000,
      0.0000712000, 0.0000463000, 0.0000305000, 0.0000204000, 0.0000138000,
      0.0000094200, 0.0000065200, 0.0000045600, 0.0000032200, 0.0000022900,
      0.0000016500, 0.0000012000, 0.0000008740]
 ]
--- a/tests/test_bloom.nim
+++ b/tests/test_bloom.nim
@ -0,0 +1,142 @@
 import unittest, results, strutils
 import ../src/bloom
 from random import rand, randomize
 # Main test suite: every test starts from a freshly built filter that already
 # contains `nElementsToTest` random 8-character strings (see `setup`).
 suite "bloom filter":
  setup:
    # Runs before each test: build the filter, then fill it with random strings.
    let nElementsToTest = 10000
    let bfResult = initializeBloomFilter(capacity = nElementsToTest, errorRate = 0.001)
    check bfResult.isOk
    var bf = bfResult.get
    randomize(2882) # Fixed seed so the generated strings are reproducible
    var
      sampleChars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
      testElements = newSeq[string](nElementsToTest)
    for i in 0..<nElementsToTest:
      var newString = ""
      for j in 0..7:
        # NOTE(review): rand(51) only indexes the 52 letters, so the digits in
        # sampleChars are never picked - confirm this is intended.
        newString.add(sampleChars[rand(51)])
      testElements[i] = newString
    for item in testElements:
      bf.insert(item)
  test "initialization parameters":
    # With no explicit k, k and the bits per element are derived from errorRate.
    check bf.capacity == nElementsToTest
    check bf.errorRate == 0.001
    check bf.kHashes == 10
    check bf.mBits div bf.capacity == 15  # bits per element
  test "basic operations":
    # `bf` already holds the setup elements, so this expects that a key which
    # was never inserted is not reported as a false positive.
    check bf.lookup("nonexistent") == false
    let bf2Result = initializeBloomFilter(100, 0.01)
    check bf2Result.isOk
    var bf2 = bf2Result.get
    bf2.insert("test string")
    check bf2.lookup("test string") == true
    check bf2.lookup("different string") == false
  test "error rate":
    # Look up strings that cannot have been inserted and count the hits.
    var falsePositives = 0
    let testSize = nElementsToTest div 2
    for i in 0..<testSize:
      var testString = ""
      for j in 0..8:  # 9 characters: a different length than the setup strings
        testString.add(sampleChars[rand(51)])
      if bf.lookup(testString):
        falsePositives.inc()
    let actualErrorRate = falsePositives.float / testSize.float
    check actualErrorRate < bf.errorRate * 1.5  # Allow some margin
  test "perfect recall":
    # Every inserted element must be found again (no false negatives).
    var lookupErrors = 0
    for item in testElements:
      if not bf.lookup(item):
        lookupErrors.inc()
    check lookupErrors == 0
  test "k/m bits specification":
    # Test error case for k > 12
    let errorCase = getMOverNBitsForK(k = 13, targetError = 0.01)
    check errorCase.isErr
    check errorCase.error == "K must be <= 12 if forceNBitsPerElem is not also specified."
    # Test error case for unachievable error rate
    let errorCase2 = getMOverNBitsForK(k = 2, targetError = 0.00001)
    check errorCase2.isErr
    check errorCase2.error == "Specified value of k and error rate not achievable using less than 4 bytes / element."
    # Test success cases
    let case1 = getMOverNBitsForK(k = 2, targetError = 0.1)
    check case1.isOk
    check case1.value == 6
    let case2 = getMOverNBitsForK(k = 7, targetError = 0.01)
    check case2.isOk
    check case2.value == 10
    let case3 = getMOverNBitsForK(k = 7, targetError = 0.001)
    check case3.isOk
    check case3.value == 16
    # Explicit k together with forced bits per element: mBits = 10000 * 20
    let bf2Result = initializeBloomFilter(10000, 0.001, k = 4, forceNBitsPerElem = 20)
    check bf2Result.isOk
    let bf2 = bf2Result.get
    check bf2.kHashes == 4
    check bf2.mBits == 200000
  test "string representation":
    let bf3Result = initializeBloomFilter(1000, 0.01, k = 4)
    check bf3Result.isOk
    let bf3 = bf3Result.get
    let str = $bf3
    check str.contains("1000")  # Capacity
    check str.contains("4 hash")  # Hash functions
    check str.contains("1.0e-02")  # Error rate in scientific notation
 # Exercises the filter with unusual string shapes, then re-checks recall and
 # the false positive rate on generated keys.
 suite "bloom filter special cases":
  test "different patterns of strings":
    const testSize = 10_000
    let patterns = @[
      "shortstr",
      repeat("a", 1000),  # Very long string
      "special@#$%^&*()",  # Special characters
      "unicode→★∑≈",  # Unicode characters
      repeat("pattern", 10)  # Repeating pattern
    ]
    let bfResult = initializeBloomFilter(testSize, 0.01)
    check bfResult.isOk
    var bf = bfResult.get
    var inserted = newSeq[string](testSize)
    # Test pattern handling
    for pattern in patterns:
      bf.insert(pattern)
      # `check` (unlike `assert`) is never compiled out and reports through
      # unittest, printing the offending pattern on failure.
      check bf.lookup(pattern)
    # Test general insertion and lookup
    for i in 0..<testSize:
      inserted[i] = $i & "test" & $rand(1000)
      bf.insert(inserted[i])
    # Verify all insertions
    var lookupErrors = 0
    for item in inserted:
      if not bf.lookup(item):
        lookupErrors.inc()
    check lookupErrors == 0
    # Check false positive rate on keys that were never inserted
    var falsePositives = 0
    let fpTestSize = testSize div 2
    for i in 0..<fpTestSize:
      let testItem = "notpresent" & $i & $rand(1000)
      if bf.lookup(testItem):
        falsePositives.inc()
    let fpRate = falsePositives.float / fpTestSize.float
    check fpRate < bf.errorRate * 1.5  # Allow some margin but should be close to target