# nim-sds/src/bloom.nim

from math import ceil, ln, pow, round
import hashes
import strutils
import results
import private/probabilities

type
  BloomFilter* = object
    ## State of a Bloom filter: its sizing parameters together with the
    ## packed bit storage that items are hashed into.
    capacity*: int ## Number of elements the filter was sized for.
    errorRate*: float ## False positive rate requested at creation.
    kHashes*: int ## Number of hash positions computed per item.
    mBits*: int ## Total number of addressable bits in the filter.
    intArray*: seq[int] ## Bit storage; ``sizeof(int) * 8`` bits per entry.

{.push overflowChecks: off.} # Hash arithmetic below is allowed to wrap around

proc hashN(item: string, n: int, maxValue: int): int =
  ## Derives the ``n``-th hash of ``item``, reduced modulo ``maxValue``.
  ##
  ## Two base hashes are taken with Nim's built-in ``hash`` (one of the item,
  ## one of the item with a fixed suffix appended) and combined as
  ## ``hashA + n * hashB``, the double hashing technique from Kirsch and
  ## Mitzenmacher, 2008:
  ## http://www.eecs.harvard.edu/~kirsch/pubs/bbbf/rsa.pdf
  ##
  ## ``abs`` is applied because ``hash`` may return negative values.
  let primary = abs(hash(item)) mod maxValue
  let secondary = abs(hash(item & " b")) mod maxValue # suffix gives a 2nd hash
  result = abs(primary + n * secondary) mod maxValue
  # Note kept from the original author: if speed is preferred over the
  # false-positive rate, the second hash could instead be obtained by rotating
  # the first hash left by 21 bits rather than hashing a concatenated string
  # (a smaller rotation is faster but raises the false-positive rate).

{.pop.}

proc getMOverNBitsForK*(
    k: int, targetError: float, probabilityTable = kErrors
): Result[int, string] =
  ## Finds the smallest m/n (bits per element) that meets ``targetError``
  ## when ``k`` hash functions are used.
  ##
  ## The row of ``probabilityTable`` for ``k`` is scanned from m/n = 2
  ## upwards and the first m/n whose tabulated error is below ``targetError``
  ## is returned. An error is returned when ``k`` is outside ``0 .. 12`` or
  ## when no entry of the row is below ``targetError``.
  if k < 0 or k > 12:
    return err("K must be <= 12 if forceNBitsPerElem is not also specified.")

  var candidate = 2
  while candidate <= probabilityTable[k].high:
    if probabilityTable[k][candidate] < targetError:
      return ok(candidate)
    inc candidate

  err(
    "Specified value of k and error rate not achievable using less than 4 bytes / element."
  )

proc initializeBloomFilter*(
    capacity: int, errorRate: float, k = 0, forceNBitsPerElem = 0
): Result[BloomFilter, string] =
  ## Initializes a Bloom filter with specified parameters.
  ##
  ## Parameters:
  ## - capacity: Expected number of elements to be inserted. Must be >= 1.
  ## - errorRate: Desired false positive rate (e.g., 0.01 for 1%). Must lie
  ##   strictly between 0 and 1 when k is calculated automatically.
  ## - k: Optional number of hash functions. If < 1, calculated optimally
  ##   from errorRate.
  ##   See http://pages.cs.wisc.edu/~cao/papers/summary-cache/node8.html for
  ##   useful tables on k and m/n (n bits per element) combinations.
  ## - forceNBitsPerElem: Optional override for bits per element. It is only
  ##   honoured together with an explicit k (k >= 1); if it is < 1 the value
  ##   is taken from the lookup table via ``getMOverNBitsForK``.
  ##
  ## Returns an error (instead of a filter that would fail on first use) when
  ## capacity is < 1, when errorRate is unusable for the automatic
  ## calculation, when the lookup table cannot satisfy k / errorRate, or when
  ## the total number of bits would not fit in an ``int``.
  if capacity < 1:
    # A zero-bit filter would make every later hash computation divide by 0.
    return err("Capacity must be at least 1.")

  var
    kHashes: int
    nBitsPerElem: int

  if k < 1: # Calculate optimal k and use that
    # ln() needs a value in (0, 1) to yield a positive, finite bit count;
    # written as a negated conjunction so that NaN is rejected as well.
    if not (errorRate > 0.0 and errorRate < 1.0):
      return err(
        "Error rate must be strictly between 0 and 1 when k is not specified."
      )
    let bitsPerElem = ceil(-1.0 * (ln(errorRate) / (pow(ln(2.float), 2))))
    kHashes = round(ln(2.float) * bitsPerElem).int
    nBitsPerElem = round(bitsPerElem).int
  else: # Use specified k if possible
    if forceNBitsPerElem < 1: # Use lookup table
      let mOverNRes = getMOverNBitsForK(k = k, targetError = errorRate)
      if mOverNRes.isErr:
        return err(mOverNRes.error)
      nBitsPerElem = mOverNRes.value
    else:
      nBitsPerElem = forceNBitsPerElem
    kHashes = k

  # nBitsPerElem is >= 1 on every path above, so the division is safe; the
  # check keeps capacity * nBitsPerElem from overflowing.
  if capacity > high(int) div nBitsPerElem:
    return err("Capacity times bits per element exceeds the supported range.")

  let
    mBits = capacity * nBitsPerElem
    mInts = 1 + mBits div (sizeof(int) * 8) # one spare word covers the remainder

  ok(
    BloomFilter(
      capacity: capacity,
      errorRate: errorRate,
      kHashes: kHashes,
      mBits: mBits,
      intArray: newSeq[int](mInts),
    )
  )

proc `$`*(bf: BloomFilter): string =
  ## Renders the configuration of the Bloom filter as one human-readable line.
  ##
  ## The last figure is the number of bits allocated per expected element
  ## (``mBits div capacity``). For a filter whose capacity is 0, such as a
  ## default-initialized ``BloomFilter``, it is reported as 0 instead of
  ## raising a division-by-zero defect.
  let bitsPerElem =
    if bf.capacity != 0:
      bf.mBits div bf.capacity
    else:
      0
  "Bloom filter with $1 capacity, $2 error rate, $3 hash functions, and requiring $4 bits of memory per element." %
    [
      $bf.capacity,
      formatFloat(bf.errorRate, format = ffScientific, precision = 1),
      $bf.kHashes,
      $bitsPerElem,
    ]

proc computeHashes(bf: BloomFilter, item: string): seq[int] =
  ## Returns the ``bf.kHashes`` bit positions for ``item``, i.e. ``hashN``
  ## evaluated for n = 0 ..< ``bf.kHashes`` with ``bf.mBits`` as the modulus.
  result = newSeqOfCap[int](bf.kHashes)
  for idx in 0 ..< bf.kHashes:
    result.add hashN(item, idx, bf.mBits)

proc insert*(bf: var BloomFilter, item: string) =
  ## Adds ``item`` (a string) to the Bloom filter by setting the bit at each
  ## of its hash positions.
  const wordBits = sizeof(int) * 8
  for bitIndex in bf.computeHashes(item):
    let word = bitIndex div wordBits # which storage entry holds the bit
    let mask = 1 shl (bitIndex mod wordBits) # the bit inside that entry
    bf.intArray[word] = bf.intArray[word] or mask

proc lookup*(bf: BloomFilter, item: string): bool =
  ## Tests whether ``item`` (a string) may have been inserted into the filter.
  ## An item that was inserted always yields ``true``. For an item that was
  ## not inserted, the result is ``false`` with a probability of
  ## 1 - ``bf.errorRate``.
  const wordBits = sizeof(int) * 8
  for bitIndex in bf.computeHashes(item):
    let mask = 1 shl (bitIndex mod wordBits)
    # A single cleared bit proves the item was never inserted.
    if (bf.intArray[bitIndex div wordBits] and mask) == 0:
      return false
  true