nim-sds/sds/bloom.nim

from math import ceil, ln, pow, round
import hashes
import strutils
import results
import private/probabilities
import ./types/bloom_filter
export bloom_filter

{.push overflowChecks: off.} # Turn off overflow checks for hashing operations

proc hashN(item: string, n: int, maxValue: int): int =
  ## Computes the `n`-th hash of `item`, reduced modulo `maxValue`.
  ##
  ## Two base hashes are taken with Nim's built-in `hash` (one of `item`,
  ## one of `item` with " b" appended) and combined as `first + n * second`,
  ## the double hashing technique from Kirsch and Mitzenmacher, 2008:
  ## http://www.eecs.harvard.edu/~kirsch/pubs/bbbf/rsa.pdf
  let salted = item & " b" # second input, derived from the first
  # `hash` may return a negative value, hence the `abs` before reducing.
  let first = abs(hash(item)) mod maxValue
  let second = abs(hash(salted)) mod maxValue
  let combined = first + n * second
  return abs(combined) mod maxValue

{.pop.}

proc getMOverNBitsForK*(
    k: int, targetError: float, probabilityTable = kErrors
): Result[int, string] =
  ## Looks up the number of bits per element (m/n) to use with `k` hash
  ## functions.
  ##
  ## Row `k` of `probabilityTable` is scanned upwards from index 2 and the
  ## first index whose entry is below `targetError` is returned. An error is
  ## returned when `k` is outside 0 .. 12 or when no entry in the row is
  ## below `targetError`.
  if k < 0 or k > 12:
    return err("K must be <= 12 if forceNBitsPerElem is not also specified.")

  let lastRatio = probabilityTable[k].high
  var ratio = 2
  while ratio <= lastRatio:
    if targetError > probabilityTable[k][ratio]:
      return ok(ratio)
    inc ratio

  return err(
    "Specified value of k and error rate not achievable using less than 4 bytes / element."
  )

proc initializeBloomFilter*(
    capacity: int, errorRate: float, k = 0, forceNBitsPerElem = 0
): Result[BloomFilter, string] =
  ## Initializes a Bloom filter with specified parameters.
  ##
  ## Parameters:
  ## - capacity: Expected number of elements to be inserted; must be >= 1
  ## - errorRate: Desired false positive rate (e.g., 0.01 for 1%). When `k`
  ##   is not given it must lie in (0, 1], since the sizing is derived from it
  ## - k: Optional number of hash functions. If < 1, calculated optimally
  ##   See http://pages.cs.wisc.edu/~cao/papers/summary-cache/node8.html for
  ##   useful tables on k and m/n (n bits per element) combinations.
  ## - forceNBitsPerElem: Optional override for bits per element; only
  ##   consulted when `k` >= 1. If < 1, the lookup table is used instead
  ##
  ## Returns an error (rather than a filter that fails on first use) when
  ## the arguments are out of range, when the lookup table cannot satisfy
  ## `k` / `errorRate`, or when the bit array size would overflow an int.
  if capacity < 1:
    # A non-positive capacity yields mBits <= 0, which makes hashing
    # (mod mBits) and `$` (div capacity) fail later on.
    return err("Capacity must be at least 1.")

  var
    kHashes: int
    nBitsPerElem: int

  if k < 1: # Calculate optimal k and use that
    # Negated conjunction so that NaN is rejected too. Values outside this
    # range make ln() produce NaN/-inf or a negative size.
    if not (errorRate > 0.0 and errorRate <= 1.0):
      return err("Error rate must be in the range (0, 1] when k is not specified.")
    let bitsPerElem = ceil(-1.0 * (ln(errorRate) / (pow(ln(2.float), 2))))
    kHashes = round(ln(2.float) * bitsPerElem).int
    nBitsPerElem = round(bitsPerElem).int
  else: # Use specified k if possible
    if forceNBitsPerElem < 1: # Use lookup table
      let mOverNRes = getMOverNBitsForK(k = k, targetError = errorRate)
      if mOverNRes.isErr:
        return err(mOverNRes.error)
      nBitsPerElem = mOverNRes.value
    else:
      nBitsPerElem = forceNBitsPerElem
    kHashes = k

  # Reject sizes whose bit count cannot be represented, instead of letting
  # the multiplication below raise an OverflowDefect.
  if nBitsPerElem > 0 and capacity > high(int) div nBitsPerElem:
    return err("Capacity times bits per element does not fit in an int.")

  let
    mBits = capacity * nBitsPerElem
    # One int holds sizeof(int) * 8 bits; the extra int covers the remainder.
    mInts = 1 + mBits div (sizeof(int) * 8)

  return ok(
    BloomFilter.init(
      capacity = capacity,
      errorRate = errorRate,
      kHashes = kHashes,
      mBits = mBits,
      intArray = newSeq[int](mInts),
    )
  )

proc `$`*(bf: BloomFilter): string =
  ## Renders the configuration of the Bloom filter as one sentence:
  ## capacity, error rate (scientific notation, precision 1), number of
  ## hash functions, and `mBits div capacity`, i.e. the bits per expected
  ## element (reported as 0 when the capacity is 0).
  # Guard the division so stringifying a zero-capacity filter cannot raise
  # DivByZeroDefect.
  let bitsPerElem =
    if bf.capacity == 0:
      0
    else:
      bf.mBits div bf.capacity
  return
    "Bloom filter with $1 capacity, $2 error rate, $3 hash functions, and requiring $4 bits of memory." %
    [
      $bf.capacity,
      formatFloat(bf.errorRate, format = ffScientific, precision = 1),
      $bf.kHashes,
      $bitsPerElem,
    ]

proc computeHashes(bf: BloomFilter, item: string): seq[int] =
  ## Returns `bf.kHashes` bit positions for `item`: entry `n` is
  ## `hashN(item, n, bf.mBits)`.
  var positions = newSeq[int](bf.kHashes)
  for n, slot in positions.mpairs:
    slot = hashN(item, n, bf.mBits)
  return positions

proc insert*(bf: var BloomFilter, item: string) =
  ## Adds `item` (a string) to the Bloom filter by setting, in
  ## `bf.intArray`, the bit at every position computed for the item.
  const wordBits = sizeof(int) * 8 # bits stored per element of intArray
  for bitPos in bf.computeHashes(item):
    let
      word = bitPos div wordBits # which int holds the bit
      mask = 1 shl (bitPos mod wordBits) # the bit within that int
    bf.intArray[word] = bf.intArray[word] or mask

proc lookup*(bf: BloomFilter, item: string): bool =
  ## Tests whether `item` (a string) may be in the Bloom filter.
  ## An item that was inserted always yields ``true``.
  ## An item that was not inserted yields ``false``
  ## with a probability 1 - ``bf.errorRate``.
  const wordBits = sizeof(int) * 8 # bits stored per element of intArray
  for bitPos in bf.computeHashes(item):
    let mask = 1 shl (bitPos mod wordBits)
    # A single cleared bit proves the item was never inserted.
    if (bf.intArray[bitPos div wordBits] and mask) == 0:
      return false
  return true
feat: add bloom filter (#3) 2025-01-13 13:49:28 +04:00			`from math import ceil, ln, pow, round`
			`import hashes`
			`import strutils`
			`import results`
			`import private/probabilities`
generic refactor to make the code more aligned to logos-delivery style (#62) * generic refactor to make the code more aligned to logos-delivery style * use explicit return statement 2026-04-24 09:50:18 +02:00			`import ./types/bloom_filter`
			`export bloom_filter`
feat: add bloom filter (#3) 2025-01-13 13:49:28 +04:00
feat: add rolling bloom filter, reliability utils and protobuf (#4) 2025-02-11 13:23:19 +05:30			`{.push overflowChecks: off.} # Turn off overflow checks for hashing operations`
feat: add bloom filter (#3) 2025-01-13 13:49:28 +04:00
			`proc hashN(item: string, n: int, maxValue: int): int =`
			`## Get the nth hash using Nim's built-in hash function using`
			`## the double hashing technique from Kirsch and Mitzenmacher, 2008:`
			`## http://www.eecs.harvard.edu/~kirsch/pubs/bbbf/rsa.pdf`
			`let`
feat: add rolling bloom filter, reliability utils and protobuf (#4) 2025-02-11 13:23:19 +05:30			`hashA = abs(hash(item)) mod maxValue # Use abs to handle negative hashes`
feat: add bloom filter (#3) 2025-01-13 13:49:28 +04:00			`hashB = abs(hash(item & " b")) mod maxValue # string concatenation`
generic refactor to make the code more aligned to logos-delivery style (#62) * generic refactor to make the code more aligned to logos-delivery style * use explicit return statement 2026-04-24 09:50:18 +02:00			`return abs((hashA + n * hashB)) mod maxValue`
feat: add bloom filter (#3) 2025-01-13 13:49:28 +04:00
			`{.pop.}`

feat: add rolling bloom filter, reliability utils and protobuf (#4) 2025-02-11 13:23:19 +05:30			`proc getMOverNBitsForK*(`
			`k: int, targetError: float, probabilityTable = kErrors`
			`): Result[int, string] =`
feat: add bloom filter (#3) 2025-01-13 13:49:28 +04:00			`## Returns the optimal number of m/n bits for a given k.`
feat: add rolling bloom filter, reliability utils and protobuf (#4) 2025-02-11 13:23:19 +05:30			`if k notin 0 .. 12:`
feat: add bloom filter (#3) 2025-01-13 13:49:28 +04:00			`return err("K must be <= 12 if forceNBitsPerElem is not also specified.")`

feat: add rolling bloom filter, reliability utils and protobuf (#4) 2025-02-11 13:23:19 +05:30			`for mOverN in 2 .. probabilityTable[k].high:`
feat: add bloom filter (#3) 2025-01-13 13:49:28 +04:00			`if probabilityTable[k][mOverN] < targetError:`
			`return ok(mOverN)`

generic refactor to make the code more aligned to logos-delivery style (#62) * generic refactor to make the code more aligned to logos-delivery style * use explicit return statement 2026-04-24 09:50:18 +02:00			`return err(`
feat: add rolling bloom filter, reliability utils and protobuf (#4) 2025-02-11 13:23:19 +05:30			`"Specified value of k and error rate not achievable using less than 4 bytes / element."`
			`)`
feat: add bloom filter (#3) 2025-01-13 13:49:28 +04:00
feat: add rolling bloom filter, reliability utils and protobuf (#4) 2025-02-11 13:23:19 +05:30			`proc initializeBloomFilter*(`
			`capacity: int, errorRate: float, k = 0, forceNBitsPerElem = 0`
			`): Result[BloomFilter, string] =`
feat: add bloom filter (#3) 2025-01-13 13:49:28 +04:00			`## Initializes a Bloom filter with specified parameters.`
			`##`
			`## Parameters:`
			`## - capacity: Expected number of elements to be inserted`
			`## - errorRate: Desired false positive rate (e.g., 0.01 for 1%)`
			`## - k: Optional number of hash functions. If 0, calculated optimally`
			`## See http://pages.cs.wisc.edu/~cao/papers/summary-cache/node8.html for`
			`## useful tables on k and m/n (n bits per element) combinations.`
			`## - forceNBitsPerElem: Optional override for bits per element`
			`var`
			`kHashes: int`
			`nBitsPerElem: int`

			`if k < 1: # Calculate optimal k and use that`
			`let bitsPerElem = ceil(-1.0 * (ln(errorRate) / (pow(ln(2.float), 2))))`
			`kHashes = round(ln(2.float) * bitsPerElem).int`
			`nBitsPerElem = round(bitsPerElem).int`
			`else: # Use specified k if possible`
			`if forceNBitsPerElem < 1: # Use lookup table`
			`let mOverNRes = getMOverNBitsForK(k = k, targetError = errorRate)`
			`if mOverNRes.isErr:`
			`return err(mOverNRes.error)`
			`nBitsPerElem = mOverNRes.value`
			`else:`
			`nBitsPerElem = forceNBitsPerElem`
			`kHashes = k`

			`let`
			`mBits = capacity * nBitsPerElem`
			`mInts = 1 + mBits div (sizeof(int) * 8)`

generic refactor to make the code more aligned to logos-delivery style (#62) * generic refactor to make the code more aligned to logos-delivery style * use explicit return statement 2026-04-24 09:50:18 +02:00			`return ok(`
			`BloomFilter.init(`
			`capacity = capacity,`
			`errorRate = errorRate,`
			`kHashes = kHashes,`
			`mBits = mBits,`
			`intArray = newSeq[int](mInts),`
feat: add rolling bloom filter, reliability utils and protobuf (#4) 2025-02-11 13:23:19 +05:30			`)`
			`)`
feat: add bloom filter (#3) 2025-01-13 13:49:28 +04:00
			proc `$`*(bf: BloomFilter): string =
			`## Prints the configuration of the Bloom filter.`
feat: make Persistence interface async (#69) * feat: make Persistence interface async The 14 Persistence proc fields now return Future[...] with {.async: (raises: []), gcsafe.}, allowing real I/O backends (SQLite, encrypted file, network) to suspend rather than block the Chronos event loop the manager runs on. Propagates through: - ReliabilityManager.lock: system.Lock -> chronos.AsyncLock. Acquired across awaits cleanly; matches the single-threaded Chronos worker the FFI uses. Multi-OS-thread use is now explicitly the caller's responsibility. - sds_utils + sds.nim public API procs (wrapOutgoingMessage, unwrapReceivedMessage, markDependenciesMet, setCallbacks, resetReliabilityManager, cleanup, ensureChannel, removeChannel, the getter snapshots, etc.) are now async. - FFI request handlers in library/sds_thread/... await the new API. - Tests converted via an asyncTest template that wraps each test body in an async proc; setup/teardown use waitFor for their single async call (ensureChannel / cleanup). Lock scope is preserved exactly: the same call sites that held the kernel Lock today hold AsyncLock now -- no new locking added. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com> * refactor: drop asyncSpawn, add asyncSetup/asyncTeardown Three asyncSpawn usages removed: - sds.nim startPeriodicTasks: stored the periodic-task futures on ReliabilityManager (new field `periodicTasks: seq[FutureBase]`) so cleanup can cancel them on shutdown instead of leaking the loops against a cleared manager. - library/sds_thread/sds_thread.nim: fireSync moved BEFORE processing, then `await SdsThreadRequest.process(...)` instead of asyncSpawn'ing it. Aligns the worker with the SP-channel + lock assumption that there are no concurrent requests; caller throughput is unchanged because the caller only waits for receipt (fireSync), not processing. - tests TestBus repair callback: replaced asyncSpawn(deliverExcept...) with an explicit pending-delivery queue drained by `bus.drain()`. 
Integration tests no longer rely on `sleepAsync(10ms)` to let spawned deliveries finish — they await drain instead. Tests also pick up an asyncSetup/asyncTeardown pair (tests/async_unittest.nim) so suite fixtures can `await` directly. All `waitFor` in setup/teardown blocks is gone; only the top-level asyncTest wrapper still uses waitFor (once, to drive the async proc to completion). Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com> * Correctly propagate error hidden by new async move * Correctly handle future cancellation exceptions, +some housekeeping * Apply suggestion from @Ivansete-status Co-authored-by: Ivan FB <128452529+Ivansete-status@users.noreply.github.com> * Stylistics, async default implication addressed, nph style run * Remove leaking CancelledFuture from public facing + as a consequence it is tuneled into handling CatchableError everywhere --------- Co-authored-by: Claude Opus 4.7 <noreply@anthropic.com> Co-authored-by: Ivan FB <128452529+Ivansete-status@users.noreply.github.com> 2026-05-25 22:30:15 +02:00			`return`
			`"Bloom filter with $1 capacity, $2 error rate, $3 hash functions, and requiring $4 bits of memory." %`
generic refactor to make the code more aligned to logos-delivery style (#62) * generic refactor to make the code more aligned to logos-delivery style * use explicit return statement 2026-04-24 09:50:18 +02:00			`[`
			`$bf.capacity,`
			`formatFloat(bf.errorRate, format = ffScientific, precision = 1),`
			`$bf.kHashes,`
			`$(bf.mBits div bf.capacity),`
			`]`
feat: add bloom filter (#3) 2025-01-13 13:49:28 +04:00
			`proc computeHashes(bf: BloomFilter, item: string): seq[int] =`
			`var hashes = newSeq[int](bf.kHashes)`
feat: add rolling bloom filter, reliability utils and protobuf (#4) 2025-02-11 13:23:19 +05:30			`for i in 0 ..< bf.kHashes:`
feat: add bloom filter (#3) 2025-01-13 13:49:28 +04:00			`hashes[i] = hashN(item, i, bf.mBits)`
generic refactor to make the code more aligned to logos-delivery style (#62) * generic refactor to make the code more aligned to logos-delivery style * use explicit return statement 2026-04-24 09:50:18 +02:00			`return hashes`
feat: add bloom filter (#3) 2025-01-13 13:49:28 +04:00
			`proc insert*(bf: var BloomFilter, item: string) =`
			`## Insert an item (string) into the Bloom filter.`
			`let hashSet = bf.computeHashes(item)`
			`for h in hashSet:`
			`let`
			`intAddress = h div (sizeof(int) * 8)`
			`bitOffset = h mod (sizeof(int) * 8)`
			`bf.intArray[intAddress] = bf.intArray[intAddress] or (1 shl bitOffset)`

			`proc lookup*(bf: BloomFilter, item: string): bool =`
			`## Lookup an item (string) in the Bloom filter.`
			## If the item is present, ``lookup`` is guaranteed to return ``true``.
			## If the item is not present, ``lookup`` will return ``false``
			## with a probability 1 - ``bf.errorRate``.
			`let hashSet = bf.computeHashes(item)`
			`for h in hashSet:`
			`let`
			`intAddress = h div (sizeof(int) * 8)`
			`bitOffset = h mod (sizeof(int) * 8)`
			`currentInt = bf.intArray[intAddress]`
			`if currentInt != (currentInt or (1 shl bitOffset)):`
			`return false`
generic refactor to make the code more aligned to logos-delivery style (#62) * generic refactor to make the code more aligned to logos-delivery style * use explicit return statement 2026-04-24 09:50:18 +02:00			`return true`