From 8006af303ccc77b6a36640ee65a2401de9a91631 Mon Sep 17 00:00:00 2001
From: shash256 <111925100+shash256@users.noreply.github.com>
Date: Sat, 9 Nov 2024 11:16:24 +0530
Subject: [PATCH] feat: std vs custom hash selection, add more tests

chore: update readme and some fixes from review
---
 .gitignore               |   2 +
 nim-bloom/.gitignore     |   3 +-
 nim-bloom/README.md      | 106 +++++++++++++++------
 nim-bloom/src/bloom.nim  | 197 +++++++++------------------------------
 nim-bloom/tests/test.nim | 173 ++++++++++++++++++++++++++++------
 5 files changed, 274 insertions(+), 207 deletions(-)
 create mode 100644 .gitignore

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..9bea433
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+
+.DS_Store
diff --git a/nim-bloom/.gitignore b/nim-bloom/.gitignore
index 8d67039..a83eef1 100644
--- a/nim-bloom/.gitignore
+++ b/nim-bloom/.gitignore
@@ -4,4 +4,5 @@ tests/test
 bloom
 *.html
 *.css
-/.DS_Store
\ No newline at end of file
+.DS_Store
+src/.DS_Store
\ No newline at end of file
diff --git a/nim-bloom/README.md b/nim-bloom/README.md
index 339228a..a62c04d 100644
--- a/nim-bloom/README.md
+++ b/nim-bloom/README.md
@@ -1,41 +1,95 @@
-nim-bloom
-============
+# nim-bloom
 
-Bloom filter implementation in Nim. Uses a C implementation of MurmurHash3 for optimal speed and numeric distribution.
+A high-performance Bloom filter implementation in Nim. Supports both Nim's built-in MurmurHash2 (default) and an optional 128-bit MurmurHash3 implementation for large-scale use cases.
 
-On a 10 year old Macbook Pro Retina the test case for 10M insertions executes in ~4.0 seconds and 10M lookups in ~3.5 seconds for a Bloom filter with a 1 in 1000 error rate (0.001). This is ~2.5M insertions/sec and ~2.9M lookups/sec on a single thread (but passing the `-d:release` flag to the Nim compiler and thus activating the C compiler's optimizations). If k is lowered to 5 or 6 vs. a larger "optimal" number, performance further increases to ~4M ops/sec. Note that this test is for a Bloom filter ~20-25MB in size and thus accurately reflects the cost of main memory accesses (vs. a smaller filter that might fit solely in L3 cache, for example, and can achieve several million additional ops/sec).
+## Features
 
+- Fast string element insertion and lookup
+- Configurable error rates
+- Choice between standard Nim hash (MurmurHash2) and extended 128-bit MurmurHash3
+- Optimized for both small and large-scale use cases
+- Comprehensive test suite
 
-Currently supports inserting and looking up string elements. Forthcoming features include:
-* Support for other types beyond strings
-* Support for iterables in the insert method
-* Persistence
+## Performance
 
+Historical benchmark using the MurmurHash3 implementation on a 10-year-old MacBook Pro Retina:
+- ~2.5M insertions/sec (~4.0 seconds for 10M insertions)
+- ~2.9M lookups/sec (~3.5 seconds for 10M lookups)
+- Test configuration: 0.001 error rate, Bloom filter size ~20-25MB
+- Compiled with `-d:release` flag
 
-quickstart
-====
-Quick functionality demo:
-```
+These numbers reflect performance outside of CPU cache, as the filter size was intentionally larger than L3 cache. Performance can be several million operations/sec higher with smaller filters that fit in cache.
+
+Current performance will vary based on:
+- Choice of hash function (standard Nim hash vs extended MurmurHash3)
+- Hardware specifications
+- Data size and memory access patterns
+- Compiler optimizations
+
+The default configuration (using Nim's built-in hash) is optimized for typical use cases, while the extended hash option (MurmurHash3) provides better collision resistance for large-scale applications at a slight performance cost.
+
+## Quickstart
+
+Basic usage:
+```nim
 import bloom
+
+# Initialize with default hash (suitable for most uses)
 var bf = initializeBloomFilter(capacity = 10000, errorRate = 0.001)
-echo bf # Get characteristics of the Bloom filter
-echo bf.lookup("An element not in the Bloom filter")  # Prints 'false'
-bf.insert("Here we go...")
-assert(bf.lookup("Here we go..."))
+echo bf  # Print Bloom filter characteristics
+echo bf.lookup("test")  # false
+bf.insert("test")
+assert bf.lookup("test")  # true
+
+# For large-scale usage (>1M elements), consider using extended hash
+var largeBf = initializeBloomFilter(
+  capacity = 2_000_000,
+  errorRate = 0.001,
+  useExtendedHash = true
+)
 ```
 
+## Advanced Configuration
 
-By default, the Bloom filter will use a mathematically optimal number of k hash functions, which minimizes the amount of error per bit of storage required. In many cases, however, it may be advantageous to specify a smaller value of k in order to save time hashing. This is supported by passing an explicit `k` parameter, which will then either create an optimal Bloom filter for the specified error rate.[1]
+The Bloom filter can be configured in several ways:
 
-[1] If `k` <= 12 and the number of required bytes per element is <= 4. If either of these conditions doesn't hold, a fully manual Bloom filter can be constructed by passing both `k` and `force_n_bits_per_elem`.
-
-Example:
+1. Default initialization (automatically calculates optimal parameters):
+```nim
+var bf = initializeBloomFilter(capacity = 10000, errorRate = 0.001)
 ```
-var bf2 = initializeBloomFilter(capacity = 10000, errorRate = 0.001, k = 5)
-assert bf2.kHashes == 5
-assert bf2.nBitsPerElem == 18
 
-var bf3 = initializeBloomFilter(capacity = 10000, errorRate = 0.001, k = 5, forceNBitsPerElem = 12)
-assert bf3.kHashes == 5
-assert bf3.nBitsPerElem == 12   # But note, however, that bf.errorRate will *not* be correct
+2. Specify custom number of hash functions:
+```nim
+var bf = initializeBloomFilter(
+  capacity = 10000,
+  errorRate = 0.001,
+  k = 5  # Use 5 hash functions instead of calculated optimal
+)
 ```
+
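+3. Fully manual configuration (note: the stored `errorRate` is kept exactly as passed, so it may not reflect the filter's real error rate when `forceNBitsPerElem` is set):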
+```nim
+var bf = initializeBloomFilter(
+  capacity = 10000,
+  errorRate = 0.001,
+  k = 5,
+  forceNBitsPerElem = 12,
+  useExtendedHash = false  # Use standard hash (default)
+)
+```
+
+Note: When specifying `k`, it must be ≤ 12 unless `forceNBitsPerElem` is also specified. Otherwise a `BloomFilterError` is raised — for example when `k` exceeds 12, or when the requested `errorRate` cannot be reached with the given `k`.
+
+## Hash Function Selection
+
+- Default: Uses Nim's built-in hash (MurmurHash2), suitable for most use cases
+- Extended: Uses 128-bit MurmurHash3, better for large sets (>1M elements) where collision resistance is critical
+
+Choose extended hash by setting `useExtendedHash = true` during initialization.
+
+## Testing
+
+Run the test suite:
+```bash
+nimble test
+```
\ No newline at end of file
diff --git a/nim-bloom/src/bloom.nim b/nim-bloom/src/bloom.nim
index 333ea7a..b9ae81e 100644
--- a/nim-bloom/src/bloom.nim
+++ b/nim-bloom/src/bloom.nim
@@ -3,11 +3,11 @@ import hashes
 import strutils
 import private/probabilities
 
-# Import MurmurHash3 code and compile at the same time as Nim code
+# Import MurmurHash3 code for large-scale use cases
 {.compile: "murmur3.c".}
 
 type
-  BloomFilterError = object of CatchableError
+  BloomFilterError* = object of CatchableError
   MurmurHashes = array[0..1, int]
   BloomFilter* = object
     capacity*: int
@@ -15,28 +15,27 @@ type
     kHashes*: int
     mBits*: int
     intArray: seq[int]
-    nBitsPerElem*: int
-    useMurmurHash*: bool
+    useExtendedHash*: bool  # Use 128-bit MurmurHash3 for very large filters
+
+{.push overflowChecks: off.}  # Turn off overflow checks for hashing operations
 
 proc rawMurmurHash(key: cstring, len: int, seed: uint32,
                      outHashes: var MurmurHashes): void {.
   importc: "MurmurHash3_x64_128".}
 
 proc murmurHash(key: string, seed = 0'u32): MurmurHashes =
-  rawMurmurHash(key, key.len, seed, outHashes = result)
-
-proc hashA(item: string, maxValue: int): int =
-  hash(item) mod maxValue
-
-proc hashB(item: string, maxValue: int): int =
-  hash(item & " b") mod maxValue
+  rawMurmurHash(key, key.len, seed, result)
 
 proc hashN(item: string, n: int, maxValue: int): int =
-  ## Get the nth hash of a string using the formula hashA + n * hashB
-  ## which uses 2 hash functions vs. k and has comparable properties
-  ## See Kirsch and Mitzenmacher, 2008:
+  ## Get the nth hash of a string from Nim's built-in hash function via the
+  ## double hashing technique (hashA + n * hashB) of Kirsch and Mitzenmacher, 2008:
   ## http://www.eecs.harvard.edu/~kirsch/pubs/bbbf/rsa.pdf
-  abs((hashA(item, maxValue) + n * hashB(item, maxValue))) mod maxValue
+  let
+    hashA = abs(hash(item)) mod maxValue  # Use abs to handle negative hashes
+    hashB = abs(hash(item & " b")) mod maxValue  # Second hash: same item with a " b" suffix
+  abs((hashA + n * hashB)) mod maxValue
+
+{.pop.}  # Restore overflow checks
 
 proc getMOverNBitsForK(k: int, targetError: float,
     probabilityTable = kErrors): int =
@@ -54,28 +53,19 @@ proc getMOverNBitsForK(k: int, targetError: float,
 
 proc initializeBloomFilter*(capacity: int, errorRate: float, k = 0,
                               forceNBitsPerElem = 0,
-                              useMurmurHash = true): BloomFilter =
+                              useExtendedHash = false): BloomFilter =
   ## Initializes a Bloom filter, using a specified ``capacity``,
   ## ``errorRate``, and – optionally – specific number of k hash functions.
   ## If ``kHashes`` is < 1 (default argument is 0), ``kHashes`` will be
-  ## optimally calculated on the fly. Otherwise, ``kHashes`` will be set to
-  ## the passed integer, which requires that ``forceNBitsPerElem`` is
-  ## also set to be greater than 0. Otherwise a ``BloomFilterError``
-  ## exception is raised.
-  ## See http://pages.cs.wisc.edu/~cao/papers/summary-cache/node8.html for
-  ## useful tables on k and m/n (n bits per element) combinations.
-  ##
-  ## The Bloom filter uses the MurmurHash3 implementation by default,
-  ## though it can fall back to using the built-in nim ``hash`` function
-  ## if ``useMurmurHash = false``. This is compiled alongside the Nim
-  ## code using the ``{.compile.}`` pragma.
+  ## optimally calculated. If capacity > 1M elements, consider setting
+  ## useExtendedHash = true to use 128-bit MurmurHash3 for better 
+  ## collision resistance.
   var
     kHashes: int
-    bitsPerElem: float
     nBitsPerElem: int
 
   if k < 1: # Calculate optimal k and use that
-    bitsPerElem = ceil(-1.0 * (ln(errorRate) / (pow(ln(2.float), 2))))
+    let bitsPerElem = ceil(-1.0 * (ln(errorRate) / (pow(ln(2.float), 2))))
     kHashes = round(ln(2.float) * bitsPerElem).int
     nBitsPerElem = round(bitsPerElem).int
   else: # Use specified k if possible
@@ -89,42 +79,42 @@ proc initializeBloomFilter*(capacity: int, errorRate: float, k = 0,
     mBits = capacity * nBitsPerElem
     mInts = 1 + mBits div (sizeof(int) * 8)
 
-  BloomFilter(capacity: capacity, errorRate: errorRate, kHashes: kHashes,
-    mBits: mBits, intArray: newSeq[int](mInts), nBitsPerElem: nBitsPerElem,
-    useMurmurHash: useMurmurHash)
+  BloomFilter(
+    capacity: capacity,
+    errorRate: errorRate,
+    kHashes: kHashes,
+    mBits: mBits,
+    intArray: newSeq[int](mInts),
+    useExtendedHash: useExtendedHash
+  )
 
 proc `$`*(bf: BloomFilter): string =
   ## Prints the capacity, set error rate, number of k hash functions,
   ## and total bits of memory allocated by the Bloom filter.
-  "Bloom filter with $1 capacity, $2 error rate, $3 hash functions, and requiring $4 bits per stored element." %
+  "Bloom filter with $1 capacity, $2 error rate, $3 hash functions, and requiring $4 bits of memory." %
     [$bf.capacity,
      formatFloat(bf.errorRate, format = ffScientific, precision = 1),
-     $bf.kHashes, $bf.nBitsPerElem]
+     $bf.kHashes,
+     $bf.mBits]  # Total bits allocated, as the message states; also avoids dividing by a zero capacity
 
-{.push overflowChecks: off.}
+{.push overflowChecks: off.}  # Turn off overflow checks for hash computations
 
-proc hashMurmur(bf: BloomFilter, key: string): seq[int] =
-  result.newSeq(bf.kHashes)
-  let murmurHashes = murmurHash(key, seed = 0'u32)
-  for i in 0..<bf.kHashes:
-    result[i] = abs(murmurHashes[0] + i * murmurHashes[1]) mod bf.mBits
-
-{.pop.}
-
-proc hashNim(bf: BloomFilter, key: string): seq[int] =
-  result.newSeq(bf.kHashes)
-  for i in 0..<bf.kHashes:
-    result[i] = hashN(key, i, bf.mBits)
-
-proc hash(bf: BloomFilter, key: string): seq[int] =
-  if bf.useMurmurHash:
-    bf.hashMurmur(key)
+proc computeHashes(bf: BloomFilter, item: string): seq[int] =
+  var hashes = newSeq[int](bf.kHashes)  # One bit index per hash function
+  if bf.useExtendedHash:
+    let murmurHashes = murmurHash(item, 0'u32)  # Two hash words from MurmurHash3, fixed seed 0
+    for i in 0..<bf.kHashes:  # Double hashing: word 0 + i * word 1, taken modulo mBits
+      hashes[i] = abs((murmurHashes[0] + i.int64 * murmurHashes[1].int64).int) mod bf.mBits
   else:
-    bf.hashNim(key)
+    for i in 0..<bf.kHashes:  # Same scheme, built on Nim's own hash via hashN
+      hashes[i] = hashN(item, i, bf.mBits)
+  hashes
+
+{.pop.}  # Restore overflow checks
 
 proc insert*(bf: var BloomFilter, item: string) =
   ## Insert an item (string) into the Bloom filter.
-  var hashSet = bf.hash(item)
+  let hashSet = bf.computeHashes(item)
   for h in hashSet:
     let
       intAddress = h div (sizeof(int) * 8)
@@ -136,7 +126,7 @@ proc lookup*(bf: BloomFilter, item: string): bool =
   ## If the item is present, ``lookup`` is guaranteed to return ``true``.
   ## If the item is not present, ``lookup`` will return ``false``
   ## with a probability 1 - ``bf.errorRate``.
-  var hashSet = bf.hash(item)
+  let hashSet = bf.computeHashes(item)
   for h in hashSet:
     let
       intAddress = h div (sizeof(int) * 8)
@@ -144,101 +134,4 @@ proc lookup*(bf: BloomFilter, item: string): bool =
       currentInt = bf.intArray[intAddress]
     if currentInt != (currentInt or (1 shl bitOffset)):
       return false
-  return true
-
-when isMainModule:
-  from random import rand, randomize
-  import times
-
-  # Test murmurhash 3
-  echo("Testing MurmurHash3 code...")
-  var hashOutputs: MurmurHashes
-  hashOutputs = [0, 0]
-  rawMurmurHash("hello", 5, 0, hashOutputs)
-  assert int(hashOutputs[0]) == -3758069500696749310 # Correct murmur outputs (cast to int64)
-  assert int(hashOutputs[1]) == 6565844092913065241
-
-  let hashOutputs2 = murmurHash("hello", 0)
-  assert hashOutputs2[0] == hashOutputs[0]
-  assert hashOutputs2[1] == hashOutputs[1]
-  let hashOutputs3 = murmurHash("hello", 10)
-  assert hashOutputs3[0] != hashOutputs[0]
-  assert hashOutputs3[1] != hashOutputs[1]
-
-  # Some quick and dirty tests (not complete)
-  var nElementsToTest = 100000
-  var bf = initializeBloomFilter(nElementsToTest, 0.001)
-  assert(bf of BloomFilter)
-  echo(bf)
-
-  var bf2 = initializeBloomFilter(10000, 0.001, k = 4,
-      forceNBitsPerElem = 20)
-  assert(bf2 of BloomFilter)
-  echo(bf2)
-
-  echo("Testing insertions and lookups...")
-  echo("Test element in BF2?: ", bf2.lookup("testing"))
-  echo("Inserting element.")
-  bf2.insert("testing")
-  echo("Test element in BF2?: ", bf2.lookup("testing"))
-  assert(bf2.lookup("testing"))
-
-  # Now test for speed with bf
-  randomize(2882) # Seed the RNG
-  var
-    sampleChars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
-    kTestElements, sampleLetters: seq[string]
-  kTestElements = newSeq[string](nElementsToTest)
-  sampleLetters = newSeq[string](62)
-
-  for i in 0..(nElementsToTest - 1):
-    var newString = ""
-    for j in 0..7:
-      newString.add(sampleChars[rand(51)])
-    kTestElements[i] = newString
-
-  var startTime, endTime: float
-  startTime = cpuTime()
-  for i in 0..(nElementsToTest - 1):
-    bf.insert(kTestElements[i])
-  endTime = cpuTime()
-  echo("Took ", formatFloat(endTime - startTime, format = ffDecimal,
-      precision = 4), " seconds to insert ", nElementsToTest, " items.")
-
-  var falsePositives = 0
-  for i in 0..(nElementsToTest - 1):
-    var falsePositiveString = ""
-    for j in 0..8: # By definition not in bf as 9 chars not 8
-      falsePositiveString.add(sampleChars[rand(51)])
-    if bf.lookup(falsePositiveString):
-      falsePositives += 1
-
-  echo("N false positives (of ", nElementsToTest, " lookups): ", falsePositives)
-  echo("False positive rate ", formatFloat(falsePositives / nElementsToTest,
-      format = ffDecimal, precision = 4))
-
-  var lookupErrors = 0
-  startTime = cpuTime()
-  for i in 0..(nElementsToTest - 1):
-    if not bf.lookup(kTestElements[i]):
-      lookupErrors += 1
-  endTime = cpuTime()
-  echo("Took ", formatFloat(endTime - startTime, format = ffDecimal,
-      precision = 4), " seconds to lookup ", nElementsToTest, " items.")
-
-  echo("N lookup errors (should be 0): ", lookupErrors)
-
-  # Finally test correct k / mOverN specification,
-  # first case raises an error, second works
-  try:
-    discard getMOverNBitsForK(k = 2, targetError = 0.00001)
-    assert false
-  except BloomFilterError:
-    assert true
-
-  assert getMOverNBitsForK(k = 2, targetError = 0.1) == 6
-  assert getMOverNBitsForK(k = 7, targetError = 0.01) == 10
-  assert getMOverNBitsForK(k = 7, targetError = 0.001) == 16
-
-  var bf3 = initializeBloomFilter(1000, 0.01, k = 4)
-  assert bf3.nBitsPerElem == 11
+  true
\ No newline at end of file
diff --git a/nim-bloom/tests/test.nim b/nim-bloom/tests/test.nim
index 53c7e35..76bdecc 100644
--- a/nim-bloom/tests/test.nim
+++ b/nim-bloom/tests/test.nim
@@ -4,7 +4,7 @@ from random import rand, randomize
 import times
 
 suite "murmur":
-  # Test murmurhash 3
+  # Test murmurhash 3 when enabled
   setup:
     var hashOutputs: MurmurHashes
     hashOutputs = [0, 0]
@@ -24,18 +24,85 @@ suite "murmur":
     check hashOutputs3[0] != hashOutputs[0]
     check hashOutputs3[1] != hashOutputs[1]
 
+suite "hashing comparison":
+  test "hash distribution":
+    const testSize = 10000
+    var standardCollisions = 0
+    var extendedCollisions = 0
+    
+    var bfStandard = initializeBloomFilter(testSize, 0.01, useExtendedHash = false)
+    var bfExtended = initializeBloomFilter(testSize, 0.01, useExtendedHash = true)
+    
+    # Generate test data
+    var testData = newSeq[string](testSize)
+    for i in 0..<testSize:
+      testData[i] = $i & "salt" & $rand(1000000)
+    
+    # Test standard hash
+    var startTime = cpuTime()
+    for item in testData:
+      bfStandard.insert(item)
+    let standardTime = cpuTime() - startTime
+    
+    # Test extended hash
+    startTime = cpuTime()
+    for item in testData:
+      bfExtended.insert(item)
+    let extendedTime = cpuTime() - startTime
+    
+    echo "Standard hash time: ", standardTime
+    echo "Extended hash time: ", extendedTime
+
+  test "hash implementation switch":
+    # Checks that both hash implementations find every inserted item and
+    # show comparable false positive rates on items that were never
+    # inserted.
+    var standardBf = initializeBloomFilter(1000, 0.01, useExtendedHash = false)
+    var murmurBf = initializeBloomFilter(1000, 0.01, useExtendedHash = true)
+
+    # Insert the same elements into both filters. The filters are mutated
+    # directly (not via per-iteration copies) so the sampling further down
+    # runs against populated filters.
+    let testData = ["test1", "test2", "test3", "test4", "test5"]
+    for item in testData:
+      standardBf.insert(item)
+      murmurBf.insert(item)
+
+      # Verify both can find their items
+      check standardBf.lookup(item)
+      check murmurBf.lookup(item)
+
+    # Sample the false positive rate of both filters with probe strings
+    # that differ from every inserted element. 0..<nProbes performs exactly
+    # nProbes lookups, matching the divisor used for the rates below.
+    const nProbes = 1000
+    var fpCountStd = 0
+    var fpCountMur = 0
+    for i in 0..<nProbes:
+      let testItem = "test-" & $i
+      if standardBf.lookup(testItem): fpCountStd += 1
+      if murmurBf.lookup(testItem): fpCountMur += 1
+
+    # Both should have similar false positive rates within reasonable bounds
+    let fpRateStd = fpCountStd.float / nProbes.float
+    let fpRateMur = fpCountMur.float / nProbes.float
+
+    check abs(fpRateStd - fpRateMur) < 0.01  # Should be reasonably close
+    check fpRateStd < standardBf.errorRate * 1.5  # Should not exceed target error rate by too much
+    check fpRateMur < murmurBf.errorRate * 1.5
+
+    # Report the measured rates for manual inspection
+    echo "Standard hash false positive rate: ", fpRateStd
+    echo "Murmur hash false positive rate: ", fpRateMur
 
 suite "bloom":
-
   setup:
     let nElementsToTest = 100000
     var bf = initializeBloomFilter(capacity = nElementsToTest, errorRate = 0.001)
     randomize(2882) # Seed the RNG
     var
       sampleChars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
-      kTestElements, sampleLetters: seq[string]
-    kTestElements = newSeq[string](nElementsToTest)
-    sampleLetters = newSeq[string](62)
+      kTestElements = newSeq[string](nElementsToTest)
 
     for i in 0..<nElementsToTest:
       var newString = ""
@@ -46,51 +113,83 @@ suite "bloom":
     for i in 0..<nElementsToTest:
       bf.insert(kTestElements[i])
 
-  test "params":
+  test "init parameters":
     check(bf.capacity == nElementsToTest)
     check(bf.errorRate == 0.001)
     check(bf.kHashes == 10)
-    check(bf.nBitsPerElem == 15)
     check(bf.mBits == 15 * nElementsToTest)
-    check(bf.useMurmurHash == true)
 
-  test "not hit":
+  test "hash mode selection":
+    let bf1 = initializeBloomFilter(100, 0.01)
+    check(bf1.useExtendedHash == false)
+    
+    let bf2 = initializeBloomFilter(100, 0.01, useExtendedHash = true)
+    check(bf2.useExtendedHash == true)
+
+  test "basic operations":
+    # Test empty lookup
     check(bf.lookup("nothing") == false)
+    
+    # Test insert and lookup
+    bf.insert("teststring")
+    check(bf.lookup("teststring") == true)
+    
+    # Test multiple inserts
+    bf.insert("test1")
+    bf.insert("test2")
+    check(bf.lookup("test1") == true)
+    check(bf.lookup("test2") == true)
+    check(bf.lookup("test3") == false)
 
-  test "hit":
-    bf.insert("hit")
-    check(bf.lookup("hit") == true)
-
-  test "force params":
-    var bf2 = initializeBloomFilter(10000, 0.001, k = 4, forceNBitsPerElem = 20)
-    check(bf2.capacity == 10000)
-    check(bf2.errorRate == 0.001)
-    check(bf2.kHashes == 4)
-    check(bf2.nBitsPerElem == 20)
-    check(bf2.mBits == 200000)
-    check(bf2.useMurmurHash == true)
+  test "large scale performance":
+    let largeSize = 1_000_000
+    var standardBf = initializeBloomFilter(largeSize, 0.001, useExtendedHash = false)
+    var extendedBf = initializeBloomFilter(largeSize, 0.001, useExtendedHash = true)
+    
+    var largeData = newSeq[string](1000)  # Only 1000 items are inserted into the large-capacity filters
+    for i in 0..<1000:
+      largeData[i] = $i & "test" & $rand(1000000)
+    
+    # Insert the same data into both filters and time each run (no lookups are performed here)
+    var startTime = cpuTime()
+    for item in largeData:
+      standardBf.insert(item)
+    let standardTime = cpuTime() - startTime
+    
+    startTime = cpuTime()
+    for item in largeData:
+      extendedBf.insert(item)
+    let extendedTime = cpuTime() - startTime
+    
+    echo "Standard hash large insert time: ", standardTime
+    echo "Extended hash large insert time: ", extendedTime
 
   test "error rate":
     var falsePositives = 0
     for i in 0..<nElementsToTest:
       var falsePositiveString = ""
-      for j in 0..8: # By definition not in bf as 9 chars not 8
+      for j in 0..8:
         falsePositiveString.add(sampleChars[rand(51)])
       if bf.lookup(falsePositiveString):
         falsePositives += 1
 
-    check falsePositives / nElementsToTest < bf.errorRate
+    let actualErrorRate = falsePositives.float / nElementsToTest.float
+    check actualErrorRate < bf.errorRate
+    echo "Actual error rate: ", actualErrorRate
+    echo "Target error rate: ", bf.errorRate
 
-  test "lookup errors":
+  test "lookup reliability":
     var lookupErrors = 0
+    let startTime = cpuTime()
     for i in 0..<nElementsToTest:
       if not bf.lookup(kTestElements[i]):
         lookupErrors += 1
+    let endTime = cpuTime()
 
     check lookupErrors == 0
+    echo "Lookup time for ", nElementsToTest, " items: ", formatFloat(endTime - startTime, format = ffDecimal, precision = 4), " seconds"
 
-  # Finally test correct k / mOverN specification,
-  test "k/(m/n) spec":
+  test "k/(m/n) specification":
     expect(BloomFilterError):
       discard getMOverNBitsForK(k = 2, targetError = 0.00001)
 
@@ -98,5 +197,23 @@ suite "bloom":
     check getMOverNBitsForK(k = 7, targetError = 0.01) == 10
     check getMOverNBitsForK(k = 7, targetError = 0.001) == 16
 
-    var bf3 = initializeBloomFilter(1000, 0.01, k = 4)
-    check bf3.nBitsPerElem == 11
+  test "force params":
+    var bf2 = initializeBloomFilter(10000, 0.001, k = 4, forceNBitsPerElem = 20)
+    check(bf2.capacity == 10000)
+    check(bf2.errorRate == 0.001)
+    check(bf2.kHashes == 4)
+    check(bf2.mBits == 200000)
+
+  test "init error cases":
+    expect(BloomFilterError):
+      discard initializeBloomFilter(1000, 0.00001, k = 2)
+
+    expect(BloomFilterError):
+      discard initializeBloomFilter(1000, 0.00001, k = 13)
+
+  test "string representation":
+    let bf3 = initializeBloomFilter(1000, 0.01, k = 4)
+    let str = $bf3
+    check str.contains("1000")
+    check str.contains("4 hash functions")
+    check str.contains("1.0e-02")  # 0.01 in scientific notation
\ No newline at end of file