From 8006af303ccc77b6a36640ee65a2401de9a91631 Mon Sep 17 00:00:00 2001 From: shash256 <111925100+shash256@users.noreply.github.com> Date: Sat, 9 Nov 2024 11:16:24 +0530 Subject: [PATCH] feat: std vs custom hash selection, add more tests chore: update readme and some fixes from review --- .gitignore | 2 + nim-bloom/.gitignore | 3 +- nim-bloom/README.md | 106 +++++++++++++++------ nim-bloom/src/bloom.nim | 197 +++++++++------------------------------ nim-bloom/tests/test.nim | 173 ++++++++++++++++++++++++++++------ 5 files changed, 274 insertions(+), 207 deletions(-) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..9bea433 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ + +.DS_Store diff --git a/nim-bloom/.gitignore b/nim-bloom/.gitignore index 8d67039..a83eef1 100644 --- a/nim-bloom/.gitignore +++ b/nim-bloom/.gitignore @@ -4,4 +4,5 @@ tests/test bloom *.html *.css -/.DS_Store \ No newline at end of file +.DS_Store +src/.DS_Store \ No newline at end of file diff --git a/nim-bloom/README.md b/nim-bloom/README.md index 339228a..a62c04d 100644 --- a/nim-bloom/README.md +++ b/nim-bloom/README.md @@ -1,41 +1,95 @@ -nim-bloom -============ +# nim-bloom -Bloom filter implementation in Nim. Uses a C implementation of MurmurHash3 for optimal speed and numeric distribution. +A high-performance Bloom filter implementation in Nim. Supports both Nim's built-in MurmurHash2 (default) and an optional 128-bit MurmurHash3 implementation for large-scale use cases. -On a 10 year old Macbook Pro Retina the test case for 10M insertions executes in ~4.0 seconds and 10M lookups in ~3.5 seconds for a Bloom filter with a 1 in 1000 error rate (0.001). This is ~2.5M insertions/sec and ~2.9M lookups/sec on a single thread (but passing the `-d:release` flag to the Nim compiler and thus activating the C compiler's optimizations). If k is lowered to 5 or 6 vs. a larger "optimal" number, performance further increases to ~4M ops/sec. 
Note that this test is for a Bloom filter ~20-25MB in size and thus accurately reflects the cost of main memory accesses (vs. a smaller filter that might fit solely in L3 cache, for example, and can achieve several million additional ops/sec). +## Features +- Fast string element insertion and lookup +- Configurable error rates +- Choice between standard Nim hash (MurmurHash2) and extended 128-bit MurmurHash3 +- Optimized for both small and large-scale use cases +- Comprehensive test suite -Currently supports inserting and looking up string elements. Forthcoming features include: -* Support for other types beyond strings -* Support for iterables in the insert method -* Persistence +## Performance +Historical benchmark using MurmurHash3 implementation on a 10-year-old MacBook Pro Retina: +- ~2.5M insertions/sec (~4.0 seconds for 10M insertions) +- ~2.9M lookups/sec (~3.5 seconds for 10M lookups) +- Test configuration: 0.001 error rate, Bloom filter size ~20-25MB +- Compiled with `-d:release` flag -quickstart -==== -Quick functionality demo: -``` +These numbers reflect performance outside of CPU cache, as the filter size was intentionally larger than L3 cache. Performance can be several million operations/sec higher with smaller filters that fit in cache. + +Current performance will vary based on: +- Choice of hash function (standard Nim hash vs extended MurmurHash3) +- Hardware specifications +- Data size and memory access patterns +- Compiler optimizations + +The default configuration (using Nim's built-in hash) is optimized for typical use cases, while the extended hash option (MurmurHash3) provides better collision resistance for large-scale applications at a slight performance cost. 
+ +## Quickstart + +Basic usage: +```nim import bloom + +# Initialize with default hash (suitable for most uses) var bf = initializeBloomFilter(capacity = 10000, errorRate = 0.001) -echo bf # Get characteristics of the Bloom filter -echo bf.lookup("An element not in the Bloom filter") # Prints 'false' -bf.insert("Here we go...") -assert(bf.lookup("Here we go...")) +echo bf # Print Bloom filter characteristics +echo bf.lookup("test") # false +bf.insert("test") +assert bf.lookup("test") # true + +# For large-scale usage (>1M elements), consider using extended hash +var largeBf = initializeBloomFilter( + capacity = 2_000_000, + errorRate = 0.001, + useExtendedHash = true +) ``` +## Advanced Configuration -By default, the Bloom filter will use a mathematically optimal number of k hash functions, which minimizes the amount of error per bit of storage required. In many cases, however, it may be advantageous to specify a smaller value of k in order to save time hashing. This is supported by passing an explicit `k` parameter, which will then either create an optimal Bloom filter for the specified error rate.[1] +The Bloom filter can be configured in several ways: -[1] If `k` <= 12 and the number of required bytes per element is <= 4. If either of these conditions doesn't hold, a fully manual Bloom filter can be constructed by passing both `k` and `force_n_bits_per_elem`. - -Example: +1. Default initialization (automatically calculates optimal parameters): +```nim +var bf = initializeBloomFilter(capacity = 10000, errorRate = 0.001) ``` -var bf2 = initializeBloomFilter(capacity = 10000, errorRate = 0.001, k = 5) -assert bf2.kHashes == 5 -assert bf2.nBitsPerElem == 18 -var bf3 = initializeBloomFilter(capacity = 10000, errorRate = 0.001, k = 5, forceNBitsPerElem = 12) -assert bf3.kHashes == 5 -assert bf3.nBitsPerElem == 12 # But note, however, that bf.errorRate will *not* be correct +2. 
Specify custom number of hash functions: +```nim +var bf = initializeBloomFilter( + capacity = 10000, + errorRate = 0.001, + k = 5 # Use 5 hash functions instead of calculated optimal +) ``` + +3. Fully manual configuration: +```nim +var bf = initializeBloomFilter( + capacity = 10000, + errorRate = 0.001, + k = 5, + forceNBitsPerElem = 12, + useExtendedHash = false # Use standard hash (default) +) +``` + +Note: When specifying `k`, it must be ≤ 12 unless `forceNBitsPerElem` is also specified. The implementation will raise a `BloomFilterError` if parameters would result in suboptimal performance. + +## Hash Function Selection + +- Default: Uses Nim's built-in hash (MurmurHash2), suitable for most use cases +- Extended: Uses 128-bit MurmurHash3, better for large sets (>1M elements) where collision resistance is critical + +Choose extended hash by setting `useExtendedHash = true` during initialization. + +## Testing + +Run the test suite: +```bash +nimble test +``` \ No newline at end of file diff --git a/nim-bloom/src/bloom.nim b/nim-bloom/src/bloom.nim index 333ea7a..b9ae81e 100644 --- a/nim-bloom/src/bloom.nim +++ b/nim-bloom/src/bloom.nim @@ -3,11 +3,11 @@ import hashes import strutils import private/probabilities -# Import MurmurHash3 code and compile at the same time as Nim code +# Import MurmurHash3 code for large-scale use cases {.compile: "murmur3.c".} type - BloomFilterError = object of CatchableError + BloomFilterError* = object of CatchableError MurmurHashes = array[0..1, int] BloomFilter* = object capacity*: int @@ -15,28 +15,27 @@ type kHashes*: int mBits*: int intArray: seq[int] - nBitsPerElem*: int - useMurmurHash*: bool + useExtendedHash*: bool # Use 128-bit MurmurHash3 for very large filters + +{.push overflowChecks: off.} # Turn off overflow checks for hashing operations proc rawMurmurHash(key: cstring, len: int, seed: uint32, outHashes: var MurmurHashes): void {. 
importc: "MurmurHash3_x64_128".} proc murmurHash(key: string, seed = 0'u32): MurmurHashes = - rawMurmurHash(key, key.len, seed, outHashes = result) - -proc hashA(item: string, maxValue: int): int = - hash(item) mod maxValue - -proc hashB(item: string, maxValue: int): int = - hash(item & " b") mod maxValue + rawMurmurHash(key, key.len, seed, result) proc hashN(item: string, n: int, maxValue: int): int = - ## Get the nth hash of a string using the formula hashA + n * hashB - ## which uses 2 hash functions vs. k and has comparable properties - ## See Kirsch and Mitzenmacher, 2008: + ## Get the nth hash using Nim's built-in hash function using + ## the double hashing technique from Kirsch and Mitzenmacher, 2008: ## http://www.eecs.harvard.edu/~kirsch/pubs/bbbf/rsa.pdf - abs((hashA(item, maxValue) + n * hashB(item, maxValue))) mod maxValue + let + hashA = abs(hash(item)) mod maxValue # Use abs to handle negative hashes + hashB = abs(hash(item & " b")) mod maxValue + abs((hashA + n * hashB)) mod maxValue + +{.pop.} # Restore overflow checks proc getMOverNBitsForK(k: int, targetError: float, probabilityTable = kErrors): int = @@ -54,28 +53,19 @@ proc getMOverNBitsForK(k: int, targetError: float, proc initializeBloomFilter*(capacity: int, errorRate: float, k = 0, forceNBitsPerElem = 0, - useMurmurHash = true): BloomFilter = + useExtendedHash = false): BloomFilter = ## Initializes a Bloom filter, using a specified ``capacity``, ## ``errorRate``, and – optionally – specific number of k hash functions. ## If ``kHashes`` is < 1 (default argument is 0), ``kHashes`` will be - ## optimally calculated on the fly. Otherwise, ``kHashes`` will be set to - ## the passed integer, which requires that ``forceNBitsPerElem`` is - ## also set to be greater than 0. Otherwise a ``BloomFilterError`` - ## exception is raised. - ## See http://pages.cs.wisc.edu/~cao/papers/summary-cache/node8.html for - ## useful tables on k and m/n (n bits per element) combinations. 
- ## - ## The Bloom filter uses the MurmurHash3 implementation by default, - ## though it can fall back to using the built-in nim ``hash`` function - ## if ``useMurmurHash = false``. This is compiled alongside the Nim - ## code using the ``{.compile.}`` pragma. + ## optimally calculated. If capacity > 1M elements, consider setting + ## useExtendedHash = true to use 128-bit MurmurHash3 for better + ## collision resistance. var kHashes: int - bitsPerElem: float nBitsPerElem: int if k < 1: # Calculate optimal k and use that - bitsPerElem = ceil(-1.0 * (ln(errorRate) / (pow(ln(2.float), 2)))) + let bitsPerElem = ceil(-1.0 * (ln(errorRate) / (pow(ln(2.float), 2)))) kHashes = round(ln(2.float) * bitsPerElem).int nBitsPerElem = round(bitsPerElem).int else: # Use specified k if possible @@ -89,42 +79,42 @@ proc initializeBloomFilter*(capacity: int, errorRate: float, k = 0, mBits = capacity * nBitsPerElem mInts = 1 + mBits div (sizeof(int) * 8) - BloomFilter(capacity: capacity, errorRate: errorRate, kHashes: kHashes, - mBits: mBits, intArray: newSeq[int](mInts), nBitsPerElem: nBitsPerElem, - useMurmurHash: useMurmurHash) + BloomFilter( + capacity: capacity, + errorRate: errorRate, + kHashes: kHashes, + mBits: mBits, + intArray: newSeq[int](mInts), + useExtendedHash: useExtendedHash + ) proc `$`*(bf: BloomFilter): string = ## Prints the capacity, set error rate, number of k hash functions, ## and total bits of memory allocated by the Bloom filter. - "Bloom filter with $1 capacity, $2 error rate, $3 hash functions, and requiring $4 bits per stored element." % + "Bloom filter with $1 capacity, $2 error rate, $3 hash functions, and requiring $4 bits of memory." 
% [$bf.capacity, formatFloat(bf.errorRate, format = ffScientific, precision = 1), - $bf.kHashes, $bf.nBitsPerElem] + $bf.kHashes, + $(bf.mBits div bf.capacity)] -{.push overflowChecks: off.} +{.push overflowChecks: off.} # Turn off overflow checks for hash computations -proc hashMurmur(bf: BloomFilter, key: string): seq[int] = - result.newSeq(bf.kHashes) - let murmurHashes = murmurHash(key, seed = 0'u32) - for i in 0..