diff --git a/nim-bloom/.gitignore b/nim-bloom/.gitignore index a83eef1..f595a4e 100644 --- a/nim-bloom/.gitignore +++ b/nim-bloom/.gitignore @@ -1,6 +1,8 @@ nimcache nimcache/* tests/test +benches/bench +benches/bench_arch_end bloom *.html *.css diff --git a/nim-bloom/README.md b/nim-bloom/README.md index a62c04d..c255397 100644 --- a/nim-bloom/README.md +++ b/nim-bloom/README.md @@ -1,91 +1,117 @@ # nim-bloom -A high-performance Bloom filter implementation in Nim. Supports both Nim's built-in MurmurHash2 (default) and an optional 128-bit MurmurHash3 implementation for large-scale use cases. +A high-performance Bloom filter implementation in Nim offering standard and custom hash function options with different performance characteristics and false positive rates. ## Features - Fast string element insertion and lookup - Configurable error rates -- Choice between standard Nim hash (MurmurHash2) and extended 128-bit MurmurHash3 -- Optimized for both small and large-scale use cases -- Comprehensive test suite +- Choice between standard Nim hash and custom MurmurHash3 (128-bit or 32-bit) +- Optimized for supporting different use cases of speed and accuracy +- Comprehensive test suite and benchmarks -## Performance +## Usage -Historical benchmark using MurmurHash3 implementation on a 10-year-old Macbook Pro Retina: -- ~2.5M insertions/sec (~4.0 seconds for 10M insertions) -- ~2.9M lookups/sec (~3.5 seconds for 10M lookups) -- Test configuration: 0.001 error rate, Bloom filter size ~20-25MB -- Compiled with `-d:release` flag - -These numbers reflect performance outside of CPU cache, as the filter size was intentionally larger than L3 cache. Performance can be several million operations/sec higher with smaller filters that fit in cache. 
- -Current performance will vary based on: -- Choice of hash function (standard Nim hash vs extended MurmurHash3) -- Hardware specifications -- Data size and memory access patterns -- Compiler optimizations - -The default configuration (using Nim's built-in hash) is optimized for typical use cases, while the extended hash option (MurmurHash3) provides better collision resistance for large-scale applications at a slight performance cost. - -## Quickstart - -Basic usage: +Basic usage (defaults to MurmurHash3_128): ```nim -import bloom +import bloom2 -# Initialize with default hash (suitable for most uses) -var bf = initializeBloomFilter(capacity = 10000, errorRate = 0.001) -echo bf # Print Bloom filter characteristics -echo bf.lookup("test") # false +# Initialize with default hash (MurmurHash3_128) +var bf = initializeBloomFilter(capacity = 10000, errorRate = 0.01) + +# Or explicitly specify hash type +var bf32 = initializeBloomFilter( + capacity = 10000, + errorRate = 0.01, + hashType = htMurmur32 # Use 32-bit implementation +) + +# Basic operations bf.insert("test") -assert bf.lookup("test") # true - -# For large-scale usage (>1M elements), consider using extended hash -var largeBf = initializeBloomFilter( - capacity = 2_000_000, - errorRate = 0.001, - useExtendedHash = true -) +assert bf.lookup("test") ``` -## Advanced Configuration - -The Bloom filter can be configured in several ways: - -1. Default initialization (automatically calculates optimal parameters): -```nim -var bf = initializeBloomFilter(capacity = 10000, errorRate = 0.001) -``` - -2. Specify custom number of hash functions: -```nim -var bf = initializeBloomFilter( - capacity = 10000, - errorRate = 0.001, - k = 5 # Use 5 hash functions instead of calculated optimal -) -``` - -3. 
Fully manual configuration: -```nim -var bf = initializeBloomFilter( - capacity = 10000, - errorRate = 0.001, - k = 5, - forceNBitsPerElem = 12, - useExtendedHash = false # Use standard hash (default) -) -``` - -Note: When specifying `k`, it must be ≤ 12 unless `forceNBitsPerElem` is also specified. The implementation will raise a `BloomFilterError` if parameters would result in suboptimal performance. - ## Hash Function Selection -- Default: Uses Nim's built-in hash (MurmurHash2), suitable for most use cases -- Extended: Uses 128-bit MurmurHash3, better for large sets (>1M elements) where collision resistance is critical +1. Use MurmurHash3_128 (default) when: + - You need the best balance of performance and accuracy + - Memory isn't severely constrained + - Working with large datasets + - False positive rates are important -Choose extended hash by setting `useExtendedHash = true` during initialization. +2. Use MurmurHash3_32 when: + - Running on 32-bit systems + - Memory is constrained + - Working with smaller datasets + - String concatenation overhead for second hash, causing higher insertion and lookup times, is acceptable. + +3. 
Use NimHash when: + - Consistency with Nim's default hashing is important + - Working with smaller datasets where performance is less critical + - Future availability of better hash functions or performant implementations + +Nim's Hash Implementation: + - Default (no flags): Uses FarmHash implementation + - With `-d:nimStringHash2`: Uses Nim's MurmurHash3_32 implementation + - Our implementation allows explicit choice regardless of compilation flags, and our MurmurHash3_32 performs better because it directly uses a native C implementation + +## Performance Characteristics +### For 1M items - Random Strings +``` +Insertion Speed: +MurmurHash3_128: ~6.8M ops/sec +MurmurHash3_32: ~5.9M ops/sec +FarmHash: ~2.1M ops/sec + +False Positive Rates: +MurmurHash3_128: ~0.84% +MurmurHash3_32: ~0.83% +FarmHash: ~0.82% +``` + +These measurements show MurmurHash3_128's balanced performance profile, offering the best speed and competitive false positive rates. + +Performance will vary based on: +- Choice of hash function +- Hardware specifications +- Data size and memory access patterns (inside vs outside cache) +- Compiler optimizations + +For detailed benchmarks across different data patterns and sizes, see [benches](benches/). + +## Implementation Details + +### Double Hashing Technique +This implementation uses the Kirsch-Mitzenmacher method to generate k hash values from two initial hashes. The implementation varies by hash type: + +1. MurmurHash3_128: +```nim +h(i) = abs((hash1 + i * hash2) mod m) +``` +- Uses both 64-bit hashes from 128-bit output +- Natural double-hash implementation + +2. 
MurmurHash3_32: +```nim +let baseHash = murmurHash32(item, 0'u32) +let secondHash = murmurHash32(item & " b", 0'u32) +``` +- Uses string concatenation by default for the second hash +- Bit rotation for the second hash provides sufficient randomness in some use cases while being much faster than string concatenation (but results in a higher FP rate) +- Choose between bit rotation or string concatenation as per your use-case. + +3. Nim's Default Hash: +```nim + let + hashA = abs(hash(item)) mod maxValue + hashB = abs(hash(item & " b")) mod maxValue + h(i) = abs((hashA + n * hashB)) mod maxValue +``` +- Based on FarmHash or Nim's MurmurHash (if the compilation flag is passed) +- Uses string concatenation by default. +- Lower FP rate than bit rotation but comes at the cost of higher insertion and lookup times. + +*Tip:* Bit rotation values can be configured as well. Use prime numbers for better mixing: 7, 11, 13, 17 for 32-bit; 21, 23, 27, 33 for 64-bit. Smaller rotations provide less mixing but are faster than larger rotations. ## Testing diff --git a/nim-bloom/benches/bench.nim b/nim-bloom/benches/bench.nim new file mode 100644 index 0000000..8fd9425 --- /dev/null +++ b/nim-bloom/benches/bench.nim @@ -0,0 +1,123 @@ +import times, random, strutils +include bloom + +type + DataPattern = enum + dpRandom, # Random strings + dpSequential, # Sequential numbers + dpFixed, # Fixed length strings + dpLong, # Long strings + dpSpecial # Strings with special characters + +type + BenchmarkResult = tuple[ + insertTime: float, + lookupTime: float, + falsePositives: int + ] + +proc generateBenchData(pattern: DataPattern, size: int, isLookupData: bool = false): seq[string] = + result = newSeq[string](size) + let offset = if isLookupData: size * 2 else: 0 # Ensure lookup data is well separated + + case pattern: + of dpRandom: + for i in 0.. 1M elements, consider setting - ## useExtendedHash = true to use 128-bit MurmurHash3 for better - ## collision resistance. 
+ hashType = htMurmur128): BloomFilter = + ## Initializes a Bloom filter with specified parameters. + ## + ## Parameters: + ## - capacity: Expected number of elements to be inserted + ## - errorRate: Desired false positive rate (e.g., 0.01 for 1%) + ## - k: Optional number of hash functions. If 0, calculated optimally + ## See http://pages.cs.wisc.edu/~cao/papers/summary-cache/node8.html for + ## useful tables on k and m/n (n bits per element) combinations. + ## - forceNBitsPerElem: Optional override for bits per element + ## - hashType: Choose hash function: + ## * htMurmur128: MurmurHash3_x64_128 (default) - recommended + ## * htMurmur32: MurmurHash3_x86_32 + ## * htNimHash: Nim's Default Hash var kHashes: int nBitsPerElem: int @@ -85,29 +114,44 @@ proc initializeBloomFilter*(capacity: int, errorRate: float, k = 0, kHashes: kHashes, mBits: mBits, intArray: newSeq[int](mInts), - useExtendedHash: useExtendedHash + hashType: hashType ) proc `$`*(bf: BloomFilter): string = - ## Prints the capacity, set error rate, number of k hash functions, - ## and total bits of memory allocated by the Bloom filter. - "Bloom filter with $1 capacity, $2 error rate, $3 hash functions, and requiring $4 bits of memory." % + ## Prints the configuration of the Bloom filter. + let hashType = case bf.hashType + of htMurmur128: "MurmurHash3_x64_128" + of htMurmur32: "MurmurHash3_x86_32" + of htNimHash: "NimHashHash" + + "Bloom filter with $1 capacity, $2 error rate, $3 hash functions, and requiring $4 bits of memory. Using $5." 
% [$bf.capacity, formatFloat(bf.errorRate, format = ffScientific, precision = 1), $bf.kHashes, - $(bf.mBits div bf.capacity)] + $(bf.mBits div bf.capacity), + hashType] {.push overflowChecks: off.} # Turn off overflow checks for hash computations proc computeHashes(bf: BloomFilter, item: string): seq[int] = var hashes = newSeq[int](bf.kHashes) - if bf.useExtendedHash: - let murmurHashes = murmurHash(item, 0'u32) + + case bf.hashType + of htMurmur128: + let murmurHashes = murmurHash128(item, 0'u32) for i in 0..