diff --git a/.gitignore b/.gitignore index 9bea433..cfc9510 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,6 @@ - +nimcache +nimcache/* +tests/bloom +nim-bloom/bloom .DS_Store +src/.DS_Store \ No newline at end of file diff --git a/nim-bloom/src/bloom.nim b/bloom_filter/bloom.nim similarity index 65% rename from nim-bloom/src/bloom.nim rename to bloom_filter/bloom.nim index ba3bdc8..096dd29 100644 --- a/nim-bloom/src/bloom.nim +++ b/bloom_filter/bloom.nim @@ -3,47 +3,18 @@ import hashes import strutils import private/probabilities -# Import MurmurHash3 code with both 128-bit and 32-bit implementations -{.compile: "murmur3.c".} - type - HashType* = enum - htMurmur128, # Default: MurmurHash3_x64_128 - htMurmur32, # MurmurHash3_x86_32 - htNimHash # Nim's Hash (currently Farm Hash) - BloomFilterError* = object of CatchableError - MurmurHashes = array[0..1, int] - BloomFilter* = object capacity*: int errorRate*: float kHashes*: int mBits*: int intArray: seq[int] - hashType*: HashType {.push overflowChecks: off.} # Turn off overflow checks for hashing operations -proc rawMurmurHash128(key: cstring, len: int, seed: uint32, - outHashes: var MurmurHashes): void {. - importc: "MurmurHash3_x64_128".} - -proc rawMurmurHash32(key: cstring, len: int, seed: uint32, - outHashes: ptr uint32): void {. - importc: "MurmurHash3_x86_32".} - -proc murmurHash128(key: string, seed = 0'u32): MurmurHashes = - var hashResult: MurmurHashes - rawMurmurHash128(key, key.len, seed, hashResult) - hashResult - -proc murmurHash32(key: string, seed = 0'u32): uint32 = - var result: uint32 - rawMurmurHash32(key, key.len, seed, addr result) - result - proc hashN(item: string, n: int, maxValue: int): int = ## Get the nth hash using Nim's built-in hash function using ## the double hashing technique from Kirsch and Mitzenmacher, 2008: @@ -76,8 +47,7 @@ proc getMOverNBitsForK(k: int, targetError: float, "Specified value of k and error rate not achievable using less than 4 bytes / element.") proc initializeBloomFilter*(capacity: int, errorRate: float, k = 0, - forceNBitsPerElem = 0, - hashType = htMurmur128): BloomFilter = + forceNBitsPerElem = 0): BloomFilter = ## Initializes a Bloom filter with specified parameters. ## ## Parameters: @@ -87,10 +57,6 @@ proc initializeBloomFilter*(capacity: int, errorRate: float, k = 0, ## See http://pages.cs.wisc.edu/~cao/papers/summary-cache/node8.html for ## useful tables on k and m/n (n bits per element) combinations. ## - forceNBitsPerElem: Optional override for bits per element - ## - hashType: Choose hash function: - ## * htMurmur128: MurmurHash3_x64_128 (default) - recommended - ## * htMurmur32: MurmurHash3_x86_32 - ## * htNimHash: Nim's Hash var kHashes: int nBitsPerElem: int @@ -115,49 +81,23 @@ proc initializeBloomFilter*(capacity: int, errorRate: float, k = 0, errorRate: errorRate, kHashes: kHashes, mBits: mBits, - intArray: newSeq[int](mInts), - hashType: hashType + intArray: newSeq[int](mInts) ) proc `$`*(bf: BloomFilter): string = ## Prints the configuration of the Bloom filter. - let hashType = case bf.hashType - of htMurmur128: "MurmurHash3_x64_128" - of htMurmur32: "MurmurHash3_x86_32" - of htNimHash: "NimHashHash" - - "Bloom filter with $1 capacity, $2 error rate, $3 hash functions, and requiring $4 bits of memory. Using $5." % + "Bloom filter with $1 capacity, $2 error rate, $3 hash functions, and requiring $4 bits of memory." % [$bf.capacity, formatFloat(bf.errorRate, format = ffScientific, precision = 1), $bf.kHashes, - $(bf.mBits div bf.capacity), - hashType] - -{.push overflowChecks: off.} # Turn off overflow checks for hash computations + $(bf.mBits div bf.capacity)] proc computeHashes(bf: BloomFilter, item: string): seq[int] = var hashes = newSeq[int](bf.kHashes) - - case bf.hashType - of htMurmur128: - let murmurHashes = murmurHash128(item, 0'u32) - for i in 0..= 1.0.0" diff --git a/nim-bloom/src/.DS_Store b/nim-bloom/src/.DS_Store deleted file mode 100644 index 961a3c0..0000000 Binary files a/nim-bloom/src/.DS_Store and /dev/null differ diff --git a/nim-bloom/src/murmur3.c b/nim-bloom/src/murmur3.c deleted file mode 100644 index 3d1b689..0000000 --- a/nim-bloom/src/murmur3.c +++ /dev/null @@ -1,314 +0,0 @@ -//----------------------------------------------------------------------------- -// MurmurHash3 was written by Austin Appleby, and is placed in the public -// domain. The author hereby disclaims copyright to this source code. - -// Note - The x86 and x64 versions do _not_ produce the same results, as the -// algorithms are optimized for their respective platforms. You can still -// compile and run any of them on any platform, but your performance with the -// non-native version will be less than optimal. - -#include "murmur3.h" - -//----------------------------------------------------------------------------- -// Platform-specific functions and macros - -#ifdef __GNUC__ -#define FORCE_INLINE __attribute__((always_inline)) inline -#else -#define FORCE_INLINE -#endif - -static inline FORCE_INLINE uint32_t rotl32 ( uint32_t x, int8_t r ) -{ - return (x << r) | (x >> (32 - r)); -} - -static inline FORCE_INLINE uint64_t rotl64 ( uint64_t x, int8_t r ) -{ - return (x << r) | (x >> (64 - r)); -} - -#define ROTL32(x,y) rotl32(x,y) -#define ROTL64(x,y) rotl64(x,y) - -#define BIG_CONSTANT(x) (x##LLU) - -//----------------------------------------------------------------------------- -// Block read - if your platform needs to do endian-swapping or can only -// handle aligned reads, do the conversion here - -#define getblock(p, i) (p[i]) - -//----------------------------------------------------------------------------- -// Finalization mix - force all bits of a hash block to avalanche - -static inline FORCE_INLINE uint32_t fmix32 ( uint32_t h ) -{ - h ^= h >> 16; - h *= 0x85ebca6b; - h ^= h >> 13; - h *= 0xc2b2ae35; - h ^= h >> 16; - - return h; -} - -//---------- - -static inline FORCE_INLINE uint64_t fmix64 ( uint64_t k ) -{ - k ^= k >> 33; - k *= BIG_CONSTANT(0xff51afd7ed558ccd); - k ^= k >> 33; - k *= BIG_CONSTANT(0xc4ceb9fe1a85ec53); - k ^= k >> 33; - - return k; -} - -//----------------------------------------------------------------------------- - -void MurmurHash3_x86_32 ( const void * key, int len, - uint32_t seed, void * out ) -{ - const uint8_t * data = (const uint8_t*)key; - const int nblocks = len / 4; - int i; - - uint32_t h1 = seed; - - uint32_t c1 = 0xcc9e2d51; - uint32_t c2 = 0x1b873593; - - //---------- - // body - - const uint32_t * blocks = (const uint32_t *)(data + nblocks*4); - - for(i = -nblocks; i; i++) - { - uint32_t k1 = getblock(blocks,i); - - k1 *= c1; - k1 = ROTL32(k1,15); - k1 *= c2; - - h1 ^= k1; - h1 = ROTL32(h1,13); - h1 = h1*5+0xe6546b64; - } - - //---------- - // tail - - const uint8_t * tail = (const uint8_t*)(data + nblocks*4); - - uint32_t k1 = 0; - - switch(len & 3) - { - case 3: k1 ^= tail[2] << 16; - case 2: k1 ^= tail[1] << 8; - case 1: k1 ^= tail[0]; - k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; - }; - - //---------- - // finalization - - h1 ^= len; - - h1 = fmix32(h1); - - *(uint32_t*)out = h1; -} - -//----------------------------------------------------------------------------- - -void MurmurHash3_x86_128 ( const void * key, const int len, - uint32_t seed, void * out ) -{ - const uint8_t * data = (const uint8_t*)key; - const int nblocks = len / 16; - int i; - - uint32_t h1 = seed; - uint32_t h2 = seed; - uint32_t h3 = seed; - uint32_t h4 = seed; - - uint32_t c1 = 0x239b961b; - uint32_t c2 = 0xab0e9789; - uint32_t c3 = 0x38b34ae5; - uint32_t c4 = 0xa1e38b93; - - //---------- - // body - - const uint32_t * blocks = (const uint32_t *)(data + nblocks*16); - - for(i = -nblocks; i; i++) - { - uint32_t k1 = getblock(blocks,i*4+0); - uint32_t k2 = getblock(blocks,i*4+1); - uint32_t k3 = getblock(blocks,i*4+2); - uint32_t k4 = getblock(blocks,i*4+3); - - k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; - - h1 = ROTL32(h1,19); h1 += h2; h1 = h1*5+0x561ccd1b; - - k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2; - - h2 = ROTL32(h2,17); h2 += h3; h2 = h2*5+0x0bcaa747; - - k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; - - h3 = ROTL32(h3,15); h3 += h4; h3 = h3*5+0x96cd1c35; - - k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4; - - h4 = ROTL32(h4,13); h4 += h1; h4 = h4*5+0x32ac3b17; - } - - //---------- - // tail - - const uint8_t * tail = (const uint8_t*)(data + nblocks*16); - - uint32_t k1 = 0; - uint32_t k2 = 0; - uint32_t k3 = 0; - uint32_t k4 = 0; - - switch(len & 15) - { - case 15: k4 ^= tail[14] << 16; - case 14: k4 ^= tail[13] << 8; - case 13: k4 ^= tail[12] << 0; - k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4; - - case 12: k3 ^= tail[11] << 24; - case 11: k3 ^= tail[10] << 16; - case 10: k3 ^= tail[ 9] << 8; - case 9: k3 ^= tail[ 8] << 0; - k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; - - case 8: k2 ^= tail[ 7] << 24; - case 7: k2 ^= tail[ 6] << 16; - case 6: k2 ^= tail[ 5] << 8; - case 5: k2 ^= tail[ 4] << 0; - k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2; - - case 4: k1 ^= tail[ 3] << 24; - case 3: k1 ^= tail[ 2] << 16; - case 2: k1 ^= tail[ 1] << 8; - case 1: k1 ^= tail[ 0] << 0; - k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; - }; - - //---------- - // finalization - - h1 ^= len; h2 ^= len; h3 ^= len; h4 ^= len; - - h1 += h2; h1 += h3; h1 += h4; - h2 += h1; h3 += h1; h4 += h1; - - h1 = fmix32(h1); - h2 = fmix32(h2); - h3 = fmix32(h3); - h4 = fmix32(h4); - - h1 += h2; h1 += h3; h1 += h4; - h2 += h1; h3 += h1; h4 += h1; - - ((uint32_t*)out)[0] = h1; - ((uint32_t*)out)[1] = h2; - ((uint32_t*)out)[2] = h3; - ((uint32_t*)out)[3] = h4; -} - -//----------------------------------------------------------------------------- - -void MurmurHash3_x64_128 ( const void * key, const int len, - const uint32_t seed, void * out ) -{ - const uint8_t * data = (const uint8_t*)key; - const int nblocks = len / 16; - int i; - - uint64_t h1 = seed; - uint64_t h2 = seed; - - uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5); - uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f); - - //---------- - // body - - const uint64_t * blocks = (const uint64_t *)(data); - - for(i = 0; i < nblocks; i++) - { - uint64_t k1 = getblock(blocks,i*2+0); - uint64_t k2 = getblock(blocks,i*2+1); - - k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; - - h1 = ROTL64(h1,27); h1 += h2; h1 = h1*5+0x52dce729; - - k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; - - h2 = ROTL64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5; - } - - //---------- - // tail - - const uint8_t * tail = (const uint8_t*)(data + nblocks*16); - - uint64_t k1 = 0; - uint64_t k2 = 0; - - switch(len & 15) - { - case 15: k2 ^= (uint64_t)(tail[14]) << 48; - case 14: k2 ^= (uint64_t)(tail[13]) << 40; - case 13: k2 ^= (uint64_t)(tail[12]) << 32; - case 12: k2 ^= (uint64_t)(tail[11]) << 24; - case 11: k2 ^= (uint64_t)(tail[10]) << 16; - case 10: k2 ^= (uint64_t)(tail[ 9]) << 8; - case 9: k2 ^= (uint64_t)(tail[ 8]) << 0; - k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; - - case 8: k1 ^= (uint64_t)(tail[ 7]) << 56; - case 7: k1 ^= (uint64_t)(tail[ 6]) << 48; - case 6: k1 ^= (uint64_t)(tail[ 5]) << 40; - case 5: k1 ^= (uint64_t)(tail[ 4]) << 32; - case 4: k1 ^= (uint64_t)(tail[ 3]) << 24; - case 3: k1 ^= (uint64_t)(tail[ 2]) << 16; - case 2: k1 ^= (uint64_t)(tail[ 1]) << 8; - case 1: k1 ^= (uint64_t)(tail[ 0]) << 0; - k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; - }; - - //---------- - // finalization - - h1 ^= len; h2 ^= len; - - h1 += h2; - h2 += h1; - - h1 = fmix64(h1); - h2 = fmix64(h2); - - h1 += h2; - h2 += h1; - - ((uint64_t*)out)[0] = h1; - ((uint64_t*)out)[1] = h2; -} - -//----------------------------------------------------------------------------- diff --git a/nim-bloom/src/murmur3.h b/nim-bloom/src/murmur3.h deleted file mode 100644 index 6928384..0000000 --- a/nim-bloom/src/murmur3.h +++ /dev/null @@ -1,21 +0,0 @@ -//----------------------------------------------------------------------------- -// MurmurHash3 was written by Austin Appleby, and is placed in the -// public domain. The author hereby disclaims copyright to this source -// code. - -#ifndef _MURMURHASH3_H_ -#define _MURMURHASH3_H_ - -#include - -//----------------------------------------------------------------------------- - -void MurmurHash3_x86_32 (const void *key, int len, uint32_t seed, void *out); - -void MurmurHash3_x86_128(const void *key, int len, uint32_t seed, void *out); - -void MurmurHash3_x64_128(const void *key, int len, uint32_t seed, void *out); - -//----------------------------------------------------------------------------- - -#endif // _MURMURHASH3_H_ \ No newline at end of file diff --git a/nim-bloom/tests/config.nims b/nim-bloom/tests/config.nims deleted file mode 100644 index 80091ff..0000000 --- a/nim-bloom/tests/config.nims +++ /dev/null @@ -1 +0,0 @@ -switch("path", "$projectDir/../src") diff --git a/nim-bloom/tests/test.nim b/tests/bloom.nim similarity index 53% rename from nim-bloom/tests/test.nim rename to tests/bloom.nim index 88d70fe..eb5da32 100644 --- a/nim-bloom/tests/test.nim +++ b/tests/bloom.nim @@ -1,84 +1,7 @@ import unittest -import strutils -include bloom +include ../bloom_filter/bloom from random import rand, randomize -suite "murmur": - # Test murmurhash3 implementations - setup: - var hashOutputs: MurmurHashes - hashOutputs = [0, 0] - rawMurmurHash128("hello", 5, 0'u32, hashOutputs) - - test "murmur128 raw": - check int(hashOutputs[0]) == -3758069500696749310 - check int(hashOutputs[1]) == 6565844092913065241 - - test "murmur128 wrapped": - let hashOutputs2 = murmurHash128("hello", 0'u32) - check hashOutputs2[0] == hashOutputs[0] - check hashOutputs2[1] == hashOutputs[1] - - test "murmur32": - let hash1 = murmurHash32("hello", 0'u32) - let hash2 = murmurHash32("hello", 0'u32) - check hash1 == hash2 # Same input should give same output - - let hash3 = murmurHash32("hello", 10'u32) - check hash1 != hash3 # Different seeds should give different outputs - -suite "hash quality": - test "hash type selection": - let bfMurmur128 = initializeBloomFilter(100, 0.01, hashType = htMurmur128) - let bfMurmur32 = initializeBloomFilter(100, 0.01, hashType = htMurmur32) - let bfNimHash = initializeBloomFilter(100, 0.01, hashType = htNimHash) - - check bfMurmur128.hashType == htMurmur128 - check bfMurmur32.hashType == htMurmur32 - check bfNimHash.hashType == htNimHash - - test "quality across hash types": - const testSize = 10_000 - let patterns = @[ - "shortstr", - repeat("a", 1000), # Very long string - "special@#$%^&*()", # Special characters - "unicode→★∑≈", # Unicode characters - repeat("pattern", 10) # Repeating pattern - ] - - for hashType in [htMurmur128, htMurmur32, htNimHash]: - var bf = initializeBloomFilter(testSize, 0.01, hashType = hashType) - var inserted = newSeq[string](testSize) - - # Test pattern handling - for pattern in patterns: - bf.insert(pattern) - check bf.lookup(pattern) - - # Test general insertion and lookup - for i in 0..