chore: updated bloom, fix in buffer sweep

This commit is contained in:
shash256 2025-01-13 16:12:13 +04:00
parent e16148638c
commit ecde0a9ea8
21 changed files with 432 additions and 954 deletions

BIN
nim-bloom/.DS_Store vendored

Binary file not shown.

View File

@ -1,58 +0,0 @@
name: website
on: [push] # debugging only
#on:
#  push:
#    tags:
#      - 'v*.*.*'
jobs:
  publish:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v1
      - name: Set output
        id: vars
        # Strip the leading "refs/tags/" (10 characters) from GITHUB_REF and
        # expose the remainder as the step output `tag`. The output is written
        # to $GITHUB_OUTPUT because the `::set-output` workflow command is
        # deprecated.
        run: echo "tag=${GITHUB_REF:10}" >> "$GITHUB_OUTPUT"
      - name: Cache choosenim
        id: cache-choosenim
        uses: actions/cache@v1
        with:
          path: ~/.choosenim
          key: ${{ runner.os }}-choosenim-stable
      - name: Cache nimble
        id: cache-nimble
        uses: actions/cache@v1
        with:
          path: ~/.nimble
          key: ${{ runner.os }}-nimble-stable
      - uses: jiro4989/setup-nim-action@v1.0.2
        with:
          nim-version: 'stable'
      - name: Build and test
        env:
          RELEASE_VERSION: ${{ steps.vars.outputs.tag }}
        run: |
          nimble test -Y
      - name: Build doc
        env:
          RELEASE_VERSION: ${{ steps.vars.outputs.tag }}
        run: |
          # Due to bug https://github.com/nim-lang/Nim/issues/14281, compile the documentation separately.
          nimble doc --git.url:https://github.com/$GITHUB_REPOSITORY --git.commit:$RELEASE_VERSION bloom.nim
          nimble doc --git.url:https://github.com/$GITHUB_REPOSITORY --git.commit:$RELEASE_VERSION private/probabilities.nim
          find .
          mkdir -p ./public
          mv bloom.html probabilities.html nimdoc.out.css ./public/
          cd ./public/
          ln -s ./bloom.html index.html
          cd ../
      - name: Deploy
        if: success()
        uses: crazy-max/ghaction-github-pages@v1.3.0
        with:
          target_branch: gh-pages
          build_dir: ./public
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

View File

@ -1,6 +0,0 @@
nimcache
nimcache/*
tests/test
bloom
*.html
*.css

View File

@ -1,20 +0,0 @@
The MIT License (MIT)
Copyright (c) 2013 Nick Greenfield
Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

View File

@ -1,41 +0,0 @@
nim-bloom
============
Bloom filter implementation in Nim. Uses a C implementation of MurmurHash3 for optimal speed and numeric distribution.
On a 10 year old Macbook Pro Retina the test case for 10M insertions executes in ~4.0 seconds and 10M lookups in ~3.5 seconds for a Bloom filter with a 1 in 1000 error rate (0.001). This is ~2.5M insertions/sec and ~2.9M lookups/sec on a single thread (when passing the `-d:release` flag to the Nim compiler, which activates the C compiler's optimizations). If k is lowered to 5 or 6 vs. a larger "optimal" number, performance further increases to ~4M ops/sec. Note that this test is for a Bloom filter ~20-25MB in size and thus accurately reflects the cost of main memory accesses (vs. a smaller filter that might fit solely in L3 cache, for example, and can achieve several million additional ops/sec).
Currently supports inserting and looking up string elements. Forthcoming features include:
* Support for other types beyond strings
* Support for iterables in the insert method
* Persistence
quickstart
====
Quick functionality demo:
```
import bloom
var bf = initializeBloomFilter(capacity = 10000, errorRate = 0.001)
echo bf # Get characteristics of the Bloom filter
echo bf.lookup("An element not in the Bloom filter") # Prints 'false'
bf.insert("Here we go...")
assert(bf.lookup("Here we go..."))
```
By default, the Bloom filter will use a mathematically optimal number of k hash functions, which minimizes the amount of error per bit of storage required. In many cases, however, it may be advantageous to specify a smaller value of k in order to save time hashing. This is supported by passing an explicit `k` parameter, which will then create a Bloom filter using the smallest number of bits per element that achieves the specified error rate for that `k`.[1]
[1] This works only if `k` <= 12 and the number of required bytes per element is <= 4. If either of these conditions doesn't hold, a fully manual Bloom filter can be constructed by passing both `k` and `forceNBitsPerElem`.
Example:
```
var bf2 = initializeBloomFilter(capacity = 10000, errorRate = 0.001, k = 5)
assert bf2.kHashes == 5
assert bf2.nBitsPerElem == 18
var bf3 = initializeBloomFilter(capacity = 10000, errorRate = 0.001, k = 5, forceNBitsPerElem = 12)
assert bf3.kHashes == 5
assert bf3.nBitsPerElem == 12 # But note, however, that bf.errorRate will *not* be correct
```

View File

@ -1,9 +0,0 @@
# Package
version = "0.1.0"
author = "Boyd Greenfield"
description = "Efficient Bloom filter implementation for Nim using MurmurHash3."
license = "MIT"
srcDir = "src"
# Dependencies
requires "nim >= 1.0.0"

View File

@ -1,244 +0,0 @@
from math import ceil, ln, pow, round
import hashes
import strutils
import private/probabilities
# Import MurmurHash3 code and compile at the same time as Nim code
{.compile: "murmur3.c".}
type
  # Raised by this module when the requested filter parameters cannot be
  # satisfied (see getMOverNBitsForK).
  BloomFilterError* = object of CatchableError
  # The two integer outputs filled in by the MurmurHash3 binding.
  MurmurHashes = array[0..1, int]
  BloomFilter* = object
    capacity*: int       ## Capacity value passed to ``initializeBloomFilter``.
    errorRate*: float    ## Error rate passed to ``initializeBloomFilter``.
    kHashes*: int        ## Number of hash positions computed per item.
    mBits*: int          ## Addressable bits: ``capacity * nBitsPerElem``.
    intArray: seq[int]   ## Bit storage, one machine ``int`` per word.
    nBitsPerElem*: int   ## Bits allotted per element (m/n).
    useMurmurHash*: bool ## Use MurmurHash3 if true, Nim's ``hash`` otherwise.
# The C implementation writes two 64-bit words into its output buffer, so the
# Nim-side buffer must be exactly 16 bytes. Fail at compile time on targets
# where ``int`` is narrower rather than corrupting memory at run time.
static:
  doAssert sizeof(MurmurHashes) == 16,
    "MurmurHash3_x64_128 writes 16 bytes; MurmurHashes must be 16 bytes wide"

# Binding to ``MurmurHash3_x64_128`` from murmur3.c. The C prototype takes the
# key length as a C ``int``, so it is declared as ``cint`` here (Nim's ``int``
# is pointer-sized and does not match the C ABI for that parameter).
proc rawMurmurHash(key: cstring, len: cint, seed: uint32,
                   outHashes: var MurmurHashes): void {.
  importc: "MurmurHash3_x64_128".}

proc murmurHash(key: string, seed = 0'u32): MurmurHashes =
  ## Hashes ``key`` with MurmurHash3 (x64, 128-bit) using ``seed`` and returns
  ## the two resulting words.
  ##
  ## The length conversion to ``cint`` is range-checked, so a key longer than
  ## ``high(cint)`` bytes raises instead of being silently truncated.
  rawMurmurHash(key.cstring, key.len.cint, seed, outHashes = result)
proc hashA(item: string, maxValue: int): int =
  ## First base hash: Nim's built-in ``hash`` of ``item`` reduced modulo
  ## ``maxValue``. The result may be negative.
  result = hash(item) mod maxValue

proc hashB(item: string, maxValue: int): int =
  ## Second base hash: ``hash`` of ``item`` with the suffix ``" b"`` appended,
  ## reduced modulo ``maxValue``. The result may be negative.
  let salted = item & " b"
  result = hash(salted) mod maxValue

proc hashN(item: string, n: int, maxValue: int): int =
  ## Returns the ``n``-th hash of ``item`` in ``0 ..< maxValue`` using the
  ## double-hashing scheme ``hashA + n * hashB``, which needs only two hash
  ## functions instead of k while keeping comparable properties.
  ## See Kirsch and Mitzenmacher, 2008:
  ## http://www.eecs.harvard.edu/~kirsch/pubs/bbbf/rsa.pdf
  let
    first = hashA(item, maxValue)
    second = hashB(item, maxValue)
    combined = first + n * second
  result = abs(combined) mod maxValue
proc getMOverNBitsForK(k: int, targetError: float,
                       probabilityTable = kErrors): int =
  ## Looks up the smallest m/n (bits per element) whose tabulated error rate
  ## for ``k`` hash functions is strictly below ``targetError``.
  ##
  ## Raises ``BloomFilterError`` when ``k`` is outside ``0..12`` or when no
  ## table entry for ``k`` is below ``targetError``.
  if k < 0 or k > 12:
    raise newException(BloomFilterError,
      "K must be <= 12 if forceNBitsPerElem is not also specified.")
  # Entries 0 and 1 are never considered; the scan starts at m/n = 2.
  for bitsPerElem in 2..probabilityTable[k].high:
    let tabulatedError = probabilityTable[k][bitsPerElem]
    if tabulatedError < targetError:
      return bitsPerElem
  raise newException(BloomFilterError,
    "Specified value of k and error rate for which is not achievable using less than 4 bytes / element.")
proc initializeBloomFilter*(capacity: int, errorRate: float, k = 0,
                            forceNBitsPerElem = 0,
                            useMurmurHash = true): BloomFilter =
  ## Initializes a Bloom filter for ``capacity`` elements and a target
  ## false-positive rate of ``errorRate``.
  ##
  ## * If ``k`` is < 1 (the default), the number of bits per element and the
  ##   number of hash functions are derived from ``errorRate``.
  ## * If ``k`` is >= 1 and ``forceNBitsPerElem`` is < 1, the bits per
  ##   element are taken from the probability lookup table for that ``k``.
  ## * If both ``k`` and ``forceNBitsPerElem`` are >= 1, they are used as-is
  ##   and ``errorRate`` is only stored, not used for sizing.
  ##
  ## Raises ``BloomFilterError`` if ``capacity`` is < 1, if ``errorRate`` is
  ## not strictly between 0 and 1 when it is needed to derive the parameters,
  ## or if the lookup table cannot satisfy the requested ``k``/``errorRate``.
  ##
  ## See http://pages.cs.wisc.edu/~cao/papers/summary-cache/node8.html for
  ## useful tables on k and m/n (n bits per element) combinations.
  ##
  ## The Bloom filter uses the MurmurHash3 implementation by default,
  ## though it can fall back to using the built-in nim ``hash`` function
  ## if ``useMurmurHash = false``. This is compiled alongside the Nim
  ## code using the ``{.compile.}`` pragma.
  if capacity < 1:
    # A zero-bit filter would make every later hash reduction divide by zero.
    raise newException(BloomFilterError, "capacity must be at least 1.")
  var
    kHashes: int
    nBitsPerElem: int
  if k < 1: # Calculate optimal k and use that
    # Written as a negated conjunction so that NaN is rejected as well.
    if not (errorRate > 0.0 and errorRate < 1.0):
      raise newException(BloomFilterError,
        "errorRate must be greater than 0 and less than 1.")
    let bitsPerElem = ceil(-1.0 * (ln(errorRate) / (pow(ln(2.float), 2))))
    kHashes = round(ln(2.float) * bitsPerElem).int
    nBitsPerElem = round(bitsPerElem).int
  else: # Use specified k if possible
    if forceNBitsPerElem < 1: # Use lookup table
      nBitsPerElem = getMOverNBitsForK(k = k, targetError = errorRate)
    else:
      nBitsPerElem = forceNBitsPerElem
    kHashes = k
  let
    mBits = capacity * nBitsPerElem
    # One extra word so that every bit index below mBits has a backing int.
    mInts = 1 + mBits div (sizeof(int) * 8)
  BloomFilter(capacity: capacity, errorRate: errorRate, kHashes: kHashes,
    mBits: mBits, intArray: newSeq[int](mInts), nBitsPerElem: nBitsPerElem,
    useMurmurHash: useMurmurHash)
proc `$`*(bf: BloomFilter): string =
  ## Renders a one-line summary of the filter: capacity, configured error
  ## rate (scientific notation), number of hash functions and the number of
  ## bits used per stored element.
  let
    rateText = formatFloat(bf.errorRate, format = ffScientific, precision = 1)
    fields = [$bf.capacity, rateText, $bf.kHashes, $bf.nBitsPerElem]
  result = "Bloom filter with $1 capacity, $2 error rate, $3 hash functions, and requiring $4 bits per stored element." % fields
{.push overflowChecks: off.}
proc hashMurmur(bf: BloomFilter, key: string): seq[int] =
  ## Derives ``bf.kHashes`` bit positions for ``key`` from the two MurmurHash3
  ## output words, combining them as ``h0 + i * h1`` and reducing modulo
  ## ``bf.mBits``. Overflow checks are disabled around this proc, so the
  ## combination is allowed to wrap.
  let words = murmurHash(key, seed = 0'u32)
  result = newSeq[int](bf.kHashes)
  for idx, slot in result.mpairs:
    slot = abs(words[0] + idx * words[1]) mod bf.mBits
{.pop.}
proc hashNim(bf: BloomFilter, key: string): seq[int] =
  ## Derives ``bf.kHashes`` bit positions for ``key`` using ``hashN``, i.e.
  ## Nim's built-in ``hash`` instead of MurmurHash3.
  result = newSeq[int](bf.kHashes)
  for idx, slot in result.mpairs:
    slot = hashN(key, idx, bf.mBits)
proc hash(bf: BloomFilter, key: string): seq[int] =
  ## Returns the bit positions for ``key``, dispatching on
  ## ``bf.useMurmurHash`` to the MurmurHash3 or the built-in hash variant.
  result =
    if bf.useMurmurHash: bf.hashMurmur(key)
    else: bf.hashNim(key)
proc insert*(bf: var BloomFilter, item: string) =
  ## Insert an item (string) into the Bloom filter by setting every bit
  ## position that the filter's hash function yields for it.
  const bitsPerWord = sizeof(int) * 8
  for bitIndex in bf.hash(item):
    let
      word = bitIndex div bitsPerWord
      mask = 1 shl (bitIndex mod bitsPerWord)
    bf.intArray[word] = bf.intArray[word] or mask
proc lookup*(bf: BloomFilter, item: string): bool =
  ## Lookup an item (string) in the Bloom filter.
  ## If the item is present, ``lookup`` is guaranteed to return ``true``.
  ## If the item is not present, ``lookup`` will return ``false``
  ## with a probability 1 - ``bf.errorRate``.
  const bitsPerWord = sizeof(int) * 8
  for bitIndex in bf.hash(item):
    let mask = 1 shl (bitIndex mod bitsPerWord)
    # A single cleared bit proves the item was never inserted.
    if (bf.intArray[bitIndex div bitsPerWord] and mask) == 0:
      return false
  result = true
when isMainModule:
  # Self-test and micro-benchmark; only compiled when this module is the
  # main program.
  from random import rand, randomize
  import times

  # Test murmurhash 3
  echo("Testing MurmurHash3 code...")
  var hashOutputs: MurmurHashes
  hashOutputs = [0, 0]
  rawMurmurHash("hello", 5, 0, hashOutputs)
  assert int(hashOutputs[0]) == -3758069500696749310 # Correct murmur outputs (cast to int64)
  assert int(hashOutputs[1]) == 6565844092913065241
  let hashOutputs2 = murmurHash("hello", 0)
  assert hashOutputs2[0] == hashOutputs[0]
  assert hashOutputs2[1] == hashOutputs[1]
  let hashOutputs3 = murmurHash("hello", 10)
  assert hashOutputs3[0] != hashOutputs[0]
  assert hashOutputs3[1] != hashOutputs[1]

  # Some quick and dirty tests (not complete)
  var nElementsToTest = 100000
  var bf = initializeBloomFilter(nElementsToTest, 0.001)
  assert(bf of BloomFilter)
  echo(bf)
  var bf2 = initializeBloomFilter(10000, 0.001, k = 4,
      forceNBitsPerElem = 20)
  assert(bf2 of BloomFilter)
  echo(bf2)
  echo("Testing insertions and lookups...")
  echo("Test element in BF2?: ", bf2.lookup("testing"))
  echo("Inserting element.")
  bf2.insert("testing")
  echo("Test element in BF2?: ", bf2.lookup("testing"))
  assert(bf2.lookup("testing"))

  # Now test for speed with bf
  randomize(2882) # Seed the RNG
  let sampleChars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
  var kTestElements = newSeq[string](nElementsToTest)
  for i in 0..(nElementsToTest - 1):
    var newString = ""
    for j in 0..7:
      # rand(51) draws from the first 52 characters (the letters) only.
      newString.add(sampleChars[rand(51)])
    kTestElements[i] = newString

  var startTime, endTime: float
  startTime = cpuTime()
  for i in 0..(nElementsToTest - 1):
    bf.insert(kTestElements[i])
  endTime = cpuTime()
  echo("Took ", formatFloat(endTime - startTime, format = ffDecimal,
      precision = 4), " seconds to insert ", nElementsToTest, " items.")

  var falsePositives = 0
  for i in 0..(nElementsToTest - 1):
    var falsePositiveString = ""
    for j in 0..8: # By definition not in bf as 9 chars not 8
      falsePositiveString.add(sampleChars[rand(51)])
    if bf.lookup(falsePositiveString):
      falsePositives += 1
  echo("N false positives (of ", nElementsToTest, " lookups): ", falsePositives)
  echo("False positive rate ", formatFloat(falsePositives / nElementsToTest,
      format = ffDecimal, precision = 4))

  var lookupErrors = 0
  startTime = cpuTime()
  for i in 0..(nElementsToTest - 1):
    if not bf.lookup(kTestElements[i]):
      lookupErrors += 1
  endTime = cpuTime()
  echo("Took ", formatFloat(endTime - startTime, format = ffDecimal,
      precision = 4), " seconds to lookup ", nElementsToTest, " items.")
  echo("N lookup errors (should be 0): ", lookupErrors)

  # Finally test correct k / mOverN specification,
  # first case raises an error, second works
  try:
    discard getMOverNBitsForK(k = 2, targetError = 0.00001)
    assert false
  except BloomFilterError:
    assert true
  assert getMOverNBitsForK(k = 2, targetError = 0.1) == 6
  assert getMOverNBitsForK(k = 7, targetError = 0.01) == 10
  assert getMOverNBitsForK(k = 7, targetError = 0.001) == 16
  var bf3 = initializeBloomFilter(1000, 0.01, k = 4)
  assert bf3.nBitsPerElem == 11

View File

@ -1,314 +0,0 @@
//-----------------------------------------------------------------------------
// MurmurHash3 was written by Austin Appleby, and is placed in the public
// domain. The author hereby disclaims copyright to this source code.
// Note - The x86 and x64 versions do _not_ produce the same results, as the
// algorithms are optimized for their respective platforms. You can still
// compile and run any of them on any platform, but your performance with the
// non-native version will be less than optimal.
#include "murmur3.h"
//-----------------------------------------------------------------------------
// Platform-specific functions and macros
#ifdef __GNUC__
#define FORCE_INLINE __attribute__((always_inline)) inline
#else
#define FORCE_INLINE
#endif
/* Rotate the 32-bit value x left by r bits. */
static inline FORCE_INLINE uint32_t rotl32 ( uint32_t x, int8_t r )
{
  const uint32_t upper = x << r;
  const uint32_t lower = x >> (32 - r);
  return upper | lower;
}
/* Rotate the 64-bit value x left by r bits. */
static inline FORCE_INLINE uint64_t rotl64 ( uint64_t x, int8_t r )
{
  const uint64_t upper = x << r;
  const uint64_t lower = x >> (64 - r);
  return upper | lower;
}
#define ROTL32(x,y) rotl32(x,y)
#define ROTL64(x,y) rotl64(x,y)
#define BIG_CONSTANT(x) (x##LLU)
//-----------------------------------------------------------------------------
// Block read - if your platform needs to do endian-swapping or can only
// handle aligned reads, do the conversion here
#define getblock(p, i) (p[i])
//-----------------------------------------------------------------------------
// Finalization mix - force all bits of a hash block to avalanche
/* 32-bit finalization mix: alternating xor-shift and multiply steps. */
static inline FORCE_INLINE uint32_t fmix32 ( uint32_t h )
{
  h = (h ^ (h >> 16)) * 0x85ebca6b;
  h = (h ^ (h >> 13)) * 0xc2b2ae35;
  return h ^ (h >> 16);
}
//----------
/* 64-bit finalization mix: alternating xor-shift and multiply steps. */
static inline FORCE_INLINE uint64_t fmix64 ( uint64_t k )
{
  k = (k ^ (k >> 33)) * BIG_CONSTANT(0xff51afd7ed558ccd);
  k = (k ^ (k >> 33)) * BIG_CONSTANT(0xc4ceb9fe1a85ec53);
  return k ^ (k >> 33);
}
//-----------------------------------------------------------------------------
// Computes the 32-bit MurmurHash3 (x86 variant) of `len` bytes at `key`
// using `seed`, and stores the 4-byte result at `out`.
void MurmurHash3_x86_32 ( const void * key, int len,
                          uint32_t seed, void * out )
{
  const uint8_t * data = (const uint8_t*)key;
  const int nblocks = len / 4;
  int i;
  uint32_t h1 = seed;
  uint32_t c1 = 0xcc9e2d51;
  uint32_t c2 = 0x1b873593;
  //----------
  // body
  // `blocks` points just past the last full 4-byte block; the loop indexes it
  // with negative offsets, running from -nblocks up to -1.
  const uint32_t * blocks = (const uint32_t *)(data + nblocks*4);
  for(i = -nblocks; i; i++)
  {
    uint32_t k1 = getblock(blocks,i);
    k1 *= c1;
    k1 = ROTL32(k1,15);
    k1 *= c2;
    h1 ^= k1;
    h1 = ROTL32(h1,13);
    h1 = h1*5+0xe6546b64;
  }
  //----------
  // tail
  // The remaining 0..3 bytes are folded into k1; the cases below fall
  // through on purpose so that each one also handles the shorter tails.
  const uint8_t * tail = (const uint8_t*)(data + nblocks*4);
  uint32_t k1 = 0;
  switch(len & 3)
  {
  case 3: k1 ^= tail[2] << 16; /* fall through */
  case 2: k1 ^= tail[1] << 8; /* fall through */
  case 1: k1 ^= tail[0];
          k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1;
  };
  //----------
  // finalization
  h1 ^= len;
  h1 = fmix32(h1);
  *(uint32_t*)out = h1;
}
//-----------------------------------------------------------------------------
// Computes the 128-bit MurmurHash3 (x86 variant) of `len` bytes at `key`
// using `seed`, and stores the result as four 32-bit words at `out`.
void MurmurHash3_x86_128 ( const void * key, const int len,
                           uint32_t seed, void * out )
{
  const uint8_t * data = (const uint8_t*)key;
  const int nblocks = len / 16;
  int i;
  uint32_t h1 = seed;
  uint32_t h2 = seed;
  uint32_t h3 = seed;
  uint32_t h4 = seed;
  uint32_t c1 = 0x239b961b;
  uint32_t c2 = 0xab0e9789;
  uint32_t c3 = 0x38b34ae5;
  uint32_t c4 = 0xa1e38b93;
  //----------
  // body
  // `blocks` points just past the last full 16-byte block; the loop indexes
  // it with negative offsets, running from -nblocks up to -1.
  const uint32_t * blocks = (const uint32_t *)(data + nblocks*16);
  for(i = -nblocks; i; i++)
  {
    uint32_t k1 = getblock(blocks,i*4+0);
    uint32_t k2 = getblock(blocks,i*4+1);
    uint32_t k3 = getblock(blocks,i*4+2);
    uint32_t k4 = getblock(blocks,i*4+3);
    k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1;
    h1 = ROTL32(h1,19); h1 += h2; h1 = h1*5+0x561ccd1b;
    k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2;
    h2 = ROTL32(h2,17); h2 += h3; h2 = h2*5+0x0bcaa747;
    k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3;
    h3 = ROTL32(h3,15); h3 += h4; h3 = h3*5+0x96cd1c35;
    k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4;
    h4 = ROTL32(h4,13); h4 += h1; h4 = h4*5+0x32ac3b17;
  }
  //----------
  // tail
  // The remaining 0..15 bytes are folded into k1..k4; the cases fall through
  // on purpose. Each byte is widened to uint32_t before shifting: a plain
  // uint8_t is promoted to (signed) int, and shifting a byte >= 0x80 left by
  // 24 would overflow int, which is undefined behaviour.
  const uint8_t * tail = (const uint8_t*)(data + nblocks*16);
  uint32_t k1 = 0;
  uint32_t k2 = 0;
  uint32_t k3 = 0;
  uint32_t k4 = 0;
  switch(len & 15)
  {
  case 15: k4 ^= (uint32_t)tail[14] << 16; /* fall through */
  case 14: k4 ^= (uint32_t)tail[13] << 8; /* fall through */
  case 13: k4 ^= (uint32_t)tail[12] << 0;
           k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4;
           /* fall through */
  case 12: k3 ^= (uint32_t)tail[11] << 24; /* fall through */
  case 11: k3 ^= (uint32_t)tail[10] << 16; /* fall through */
  case 10: k3 ^= (uint32_t)tail[ 9] << 8; /* fall through */
  case  9: k3 ^= (uint32_t)tail[ 8] << 0;
           k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3;
           /* fall through */
  case  8: k2 ^= (uint32_t)tail[ 7] << 24; /* fall through */
  case  7: k2 ^= (uint32_t)tail[ 6] << 16; /* fall through */
  case  6: k2 ^= (uint32_t)tail[ 5] << 8; /* fall through */
  case  5: k2 ^= (uint32_t)tail[ 4] << 0;
           k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2;
           /* fall through */
  case  4: k1 ^= (uint32_t)tail[ 3] << 24; /* fall through */
  case  3: k1 ^= (uint32_t)tail[ 2] << 16; /* fall through */
  case  2: k1 ^= (uint32_t)tail[ 1] << 8; /* fall through */
  case  1: k1 ^= (uint32_t)tail[ 0] << 0;
           k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1;
  };
  //----------
  // finalization
  h1 ^= len; h2 ^= len; h3 ^= len; h4 ^= len;
  h1 += h2; h1 += h3; h1 += h4;
  h2 += h1; h3 += h1; h4 += h1;
  h1 = fmix32(h1);
  h2 = fmix32(h2);
  h3 = fmix32(h3);
  h4 = fmix32(h4);
  h1 += h2; h1 += h3; h1 += h4;
  h2 += h1; h3 += h1; h4 += h1;
  ((uint32_t*)out)[0] = h1;
  ((uint32_t*)out)[1] = h2;
  ((uint32_t*)out)[2] = h3;
  ((uint32_t*)out)[3] = h4;
}
//-----------------------------------------------------------------------------
// Computes the 128-bit MurmurHash3 (x64 variant) of `len` bytes at `key`
// using `seed`, and stores the result as two 64-bit words at `out`
// (so `out` must have room for 16 bytes).
void MurmurHash3_x64_128 ( const void * key, const int len,
                           const uint32_t seed, void * out )
{
  const uint8_t * data = (const uint8_t*)key;
  const int nblocks = len / 16;
  int i;
  uint64_t h1 = seed;
  uint64_t h2 = seed;
  uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5);
  uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f);
  //----------
  // body
  // Full 16-byte blocks are read as pairs of 64-bit words from the start of
  // the input. NOTE(review): getblock is a plain p[i] read in this file, so
  // this relies on the platform tolerating the access at `key`'s alignment.
  const uint64_t * blocks = (const uint64_t *)(data);
  for(i = 0; i < nblocks; i++)
  {
    uint64_t k1 = getblock(blocks,i*2+0);
    uint64_t k2 = getblock(blocks,i*2+1);
    k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1;
    h1 = ROTL64(h1,27); h1 += h2; h1 = h1*5+0x52dce729;
    k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2;
    h2 = ROTL64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5;
  }
  //----------
  // tail
  // The remaining 0..15 bytes are folded into k1/k2; the cases below fall
  // through on purpose so that each one also handles the shorter tails.
  const uint8_t * tail = (const uint8_t*)(data + nblocks*16);
  uint64_t k1 = 0;
  uint64_t k2 = 0;
  switch(len & 15)
  {
  case 15: k2 ^= (uint64_t)(tail[14]) << 48; /* fall through */
  case 14: k2 ^= (uint64_t)(tail[13]) << 40; /* fall through */
  case 13: k2 ^= (uint64_t)(tail[12]) << 32; /* fall through */
  case 12: k2 ^= (uint64_t)(tail[11]) << 24; /* fall through */
  case 11: k2 ^= (uint64_t)(tail[10]) << 16; /* fall through */
  case 10: k2 ^= (uint64_t)(tail[ 9]) << 8; /* fall through */
  case 9: k2 ^= (uint64_t)(tail[ 8]) << 0;
          k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2;
          /* fall through */
  case 8: k1 ^= (uint64_t)(tail[ 7]) << 56; /* fall through */
  case 7: k1 ^= (uint64_t)(tail[ 6]) << 48; /* fall through */
  case 6: k1 ^= (uint64_t)(tail[ 5]) << 40; /* fall through */
  case 5: k1 ^= (uint64_t)(tail[ 4]) << 32; /* fall through */
  case 4: k1 ^= (uint64_t)(tail[ 3]) << 24; /* fall through */
  case 3: k1 ^= (uint64_t)(tail[ 2]) << 16; /* fall through */
  case 2: k1 ^= (uint64_t)(tail[ 1]) << 8; /* fall through */
  case 1: k1 ^= (uint64_t)(tail[ 0]) << 0;
          k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1;
  };
  //----------
  // finalization
  h1 ^= len; h2 ^= len;
  h1 += h2;
  h2 += h1;
  h1 = fmix64(h1);
  h2 = fmix64(h2);
  h1 += h2;
  h2 += h1;
  ((uint64_t*)out)[0] = h1;
  ((uint64_t*)out)[1] = h2;
}
//-----------------------------------------------------------------------------

View File

@ -1,21 +0,0 @@
//-----------------------------------------------------------------------------
// MurmurHash3 was written by Austin Appleby, and is placed in the
// public domain. The author hereby disclaims copyright to this source
// code.
#ifndef _MURMURHASH3_H_
#define _MURMURHASH3_H_
#include <stdint.h>
//-----------------------------------------------------------------------------
void MurmurHash3_x86_32 (const void *key, int len, uint32_t seed, void *out);
void MurmurHash3_x86_128(const void *key, int len, uint32_t seed, void *out);
void MurmurHash3_x64_128(const void *key, int len, uint32_t seed, void *out);
//-----------------------------------------------------------------------------
#endif // _MURMURHASH3_H_

View File

@ -1,103 +0,0 @@
#
# ### Probability table declaration, in private/ for readability ###
# Table for k hashes from 1..12 from http://pages.cs.wisc.edu/~cao/papers/summary-cache/node8.html
# Iterate along the sequence at position [k] until the error rate is < specified, otherwise
# raise an error.
#
type
  # Error rates for one value of k; consumers index it by m/n (bits per
  # element).
  TErrorForK = seq[float]
  # One error-rate sequence per k in 0..12.
  TAllErrorRates* = array[0..12, TErrorForK]
# The table itself; each row is assigned below.
var kErrors*: TAllErrorRates
kErrors[0] = @[1.0]
kErrors[1] = @[1.0, 1.0,
0.3930000000, 0.2830000000, 0.2210000000, 0.1810000000, 0.1540000000,
0.1330000000, 0.1180000000, 0.1050000000, 0.0952000000, 0.0869000000,
0.0800000000, 0.0740000000, 0.0689000000, 0.0645000000, 0.0606000000,
0.0571000000, 0.0540000000, 0.0513000000, 0.0488000000, 0.0465000000,
0.0444000000, 0.0425000000, 0.0408000000, 0.0392000000, 0.0377000000,
0.0364000000, 0.0351000000, 0.0339000000, 0.0328000000, 0.0317000000,
0.0308000000 ]
kErrors[2] = @[1.0, 1.0,
0.4000000000, 0.2370000000, 0.1550000000, 0.1090000000, 0.0804000000,
0.0618000000, 0.0489000000, 0.0397000000, 0.0329000000, 0.0276000000,
0.0236000000, 0.0203000000, 0.0177000000, 0.0156000000, 0.0138000000,
0.0123000000, 0.0111000000, 0.0099800000, 0.0090600000, 0.0082500000,
0.0075500000, 0.0069400000, 0.0063900000, 0.0059100000, 0.0054800000,
0.0051000000, 0.0047500000, 0.0044400000, 0.0041600000, 0.0039000000,
0.0036700000 ]
kErrors[3] = @[1.0, 1.0, 1.0,
0.2530000000, 0.1470000000, 0.0920000000, 0.0609000000, 0.0423000000,
0.0306000000, 0.0228000000, 0.0174000000, 0.0136000000, 0.0108000000,
0.0087500000, 0.0071800000, 0.0059600000, 0.0050000000, 0.0042300000,
0.0036200000, 0.0031200000, 0.0027000000, 0.0023600000, 0.0020700000,
0.0018300000, 0.0016200000, 0.0014500000, 0.0012900000, 0.0011600000,
0.0010500000, 0.0009490000, 0.0008620000, 0.0007850000, 0.0007170000 ]
kErrors[4] = @[1.0, 1.0, 1.0, 1.0,
0.1600000000, 0.0920000000, 0.0561000000, 0.0359000000, 0.0240000000,
0.0166000000, 0.0118000000, 0.0086400000, 0.0064600000, 0.0049200000,
0.0038100000, 0.0030000000, 0.0023900000, 0.0019300000, 0.0015800000,
0.0013000000, 0.0010800000, 0.0009050000, 0.0007640000, 0.0006490000,
0.0005550000, 0.0004780000, 0.0004130000, 0.0003590000, 0.0003140000,
0.0002760000, 0.0002430000, 0.0002150000, 0.0001910000 ]
kErrors[5] = @[1.0, 1.0, 1.0, 1.0, 1.0,
0.1010000000, 0.0578000000, 0.0347000000, 0.0217000000, 0.0141000000,
0.0094300000, 0.0065000000, 0.0045900000, 0.0033200000, 0.0024400000,
0.0018300000, 0.0013900000, 0.0010700000, 0.0008390000, 0.0006630000,
0.0005300000, 0.0004270000, 0.0003470000, 0.0002850000, 0.0002350000,
0.0001960000, 0.0001640000, 0.0001380000, 0.0001170000, 0.0000996000,
0.0000853000, 0.0000733000, 0.0000633000 ]
kErrors[6] = @[1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
0.0638000000, 0.0364000000, 0.0216000000, 0.0133000000, 0.0084400000,
0.0055200000, 0.0037100000, 0.0025500000, 0.0017900000, 0.0012800000,
0.0009350000, 0.0006920000, 0.0005190000, 0.0003940000, 0.0003030000,
0.0002360000, 0.0001850000, 0.0001470000, 0.0001170000, 0.0000944000,
0.0000766000, 0.0000626000, 0.0000515000, 0.0000426000, 0.0000355000,
0.0000297000, 0.0000250000 ]
kErrors[7] = @[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
0.0229000000, 0.0135000000, 0.0081900000, 0.0051300000, 0.0032900000,
0.0021700000, 0.0014600000, 0.0010000000, 0.0007020000, 0.0004990000,
0.0003600000, 0.0002640000, 0.0001960000, 0.0001470000, 0.0001120000,
0.0000856000, 0.0000663000, 0.0000518000, 0.0000408000, 0.0000324000,
0.0000259000, 0.0000209000, 0.0000169000, 0.0000138000, 0.0000113000 ]
kErrors[8] = @[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
0.0145000000, 0.0084600000, 0.0050900000, 0.0031400000, 0.0019900000,
0.0012900000, 0.0008520000, 0.0005740000, 0.0003940000, 0.0002750000,
0.0001940000, 0.0001400000, 0.0001010000, 0.0000746000, 0.0000555000,
0.0000417000, 0.0000316000, 0.0000242000, 0.0000187000, 0.0000146000,
0.0000114000, 0.0000090100, 0.0000071600, 0.0000057300 ]
kErrors[9] = @[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
0.0053100000, 0.0031700000, 0.0019400000, 0.0012100000, 0.0007750000,
0.0005050000, 0.0003350000, 0.0002260000, 0.0001550000, 0.0001080000,
0.0000759000, 0.0000542000, 0.0000392000, 0.0000286000, 0.0000211000,
0.0000157000, 0.0000118000, 0.0000089600, 0.0000068500, 0.0000052800,
0.0000041000, 0.0000032000]
kErrors[10] = @[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
0.0033400000, 0.0019800000, 0.0012000000, 0.0007440000, 0.0004700000,
0.0003020000, 0.0001980000, 0.0001320000, 0.0000889000, 0.0000609000,
0.0000423000, 0.0000297000, 0.0000211000, 0.0000152000, 0.0000110000,
0.0000080700, 0.0000059700, 0.0000044500, 0.0000033500, 0.0000025400,
0.0000019400]
kErrors[11] = @[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
0.0021000000, 0.0012400000, 0.0007470000, 0.0004590000, 0.0002870000,
0.0001830000, 0.0001180000, 0.0000777000, 0.0000518000, 0.0000350000,
0.0000240000, 0.0000166000, 0.0000116000, 0.0000082300, 0.0000058900,
0.0000042500, 0.0000031000, 0.0000022800, 0.0000016900, 0.0000012600]
kErrors[12] = @[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
0.0007780000, 0.0004660000, 0.0002840000, 0.0001760000, 0.0001110000,
0.0000712000, 0.0000463000, 0.0000305000, 0.0000204000, 0.0000138000,
0.0000094200, 0.0000065200, 0.0000045600, 0.0000032200, 0.0000022900,
0.0000016500, 0.0000012000, 0.0000008740]

View File

@ -1 +0,0 @@
switch("path", "$projectDir/../src")

View File

@ -1,102 +0,0 @@
import unittest
include bloom
from random import rand, randomize
import times
suite "murmur":
  # Test murmurhash 3
  setup:
    # Hash "hello" with seed 0 through the raw binding before every test.
    var hashOutputs: MurmurHashes = [0, 0]
    rawMurmurHash("hello", 5, 0, hashOutputs)

  test "raw":
    # Correct murmur outputs (cast to int64)
    check:
      int(hashOutputs[0]) == -3758069500696749310
      int(hashOutputs[1]) == 6565844092913065241

  test "wrapped":
    # The string wrapper must agree with the raw binding.
    let wrapped = murmurHash("hello", 0)
    check:
      wrapped[0] == hashOutputs[0]
      wrapped[1] == hashOutputs[1]

  test "seed":
    # A different seed must change both output words.
    let reseeded = murmurHash("hello", 10)
    check:
      reseeded[0] != hashOutputs[0]
      reseeded[1] != hashOutputs[1]
suite "bloom":
  setup:
    # Runs before every test: builds a filter for 100000 elements and fills
    # it with that many random 8-character strings.
    let nElementsToTest = 100000
    var bf = initializeBloomFilter(capacity = nElementsToTest, errorRate = 0.001)
    randomize(2882) # Seed the RNG
    let sampleChars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
    var kTestElements = newSeq[string](nElementsToTest)
    for i in 0..<nElementsToTest:
      var newString = ""
      for j in 0..7:
        # rand(51) draws from the first 52 characters (the letters) only.
        newString.add(sampleChars[rand(51)])
      kTestElements[i] = newString
    for i in 0..<nElementsToTest:
      bf.insert(kTestElements[i])

  test "params":
    check(bf.capacity == nElementsToTest)
    check(bf.errorRate == 0.001)
    check(bf.kHashes == 10)
    check(bf.nBitsPerElem == 15)
    check(bf.mBits == 15 * nElementsToTest)
    check(bf.useMurmurHash == true)

  test "not hit":
    check(bf.lookup("nothing") == false)

  test "hit":
    bf.insert("hit")
    check(bf.lookup("hit") == true)

  test "force params":
    var bf2 = initializeBloomFilter(10000, 0.001, k = 4, forceNBitsPerElem = 20)
    check(bf2.capacity == 10000)
    check(bf2.errorRate == 0.001)
    check(bf2.kHashes == 4)
    check(bf2.nBitsPerElem == 20)
    check(bf2.mBits == 200000)
    check(bf2.useMurmurHash == true)

  test "error rate":
    var falsePositives = 0
    for i in 0..<nElementsToTest:
      var falsePositiveString = ""
      for j in 0..8: # By definition not in bf as 9 chars not 8
        falsePositiveString.add(sampleChars[rand(51)])
      if bf.lookup(falsePositiveString):
        falsePositives += 1
    check falsePositives / nElementsToTest < bf.errorRate

  test "lookup errors":
    var lookupErrors = 0
    for i in 0..<nElementsToTest:
      if not bf.lookup(kTestElements[i]):
        lookupErrors += 1
    check lookupErrors == 0

  # Finally test correct k / mOverN specification,
  test "k/(m/n) spec":
    expect(BloomFilterError):
      discard getMOverNBitsForK(k = 2, targetError = 0.00001)
    check getMOverNBitsForK(k = 2, targetError = 0.1) == 6
    check getMOverNBitsForK(k = 7, targetError = 0.01) == 10
    check getMOverNBitsForK(k = 7, targetError = 0.001) == 16
    var bf3 = initializeBloomFilter(1000, 0.01, k = 4)
    check bf3.nBitsPerElem == 11

View File

@ -12,4 +12,5 @@ requires "libp2p"
# Tasks
task test, "Run the test suite":
exec "nim c -r tests/test_bloom.nim"
exec "nim c -r tests/test_reliability.nim"

123
src/bloom.nim Normal file
View File

@ -0,0 +1,123 @@
from math import ceil, ln, pow, round
import hashes
import strutils
import results
import private/probabilities
type
  BloomFilter* = object
    capacity*: int      ## Capacity value passed to ``initializeBloomFilter``.
    errorRate*: float   ## Error rate passed to ``initializeBloomFilter``.
    kHashes*: int       ## Number of hash positions computed per item.
    mBits*: int         ## Addressable bits: capacity times bits per element.
    intArray*: seq[int] ## Bit storage, one machine ``int`` per word.
{.push overflowChecks: off.} # Turn off overflow checks for hashing operations
proc hashN(item: string, n: int, maxValue: int): int =
  ## Returns the ``n``-th hash of ``item`` in ``0 ..< maxValue``, built from
  ## Nim's built-in ``hash`` with the double hashing technique from
  ## Kirsch and Mitzenmacher, 2008:
  ## http://www.eecs.harvard.edu/~kirsch/pubs/bbbf/rsa.pdf
  # abs() folds negative hash values into the non-negative range.
  let primary = abs(hash(item)) mod maxValue
  # The second hash comes from the item with a " b" suffix appended. A faster
  # alternative is to derive it by rotating the first hash left by 21 bits
  # instead of hashing a concatenated string, at the cost of a higher
  # false-positive rate.
  let secondary = abs(hash(item & " b")) mod maxValue
  result = abs(primary + n * secondary) mod maxValue
{.pop.}
proc getMOverNBitsForK*(k: int, targetError: float,
                        probabilityTable = kErrors): Result[int, string] =
  ## Looks up the smallest m/n (bits per element) whose tabulated error rate
  ## for ``k`` hash functions is strictly below ``targetError``.
  ##
  ## Returns an error string when ``k`` is outside ``0..12`` or when no table
  ## entry for ``k`` is below ``targetError``.
  if k < 0 or k > 12:
    return err("K must be <= 12 if forceNBitsPerElem is not also specified.")
  # Entries 0 and 1 are never considered; the scan starts at m/n = 2.
  for bitsPerElem in 2..probabilityTable[k].high:
    let tabulatedError = probabilityTable[k][bitsPerElem]
    if tabulatedError < targetError:
      return ok(bitsPerElem)
  return err("Specified value of k and error rate not achievable using less than 4 bytes / element.")
proc initializeBloomFilter*(capacity: int, errorRate: float, k = 0,
                            forceNBitsPerElem = 0): Result[BloomFilter, string] =
  ## Initializes a Bloom filter with specified parameters.
  ##
  ## Parameters:
  ## - capacity: Expected number of elements to be inserted; must be >= 1
  ## - errorRate: Desired false positive rate (e.g., 0.01 for 1%); must be
  ##   strictly between 0 and 1 when it is used to derive the parameters
  ## - k: Optional number of hash functions. If 0, calculated optimally
  ##   See http://pages.cs.wisc.edu/~cao/papers/summary-cache/node8.html for
  ##   useful tables on k and m/n (n bits per element) combinations.
  ## - forceNBitsPerElem: Optional override for bits per element; only
  ##   honoured when ``k`` is >= 1
  ##
  ## Returns the filter, or an error string when the parameters are invalid
  ## or cannot be satisfied.
  if capacity < 1:
    # A zero-bit filter would make every later hash reduction divide by zero.
    return err("capacity must be at least 1.")
  var
    kHashes: int
    nBitsPerElem: int
  if k < 1: # Calculate optimal k and use that
    # Written as a negated conjunction so that NaN is rejected as well.
    if not (errorRate > 0.0 and errorRate < 1.0):
      return err("errorRate must be greater than 0 and less than 1.")
    let bitsPerElem = ceil(-1.0 * (ln(errorRate) / (pow(ln(2.float), 2))))
    kHashes = round(ln(2.float) * bitsPerElem).int
    nBitsPerElem = round(bitsPerElem).int
  else: # Use specified k if possible
    if forceNBitsPerElem < 1: # Use lookup table
      let mOverNRes = getMOverNBitsForK(k = k, targetError = errorRate)
      if mOverNRes.isErr:
        return err(mOverNRes.error)
      nBitsPerElem = mOverNRes.value
    else:
      nBitsPerElem = forceNBitsPerElem
    kHashes = k
  # Report an oversized request as an error instead of overflowing below.
  if capacity > high(int) div nBitsPerElem:
    return err("capacity is too large for the required number of bits per element.")
  let
    mBits = capacity * nBitsPerElem
    # One extra word so that every bit index below mBits has a backing int.
    mInts = 1 + mBits div (sizeof(int) * 8)
  ok(BloomFilter(
    capacity: capacity,
    errorRate: errorRate,
    kHashes: kHashes,
    mBits: mBits,
    intArray: newSeq[int](mInts)
  ))
proc `$`*(bf: BloomFilter): string =
  ## Renders a one-line summary of the filter's configuration: capacity,
  ## error rate (scientific notation, one decimal), number of hash
  ## functions, and `mBits div capacity`.
  let
    capacityText = $bf.capacity
    errorText = formatFloat(bf.errorRate, format = ffScientific, precision = 1)
    hashesText = $bf.kHashes
    bitsText = $(bf.mBits div bf.capacity)
  "Bloom filter with $1 capacity, $2 error rate, $3 hash functions, and requiring $4 bits of memory." %
    [capacityText, errorText, hashesText, bitsText]
proc computeHashes(bf: BloomFilter, item: string): seq[int] =
  ## Collects `bf.kHashes` values for `item`, the i-th one being
  ## `hashN(item, i, bf.mBits)`.
  result = newSeqOfCap[int](bf.kHashes)
  for idx in 0..<bf.kHashes:
    result.add hashN(item, idx, bf.mBits)
proc insert*(bf: var BloomFilter, item: string) =
  ## Adds `item` (a string) to the filter by setting, in `bf.intArray`, the
  ## bit at every position returned by `computeHashes`.
  const bitsPerWord = sizeof(int) * 8
  for bitPos in bf.computeHashes(item):
    let
      wordIdx = bitPos div bitsPerWord
      mask = 1 shl (bitPos mod bitsPerWord)
    bf.intArray[wordIdx] = bf.intArray[wordIdx] or mask
proc lookup*(bf: BloomFilter, item: string): bool =
  ## Tests whether `item` (a string) may have been inserted.
  ##
  ## Returns ``false`` as soon as one of the item's bit positions is unset,
  ## so an item that was inserted always yields ``true``. An item that was
  ## never inserted can still yield ``true`` (a false positive); the filter
  ## is sized with ``bf.errorRate`` as the target for how often that happens.
  const bitsPerWord = sizeof(int) * 8
  for bitPos in bf.computeHashes(item):
    let word = bf.intArray[bitPos div bitsPerWord]
    if (word and (1 shl (bitPos mod bitsPerWord))) == 0:
      return false
  true

View File

@ -1,5 +1,5 @@
import std/[times, locks]
import "../nim-bloom/src/bloom"
import ./bloom
type
MessageID* = string

View File

@ -0,0 +1,100 @@
#
# ### Probability table declaration, in private/ for readability ###
# Table for k hashes from 1..12 from http://pages.cs.wisc.edu/~cao/papers/summary-cache/node8.html
# Iterate along the sequence at position [k] until the error rate is below the one specified;
# if no entry qualifies, the lookup reports an error.
#
# Types and storage for the false-positive-rate lookup table.
type
# One row of error rates for a fixed number of hash functions k.
# NOTE(review): the position within a row appears to be the m/n ratio
# (bits per element) — confirm against the code that indexes the table.
TErrorForK = seq[float]
# One row per k in 0..12.
TAllErrorRates* = array[0..12, TErrorForK]
# Declared thread-local, then filled by the module-level assignment that
# follows this declaration.
# NOTE(review): a thread other than the one executing that assignment would
# presumably see empty rows here — confirm before using the table from
# worker threads.
var kErrors* {.threadvar.}: TAllErrorRates
kErrors = [
@[1.0],
@[1.0, 1.0, 0.3930000000, 0.2830000000, 0.2210000000, 0.1810000000,
0.1540000000, 0.1330000000, 0.1180000000, 0.1050000000, 0.0952000000,
0.0869000000, 0.0800000000, 0.0740000000, 0.0689000000, 0.0645000000,
0.0606000000, 0.0571000000, 0.0540000000, 0.0513000000, 0.0488000000,
0.0465000000, 0.0444000000, 0.0425000000, 0.0408000000, 0.0392000000,
0.0377000000, 0.0364000000, 0.0351000000, 0.0339000000, 0.0328000000,
0.0317000000, 0.0308000000],
@[1.0, 1.0, 0.4000000000, 0.2370000000, 0.1550000000, 0.1090000000,
0.0804000000, 0.0618000000, 0.0489000000, 0.0397000000, 0.0329000000,
0.0276000000, 0.0236000000, 0.0203000000, 0.0177000000, 0.0156000000,
0.0138000000, 0.0123000000, 0.0111000000, 0.0099800000, 0.0090600000,
0.0082500000, 0.0075500000, 0.0069400000, 0.0063900000, 0.0059100000,
0.0054800000, 0.0051000000, 0.0047500000, 0.0044400000, 0.0041600000,
0.0039000000, 0.0036700000],
@[1.0, 1.0, 1.0, 0.2530000000, 0.1470000000, 0.0920000000, 0.0609000000,
0.0423000000, 0.0306000000, 0.0228000000, 0.0174000000, 0.0136000000,
0.0108000000, 0.0087500000, 0.0071800000, 0.0059600000, 0.0050000000,
0.0042300000, 0.0036200000, 0.0031200000, 0.0027000000, 0.0023600000,
0.0020700000, 0.0018300000, 0.0016200000, 0.0014500000, 0.0012900000,
0.0011600000, 0.0010500000, 0.0009490000, 0.0008620000, 0.0007850000,
0.0007170000],
@[1.0, 1.0, 1.0, 1.0, 0.1600000000, 0.0920000000, 0.0561000000, 0.0359000000,
0.0240000000, 0.0166000000, 0.0118000000, 0.0086400000, 0.0064600000,
0.0049200000, 0.0038100000, 0.0030000000, 0.0023900000, 0.0019300000,
0.0015800000, 0.0013000000, 0.0010800000, 0.0009050000, 0.0007640000,
0.0006490000, 0.0005550000, 0.0004780000, 0.0004130000, 0.0003590000,
0.0003140000, 0.0002760000, 0.0002430000, 0.0002150000, 0.0001910000],
@[1.0, 1.0, 1.0, 1.0, 1.0, 0.1010000000, 0.0578000000, 0.0347000000,
0.0217000000, 0.0141000000, 0.0094300000, 0.0065000000, 0.0045900000,
0.0033200000, 0.0024400000, 0.0018300000, 0.0013900000, 0.0010700000,
0.0008390000, 0.0006630000, 0.0005300000, 0.0004270000, 0.0003470000,
0.0002850000, 0.0002350000, 0.0001960000, 0.0001640000, 0.0001380000,
0.0001170000, 0.0000996000, 0.0000853000, 0.0000733000, 0.0000633000],
@[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0638000000, 0.0364000000, 0.0216000000,
0.0133000000, 0.0084400000, 0.0055200000, 0.0037100000, 0.0025500000,
0.0017900000, 0.0012800000, 0.0009350000, 0.0006920000, 0.0005190000,
0.0003940000, 0.0003030000, 0.0002360000, 0.0001850000, 0.0001470000,
0.0001170000, 0.0000944000, 0.0000766000, 0.0000626000, 0.0000515000,
0.0000426000, 0.0000355000, 0.0000297000, 0.0000250000],
@[1.0, 1.0, 1.0,
1.0, 1.0, 1.0, 1.0, 1.0, 0.0229000000, 0.0135000000, 0.0081900000,
0.0051300000, 0.0032900000, 0.0021700000, 0.0014600000, 0.0010000000,
0.0007020000, 0.0004990000, 0.0003600000, 0.0002640000, 0.0001960000,
0.0001470000, 0.0001120000, 0.0000856000, 0.0000663000, 0.0000518000,
0.0000408000, 0.0000324000, 0.0000259000, 0.0000209000, 0.0000169000,
0.0000138000, 0.0000113000],
@[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
1.0, 0.0145000000, 0.0084600000, 0.0050900000, 0.0031400000, 0.0019900000,
0.0012900000, 0.0008520000, 0.0005740000, 0.0003940000, 0.0002750000,
0.0001940000, 0.0001400000, 0.0001010000, 0.0000746000, 0.0000555000,
0.0000417000, 0.0000316000, 0.0000242000, 0.0000187000, 0.0000146000,
0.0000114000, 0.0000090100, 0.0000071600, 0.0000057300],
@[1.0, 1.0, 1.0,
1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0053100000, 0.0031700000,
0.0019400000, 0.0012100000, 0.0007750000, 0.0005050000, 0.0003350000,
0.0002260000, 0.0001550000, 0.0001080000, 0.0000759000, 0.0000542000,
0.0000392000, 0.0000286000, 0.0000211000, 0.0000157000, 0.0000118000,
0.0000089600, 0.0000068500, 0.0000052800, 0.0000041000, 0.0000032000],
@[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0033400000,
0.0019800000, 0.0012000000, 0.0007440000, 0.0004700000, 0.0003020000,
0.0001980000, 0.0001320000, 0.0000889000, 0.0000609000, 0.0000423000,
0.0000297000, 0.0000211000, 0.0000152000, 0.0000110000, 0.0000080700,
0.0000059700, 0.0000044500, 0.0000033500, 0.0000025400, 0.0000019400],
@[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
0.0021000000, 0.0012400000, 0.0007470000, 0.0004590000, 0.0002870000,
0.0001830000, 0.0001180000, 0.0000777000, 0.0000518000, 0.0000350000,
0.0000240000, 0.0000166000, 0.0000116000, 0.0000082300, 0.0000058900,
0.0000042500, 0.0000031000, 0.0000022800, 0.0000016900, 0.0000012600],
@[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
0.0007780000, 0.0004660000, 0.0002840000, 0.0001760000, 0.0001110000,
0.0000712000, 0.0000463000, 0.0000305000, 0.0000204000, 0.0000138000,
0.0000094200, 0.0000065200, 0.0000045600, 0.0000032200, 0.0000022900,
0.0000016500, 0.0000012000, 0.0000008740]
]

View File

@ -1,8 +1,8 @@
import ./protobufutil
import ./common
import ./bloom
import libp2p/protobuf/minprotobuf
import std/options
import "../nim-bloom/src/bloom"
proc toBytes(s: string): seq[byte] =
result = newSeq[byte](s.len)

View File

@ -327,20 +327,24 @@ proc checkUnacknowledgedMessages*(rm: ReliabilityManager) {.raises: [].} =
var newOutgoingBuffer: seq[UnacknowledgedMessage] = @[]
try:
for msg in rm.outgoingBuffer:
if (now - msg.sendTime) < rm.config.resendInterval:
newOutgoingBuffer.add(msg)
elif msg.resendAttempts < rm.config.maxResendAttempts:
var updatedMsg = msg
updatedMsg.resendAttempts += 1
updatedMsg.sendTime = now
newOutgoingBuffer.add(updatedMsg)
elif rm.onMessageSent != nil:
rm.onMessageSent(msg.message.messageId)
for unackMsg in rm.outgoingBuffer:
let elapsed = now - unackMsg.sendTime
if elapsed > rm.config.resendInterval:
# Time to attempt resend
if unackMsg.resendAttempts < rm.config.maxResendAttempts:
var updatedMsg = unackMsg
updatedMsg.resendAttempts += 1
updatedMsg.sendTime = now
newOutgoingBuffer.add(updatedMsg)
else:
if rm.onMessageSent != nil:
rm.onMessageSent(unackMsg.message.messageId)
else:
newOutgoingBuffer.add(unackMsg)
rm.outgoingBuffer = newOutgoingBuffer
except:
discard
except Exception as e:
logError("Error in checking unacknowledged messages: " & e.msg)
proc periodicBufferSweep(rm: ReliabilityManager) {.async: (raises: [CancelledError]).} =
## Periodically sweeps the buffer to clean up and check unacknowledged messages.
@ -351,7 +355,8 @@ proc periodicBufferSweep(rm: ReliabilityManager) {.async: (raises: [CancelledErr
rm.cleanBloomFilter()
except Exception as e:
logError("Error in periodic buffer sweep: " & e.msg)
await sleepAsync(chronos.seconds(rm.config.bufferSweepInterval.inSeconds))
await sleepAsync(chronos.milliseconds(rm.config.bufferSweepInterval.inMilliseconds))
proc periodicSyncMessage(rm: ReliabilityManager) {.async: (raises: [CancelledError]).} =
## Periodically notifies to send a sync message to maintain connectivity.

View File

@ -1,6 +1,6 @@
import std/[times, locks]
import chronos, chronicles
import "../nim-bloom/src/bloom"
import ./bloom
import ./common
proc logError*(msg: string) =
@ -11,25 +11,35 @@ proc logInfo*(msg: string) =
proc newRollingBloomFilter*(capacity: int, errorRate: float, window: times.Duration): RollingBloomFilter {.gcsafe.} =
try:
var filter: BloomFilter
var filterResult: Result[BloomFilter, string]
{.gcsafe.}:
filter = initializeBloomFilter(capacity, errorRate)
logInfo("Successfully initialized bloom filter")
RollingBloomFilter(
filter: filter,
window: window,
messages: @[]
)
filterResult = initializeBloomFilter(capacity, errorRate)
if filterResult.isOk:
logInfo("Successfully initialized bloom filter")
return RollingBloomFilter(
filter: filterResult.get(), # Extract the BloomFilter from Result
window: window,
messages: @[]
)
else:
logError("Failed to initialize bloom filter: " & filterResult.error)
# Fall through to default case below
except:
logError("Failed to initialize bloom filter")
var filter: BloomFilter
{.gcsafe.}:
filter = initializeBloomFilter(DefaultBloomFilterCapacity, DefaultBloomFilterErrorRate)
RollingBloomFilter(
filter: filter,
# Default fallback case
let defaultResult = initializeBloomFilter(DefaultBloomFilterCapacity, DefaultBloomFilterErrorRate)
if defaultResult.isOk:
return RollingBloomFilter(
filter: defaultResult.get(),
window: window,
messages: @[]
)
else:
# If even default initialization fails, raise an exception
logError("Failed to initialize bloom filter with default parameters")
proc add*(rbf: var RollingBloomFilter, messageId: MessageID) {.gcsafe.} =
## Adds a message ID to the rolling bloom filter.
@ -54,9 +64,14 @@ proc clean*(rbf: var RollingBloomFilter) {.gcsafe.} =
let now = getTime()
let cutoff = now - rbf.window
var newMessages: seq[TimestampedMessageID] = @[]
var newFilter: BloomFilter
{.gcsafe.}:
newFilter = initializeBloomFilter(rbf.filter.capacity, rbf.filter.errorRate)
# Initialize new filter
let newFilterResult = initializeBloomFilter(rbf.filter.capacity, rbf.filter.errorRate)
if newFilterResult.isErr:
logError("Failed to create new bloom filter: " & newFilterResult.error)
return
var newFilter = newFilterResult.get()
for msg in rbf.messages:
if msg.timestamp > cutoff:

142
tests/test_bloom.nim Normal file
View File

@ -0,0 +1,142 @@
import unittest, results, strutils
import ../src/bloom
from random import rand, randomize
# Covers construction, insert/lookup, the k / bits-per-element lookup and the
# string rendering of the bloom filter.
suite "bloom filter":
# Runs before every test: builds a filter for nElementsToTest items at a 0.001
# target error rate and inserts that many random 8-character strings into it.
setup:
let nElementsToTest = 10000
let bfResult = initializeBloomFilter(capacity = nElementsToTest, errorRate = 0.001)
check bfResult.isOk
var bf = bfResult.get
randomize(2882) # Fixed seed so the generated strings are reproducible
var
sampleChars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
testElements = newSeq[string](nElementsToTest)
for i in 0..<nElementsToTest:
var newString = ""
for j in 0..7:
# rand(51) yields indices 0..51, so only the letters of sampleChars are drawn.
newString.add(sampleChars[rand(51)])
testElements[i] = newString
for item in testElements:
bf.insert(item)
# The filter reports the requested parameters and the derived k / bits per element.
test "initialization parameters":
check bf.capacity == nElementsToTest
check bf.errorRate == 0.001
check bf.kHashes == 10
check bf.mBits div bf.capacity == 15 # bits per element
# A string that was never inserted is rejected; an inserted one is found.
test "basic operations":
check bf.lookup("nonexistent") == false # Never inserted by setup (the filter itself is already populated)
let bf2Result = initializeBloomFilter(100, 0.01)
check bf2Result.isOk
var bf2 = bf2Result.get
bf2.insert("test string")
check bf2.lookup("test string") == true
check bf2.lookup("different string") == false
# Measures the false-positive rate on strings that cannot collide with setup's.
test "error rate":
var falsePositives = 0
let testSize = nElementsToTest div 2
for i in 0..<testSize:
var testString = ""
for j in 0..8: # 9 characters, so never equal to an 8-character setup string
testString.add(sampleChars[rand(51)])
if bf.lookup(testString):
falsePositives.inc()
let actualErrorRate = falsePositives.float / testSize.float
check actualErrorRate < bf.errorRate * 1.5 # Allow some margin
# Every inserted element must be found again (no false negatives).
test "perfect recall":
var lookupErrors = 0
for item in testElements:
if not bf.lookup(item):
lookupErrors.inc()
check lookupErrors == 0
# Checks the lookup of bits per element for a given k, and the forced override.
test "k/m bits specification":
# Test error case for k > 12
let errorCase = getMOverNBitsForK(k = 13, targetError = 0.01)
check errorCase.isErr
check errorCase.error == "K must be <= 12 if forceNBitsPerElem is not also specified."
# Test error case for unachievable error rate
let errorCase2 = getMOverNBitsForK(k = 2, targetError = 0.00001)
check errorCase2.isErr
check errorCase2.error == "Specified value of k and error rate not achievable using less than 4 bytes / element."
# Test success cases
let case1 = getMOverNBitsForK(k = 2, targetError = 0.1)
check case1.isOk
check case1.value == 6
let case2 = getMOverNBitsForK(k = 7, targetError = 0.01)
check case2.isOk
check case2.value == 10
let case3 = getMOverNBitsForK(k = 7, targetError = 0.001)
check case3.isOk
check case3.value == 16
# Explicit k together with forced bits per element: mBits = 10000 * 20.
let bf2Result = initializeBloomFilter(10000, 0.001, k = 4, forceNBitsPerElem = 20)
check bf2Result.isOk
let bf2 = bf2Result.get
check bf2.kHashes == 4
check bf2.mBits == 200000
# The `$` rendering mentions capacity, hash count and error rate.
test "string representation":
let bf3Result = initializeBloomFilter(1000, 0.01, k = 4)
check bf3Result.isOk
let bf3 = bf3Result.get
let str = $bf3
check str.contains("1000") # Capacity
check str.contains("4 hash") # Hash functions
check str.contains("1.0e-02") # Error rate in scientific notation
# Exercises the filter with unusual inputs (very long, punctuation-heavy and
# non-ASCII strings) and then repeats the recall / false-positive checks on
# generated data.
suite "bloom filter special cases":
  test "different patterns of strings":
    const testSize = 10_000
    let patterns = @[
      "shortstr",
      repeat("a", 1000), # Very long string
      "special@#$%^&*()", # Special characters
      "unicode→★∑≈", # Unicode characters
      repeat("pattern", 10) # Repeating pattern
    ]
    let bfResult = initializeBloomFilter(testSize, 0.01)
    check bfResult.isOk
    var bf = bfResult.get
    var inserted = newSeq[string](testSize)
    # Test pattern handling. `check` is used instead of `assert` so the
    # verification still runs when assertions are compiled out and the
    # failure is reported through unittest like every other check here.
    for pattern in patterns:
      bf.insert(pattern)
      check bf.lookup(pattern)
    # Test general insertion and lookup
    for i in 0..<testSize:
      inserted[i] = $i & "test" & $rand(1000)
      bf.insert(inserted[i])
    # Verify all insertions
    var lookupErrors = 0
    for item in inserted:
      if not bf.lookup(item):
        lookupErrors.inc()
    check lookupErrors == 0
    # Check false positive rate on strings that were never inserted
    var falsePositives = 0
    let fpTestSize = testSize div 2
    for i in 0..<fpTestSize:
      let testItem = "notpresent" & $i & $rand(1000)
      if bf.lookup(testItem):
        falsePositives.inc()
    let fpRate = falsePositives.float / fpTestSize.float
    check fpRate < bf.errorRate * 1.5 # Allow some margin but should be close to target

View File

@ -1,4 +1,4 @@
import unittest, results, chronos
import unittest, results, chronos, std/times
import ../src/reliability
import ../src/common
import ../src/protobuf
@ -299,6 +299,14 @@ suite "Periodic Tasks & Buffer Management":
test "periodic buffer sweep":
var messageSentCount = 0
var config = defaultConfig()
config.resendInterval = initDuration(milliseconds = 100) # Very short for testing
config.bufferSweepInterval = initDuration(milliseconds = 50)
let rmResultP = newReliabilityManager("testChannel", config)
check rmResultP.isOk()
let rm = rmResultP.get()
rm.setCallbacks(
proc(messageId: MessageID) {.gcsafe.} = discard,
proc(messageId: MessageID) {.gcsafe.} = messageSentCount += 1,
@ -315,13 +323,16 @@ suite "Periodic Tasks & Buffer Management":
check initialBuffer[0].resendAttempts == 0
rm.startPeriodicTasks()
waitFor sleepAsync(chronos.seconds(6))
# Wait long enough for several sweep intervals
waitFor sleepAsync(chronos.milliseconds(300))
let finalBuffer = rm.getOutgoingBuffer()
check:
finalBuffer.len == 1
finalBuffer[0].resendAttempts > 0
rm.cleanup()
test "periodic sync":
var syncCallCount = 0
rm.setCallbacks(