nim-sds/nim-bloom/src/bloom.nim
2024-11-29 14:07:24 +04:00

245 lines
8.5 KiB
Nim
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from math import ceil, ln, pow, round
import hashes
import strutils
import private/probabilities
# Import MurmurHash3 code and compile at the same time as Nim code
{.compile: "murmur3.c".}
type
BloomFilterError* = object of CatchableError
MurmurHashes = array[0..1, int]
BloomFilter* = object
capacity*: int
errorRate*: float
kHashes*: int
mBits*: int
intArray: seq[int]
nBitsPerElem*: int
useMurmurHash*: bool
proc rawMurmurHash(key: cstring, len: int, seed: uint32,
outHashes: var MurmurHashes): void {.
importc: "MurmurHash3_x64_128".}
proc murmurHash(key: string, seed = 0'u32): MurmurHashes =
rawMurmurHash(key, key.len, seed, outHashes = result)
proc hashA(item: string, maxValue: int): int =
hash(item) mod maxValue
proc hashB(item: string, maxValue: int): int =
hash(item & " b") mod maxValue
proc hashN(item: string, n: int, maxValue: int): int =
## Get the nth hash of a string using the formula hashA + n * hashB
## which uses 2 hash functions vs. k and has comparable properties
## See Kirsch and Mitzenmacher, 2008:
## http://www.eecs.harvard.edu/~kirsch/pubs/bbbf/rsa.pdf
abs((hashA(item, maxValue) + n * hashB(item, maxValue))) mod maxValue
proc getMOverNBitsForK(k: int, targetError: float,
probabilityTable = kErrors): int =
## Returns the optimal number of m/n bits for a given k.
if k notin 0..12:
raise newException(BloomFilterError,
"K must be <= 12 if forceNBitsPerElem is not also specified.")
for mOverN in 2..probabilityTable[k].high:
if probabilityTable[k][mOverN] < targetError:
return mOverN
raise newException(BloomFilterError,
"Specified value of k and error rate for which is not achievable using less than 4 bytes / element.")
proc initializeBloomFilter*(capacity: int, errorRate: float, k = 0,
forceNBitsPerElem = 0,
useMurmurHash = true): BloomFilter =
## Initializes a Bloom filter, using a specified ``capacity``,
## ``errorRate``, and optionally specific number of k hash functions.
## If ``kHashes`` is < 1 (default argument is 0), ``kHashes`` will be
## optimally calculated on the fly. Otherwise, ``kHashes`` will be set to
## the passed integer, which requires that ``forceNBitsPerElem`` is
## also set to be greater than 0. Otherwise a ``BloomFilterError``
## exception is raised.
## See http://pages.cs.wisc.edu/~cao/papers/summary-cache/node8.html for
## useful tables on k and m/n (n bits per element) combinations.
##
## The Bloom filter uses the MurmurHash3 implementation by default,
## though it can fall back to using the built-in nim ``hash`` function
## if ``useMurmurHash = false``. This is compiled alongside the Nim
## code using the ``{.compile.}`` pragma.
var
kHashes: int
bitsPerElem: float
nBitsPerElem: int
if k < 1: # Calculate optimal k and use that
bitsPerElem = ceil(-1.0 * (ln(errorRate) / (pow(ln(2.float), 2))))
kHashes = round(ln(2.float) * bitsPerElem).int
nBitsPerElem = round(bitsPerElem).int
else: # Use specified k if possible
if forceNBitsPerElem < 1: # Use lookup table
nBitsPerElem = getMOverNBitsForK(k = k, targetError = errorRate)
else:
nBitsPerElem = forceNBitsPerElem
kHashes = k
let
mBits = capacity * nBitsPerElem
mInts = 1 + mBits div (sizeof(int) * 8)
BloomFilter(capacity: capacity, errorRate: errorRate, kHashes: kHashes,
mBits: mBits, intArray: newSeq[int](mInts), nBitsPerElem: nBitsPerElem,
useMurmurHash: useMurmurHash)
proc `$`*(bf: BloomFilter): string =
## Prints the capacity, set error rate, number of k hash functions,
## and total bits of memory allocated by the Bloom filter.
"Bloom filter with $1 capacity, $2 error rate, $3 hash functions, and requiring $4 bits per stored element." %
[$bf.capacity,
formatFloat(bf.errorRate, format = ffScientific, precision = 1),
$bf.kHashes, $bf.nBitsPerElem]
{.push overflowChecks: off.}
proc hashMurmur(bf: BloomFilter, key: string): seq[int] =
result.newSeq(bf.kHashes)
let murmurHashes = murmurHash(key, seed = 0'u32)
for i in 0..<bf.kHashes:
result[i] = abs(murmurHashes[0] + i * murmurHashes[1]) mod bf.mBits
{.pop.}
proc hashNim(bf: BloomFilter, key: string): seq[int] =
result.newSeq(bf.kHashes)
for i in 0..<bf.kHashes:
result[i] = hashN(key, i, bf.mBits)
proc hash(bf: BloomFilter, key: string): seq[int] =
if bf.useMurmurHash:
bf.hashMurmur(key)
else:
bf.hashNim(key)
proc insert*(bf: var BloomFilter, item: string) =
## Insert an item (string) into the Bloom filter.
var hashSet = bf.hash(item)
for h in hashSet:
let
intAddress = h div (sizeof(int) * 8)
bitOffset = h mod (sizeof(int) * 8)
bf.intArray[intAddress] = bf.intArray[intAddress] or (1 shl bitOffset)
proc lookup*(bf: BloomFilter, item: string): bool =
## Lookup an item (string) into the Bloom filter.
## If the item is present, ``lookup`` is guaranteed to return ``true``.
## If the item is not present, ``lookup`` will return ``false``
## with a probability 1 - ``bf.errorRate``.
var hashSet = bf.hash(item)
for h in hashSet:
let
intAddress = h div (sizeof(int) * 8)
bitOffset = h mod (sizeof(int) * 8)
currentInt = bf.intArray[intAddress]
if currentInt != (currentInt or (1 shl bitOffset)):
return false
return true
when isMainModule:
from random import rand, randomize
import times
# Test murmurhash 3
echo("Testing MurmurHash3 code...")
var hashOutputs: MurmurHashes
hashOutputs = [0, 0]
rawMurmurHash("hello", 5, 0, hashOutputs)
assert int(hashOutputs[0]) == -3758069500696749310 # Correct murmur outputs (cast to int64)
assert int(hashOutputs[1]) == 6565844092913065241
let hashOutputs2 = murmurHash("hello", 0)
assert hashOutputs2[0] == hashOutputs[0]
assert hashOutputs2[1] == hashOutputs[1]
let hashOutputs3 = murmurHash("hello", 10)
assert hashOutputs3[0] != hashOutputs[0]
assert hashOutputs3[1] != hashOutputs[1]
# Some quick and dirty tests (not complete)
var nElementsToTest = 100000
var bf = initializeBloomFilter(nElementsToTest, 0.001)
assert(bf of BloomFilter)
echo(bf)
var bf2 = initializeBloomFilter(10000, 0.001, k = 4,
forceNBitsPerElem = 20)
assert(bf2 of BloomFilter)
echo(bf2)
echo("Testing insertions and lookups...")
echo("Test element in BF2?: ", bf2.lookup("testing"))
echo("Inserting element.")
bf2.insert("testing")
echo("Test element in BF2?: ", bf2.lookup("testing"))
assert(bf2.lookup("testing"))
# Now test for speed with bf
randomize(2882) # Seed the RNG
var
sampleChars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
kTestElements, sampleLetters: seq[string]
kTestElements = newSeq[string](nElementsToTest)
sampleLetters = newSeq[string](62)
for i in 0..(nElementsToTest - 1):
var newString = ""
for j in 0..7:
newString.add(sampleChars[rand(51)])
kTestElements[i] = newString
var startTime, endTime: float
startTime = cpuTime()
for i in 0..(nElementsToTest - 1):
bf.insert(kTestElements[i])
endTime = cpuTime()
echo("Took ", formatFloat(endTime - startTime, format = ffDecimal,
precision = 4), " seconds to insert ", nElementsToTest, " items.")
var falsePositives = 0
for i in 0..(nElementsToTest - 1):
var falsePositiveString = ""
for j in 0..8: # By definition not in bf as 9 chars not 8
falsePositiveString.add(sampleChars[rand(51)])
if bf.lookup(falsePositiveString):
falsePositives += 1
echo("N false positives (of ", nElementsToTest, " lookups): ", falsePositives)
echo("False positive rate ", formatFloat(falsePositives / nElementsToTest,
format = ffDecimal, precision = 4))
var lookupErrors = 0
startTime = cpuTime()
for i in 0..(nElementsToTest - 1):
if not bf.lookup(kTestElements[i]):
lookupErrors += 1
endTime = cpuTime()
echo("Took ", formatFloat(endTime - startTime, format = ffDecimal,
precision = 4), " seconds to lookup ", nElementsToTest, " items.")
echo("N lookup errors (should be 0): ", lookupErrors)
# Finally test correct k / mOverN specification,
# first case raises an error, second works
try:
discard getMOverNBitsForK(k = 2, targetError = 0.00001)
assert false
except BloomFilterError:
assert true
assert getMOverNBitsForK(k = 2, targetError = 0.1) == 6
assert getMOverNBitsForK(k = 7, targetError = 0.01) == 10
assert getMOverNBitsForK(k = 7, targetError = 0.001) == 16
var bf3 = initializeBloomFilter(1000, 0.01, k = 4)
assert bf3.nBitsPerElem == 11