2025-02-11 13:24:43 -08:00
|
|
|
import { hashN } from "./nim_hashn/nim_hashn.mjs";
|
2025-01-27 17:29:27 -08:00
|
|
|
import { getMOverNBitsForK } from "./probabilities.js";
|
|
|
|
|
|
|
|
|
|
/**
 * Sizing parameters for a {@link BloomFilter}.
 */
export interface BloomFilterOptions {
  /**
   * The expected maximum number of elements for which this BloomFilter is
   * sized.
   */
  capacity: number;

  /**
   * The desired false-positive rate (between 0 and 1).
   */
  errorRate: number;

  /**
   * (Optional) The exact number of hash functions, if the user wants to
   * override the automatic calculation. When omitted or less than 1, the
   * filter derives the number of hashes from `errorRate`.
   */
  kHashes?: number;

  /**
   * (Optional) Force a specific number of bits per element instead of using
   * a table or optimal formula. Only consulted when `kHashes` is at least 1;
   * values less than 1 fall back to the lookup table.
   */
  forceNBitsPerElem?: number;
}
|
|
|
|
|
|
2025-01-30 15:33:17 -08:00
|
|
|
// Size in bytes of one storage word; each element of `data` holds
// `sizeOfInt * 8` = 64 bits.
const sizeOfInt = 8;

/**
 * A probabilistic data structure that tracks memberships in a set.
 * Supports time and space efficient lookups, but may return false-positives.
 * Can never return false-negatives.
 * A bloom filter can tell us if an element is:
 * - Definitely not in the set
 * - Potentially in the set (with a probability depending on the false-positive rate)
 */
export class BloomFilter {
  /** Number of bits the filter is sized for; passed to `hashN` as `maxValue`. */
  public totalBits: number;
  /** Bit storage, 64 bits per array element. */
  public data: Array<bigint> = [];
  /** Number of hash values computed per item. */
  public kHashes: number;
  /** The `errorRate` the filter was constructed with. */
  public errorRate: number;
  /** The options object the filter was constructed with. */
  public options: BloomFilterOptions;

  private hashN: (item: string, n: number, maxValue: number) => number;

  /**
   * @param options - Sizing parameters (capacity, error rate, optional overrides).
   * @param hashN - Hash function invoked as `hashN(item, n, maxValue)` for
   *   `n = 0 .. kHashes - 1` with `maxValue = totalBits`. Presumably it must
   *   return an integer bit index in `[0, maxValue)` — confirm against the
   *   hash implementation in use.
   */
  public constructor(
    options: BloomFilterOptions,
    hashN: (item: string, n: number, maxValue: number) => number
  ) {
    this.options = options;

    let nBitsPerElem: number;
    let k = options.kHashes ?? 0;
    const forceNBitsPerElem = options.forceNBitsPerElem ?? 0;

    if (k < 1) {
      // Calculate optimal k based on target error rate.
      const bitsPerElem = Math.ceil(
        -1.0 * (Math.log(options.errorRate) / Math.pow(Math.log(2), 2))
      );
      k = Math.round(Math.log(2) * bitsPerElem);
      nBitsPerElem = Math.round(bitsPerElem);
    } else if (forceNBitsPerElem < 1) {
      // Caller fixed k but not the bits per element: use the lookup table.
      nBitsPerElem = getMOverNBitsForK(k, options.errorRate);
    } else {
      // Caller fixed both k and the bits per element.
      nBitsPerElem = forceNBitsPerElem;
    }

    const mBits = options.capacity * nBitsPerElem;
    // Always allocates one word beyond floor(mBits / 64). Kept as-is because
    // the word count determines the serialized length produced by toBytes().
    const mInts = 1 + Math.floor(mBits / (sizeOfInt * 8));

    this.totalBits = mBits;
    this.data = new Array<bigint>(mInts).fill(BigInt(0));
    this.kHashes = k;
    this.hashN = hashN;
    this.errorRate = options.errorRate;
  }

  /**
   * Computes the `kHashes` bit indices for an item.
   *
   * @param item - The value to hash.
   * @returns One hash value per hash function, in order `n = 0 .. kHashes - 1`.
   */
  public computeHashes(item: string): number[] {
    const hashes = new Array<number>(this.kHashes);
    for (let i = 0; i < this.kHashes; i++) {
      hashes[i] = this.hashN(item, i, this.totalBits);
    }
    return hashes;
  }

  /**
   * Adds an item to the bloom filter by computing its hash values
   * and setting corresponding bits in "data".
   *
   * @param item - The value to add.
   */
  public insert(item: string): void {
    for (const h of this.computeHashes(item)) {
      const { wordIndex, mask } = BloomFilter.locateBit(h);
      this.data[wordIndex] = this.data[wordIndex] | mask;
    }
  }

  /**
   * Checks if the item is potentially in the bloom filter.
   * The method is guaranteed to return "true" for items that were inserted,
   * but might also return "true" for items that were never inserted
   * (purpose of false-positive probability).
   *
   * @param item - The value to test.
   * @returns `false` if any of the item's bits is unset, otherwise `true`.
   */
  public lookup(item: string): boolean {
    const zero = BigInt(0);
    for (const h of this.computeHashes(item)) {
      const { wordIndex, mask } = BloomFilter.locateBit(h);
      if ((this.data[wordIndex] & mask) === zero) {
        return false;
      }
    }
    return true;
  }

  /**
   * Serializes the bit storage as consecutive big-endian unsigned 64-bit words.
   *
   * @returns A byte array of length `data.length * 8`.
   */
  public toBytes(): Uint8Array {
    const buffer = new ArrayBuffer(this.data.length * sizeOfInt);
    const view = new DataView(buffer);
    this.data.forEach((word, i) => {
      // Unsigned write so the pairing with getBigUint64 in fromBytes is exact,
      // including words that have bit 63 set.
      view.setBigUint64(i * sizeOfInt, word, false);
    });
    return new Uint8Array(buffer);
  }

  /**
   * Rebuilds a filter from bytes produced by {@link BloomFilter.toBytes}.
   *
   * The filter is sized from `options` exactly as the constructor would size
   * it; the words are then read from `bytes`. Bytes beyond the expected length
   * are ignored.
   *
   * @param bytes - Serialized words (big-endian, 8 bytes each). May be a view
   *   into a larger buffer; its `byteOffset` and `byteLength` are honored.
   * @param options - The options the serialized filter was created with.
   * @param hashN - The hash function the serialized filter was created with.
   * @returns A filter whose `data` holds the decoded words.
   * @throws RangeError if `bytes` is shorter than the filter's storage.
   */
  public static fromBytes(
    bytes: Uint8Array,
    options: BloomFilterOptions,
    hashN: (item: string, n: number, maxValue: number) => number
  ): BloomFilter {
    const bloomFilter = new BloomFilter(options, hashN);
    const wordCount = bloomFilter.data.length;
    const expectedBytes = wordCount * sizeOfInt;
    if (bytes.byteLength < expectedBytes) {
      throw new RangeError(
        `BloomFilter.fromBytes: expected at least ${expectedBytes} bytes, got ${bytes.byteLength}`
      );
    }

    // Restrict the view to the bytes this Uint8Array actually covers; using
    // bytes.buffer alone would read from offset 0 of the backing buffer.
    const view = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength);
    for (let i = 0; i < wordCount; i++) {
      bloomFilter.data[i] = view.getBigUint64(i * sizeOfInt, false);
    }
    return bloomFilter;
  }

  // Maps a bit index to the storage word holding it and a single-bit mask.
  private static locateBit(h: number): { wordIndex: number; mask: bigint } {
    const bitsPerWord = sizeOfInt * 8;
    return {
      wordIndex: Math.floor(h / bitsPerWord),
      mask: BigInt(1) << BigInt(h % bitsPerWord),
    };
  }
}
|
|
|
|
|
|
|
|
|
|
/**
 * A {@link BloomFilter} bound to the module's imported `hashN` hash function,
 * so callers only need to supply sizing options.
 */
export class DefaultBloomFilter extends BloomFilter {
  /**
   * @param options - Sizing parameters forwarded to {@link BloomFilter}.
   */
  public constructor(options: BloomFilterOptions) {
    super(options, hashN);
  }

  /**
   * Rebuilds a filter from bytes produced by `toBytes()`.
   *
   * @param bytes - Serialized filter words.
   * @param options - The options the serialized filter was created with.
   * @returns A `DefaultBloomFilter` instance holding the decoded words.
   */
  public static fromBytes(
    bytes: Uint8Array,
    options: BloomFilterOptions
  ): DefaultBloomFilter {
    // The base factory constructs a plain BloomFilter. Move its decoded words
    // into a real DefaultBloomFilter so the result matches the declared return
    // type at runtime (e.g. `instanceof DefaultBloomFilter`).
    const decoded = BloomFilter.fromBytes(bytes, options, hashN);
    const filter = new DefaultBloomFilter(options);
    filter.data = decoded.data;
    return filter;
  }
}
|