nwaku/vendor/nim-unicodedb/gen/min_perfect_hash.nim

79 lines
2.1 KiB
Nim

import math
import algorithm
proc fnv32a*(key: openarray[int], seed: uint32): uint32 =
## Calculates a distinct hash function for a given sequence
## FNV algorithm from http://isthe.com/chongo/tech/comp/fnv/
# XXX this should be 2166136261'u32
result = 18652614'u32 # -> 2166136261 mod int32.high
if seed > 0'u32:
result = seed
for s in key:
result = result xor uint32(s)
result = result * 16777619'u32 # unsigned will wrap around
type
MphValueType = int or seq[int]
proc mphLookup*[T: MphValueType](
hashes: openarray[int],
values: openarray[T],
key: openarray[int]
): T =
assert hashes.len <= int32.high
assert values.len <= int32.high
let d = hashes[int(fnv32a(key, 0'u32) mod hashes.len.uint32)]
result = values[int(fnv32a(key, d.uint32) mod values.len.uint32)]
type
Record*[T: MphValueType] = tuple
key: seq[int]
value: T
proc mph*[T: MphValueType](
data: openarray[Record[T]]
): tuple[h: seq[int], v: seq[T]] =
let dataSize = len(data)
result = (
h: newSeq[int](dataSize),
v: newSeq[T](dataSize))
var filled = newSeq[bool](dataSize)
var buckets = newSeq[seq[Record[T]]](dataSize)
for i in 0 ..< dataSize:
buckets[i] = newSeqOfCap[Record[T]](1)
for record in data:
buckets[int(fnv32a(record.key, 0'u32) mod dataSize.uint32)].add(record)
buckets.sort(
proc (x, y: seq[Record[T]]): int =
result = cmp(len(x), len(y)),
SortOrder.Descending)
for bucket in buckets:
if len(bucket) == 0:
break
# XXX: d should be 0
var d = 1
var item = 0
var slots = newSeqOfCap[int](len(bucket))
# Try values of d until we find a hash function
# that places all items in the bucket's free slots
while item < len(bucket):
let slot = int(fnv32a(bucket[item].key, d.uint32) mod dataSize.uint32)
if filled[slot] or slot in slots:
inc d
item = 0
slots.setLen(0)
else:
inc item
slots.add(slot)
result.h[int(fnv32a(bucket[0].key, 0'u32) mod dataSize.uint32)] = d
for i in 0 ..< len(bucket):
result.v[slots[i]] = bucket[i].value
filled[slots[i]] = true